In [128]:
# Small in-memory corpus of four documents; each entry is one document whose
# words are separated by single spaces.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

In [129]:
from collections import Counter
import csv
import math
import ast
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix

In [130]:
def uniqueWords(corpus):
    """Return the set of distinct space-separated tokens in ``corpus``.

    Every element of ``corpus`` is converted with ``str``, the pieces are
    joined with a single space and the result is split on ``' '``.

    Note that splitting on a literal space means consecutive spaces produce
    an empty-string token, and an empty corpus yields ``{''}``.
    """
    joined = ' '.join(str(document) for document in corpus)
    return set(joined.split(' '))

In [131]:
def numberOfWords(para, corpus):
    """Count how often each word of the corpus vocabulary occurs in ``para``.

    Args:
        para: Document text; tokenised by splitting on single spaces.
        corpus: Collection of documents whose vocabulary (as returned by
            ``uniqueWords``) defines the keys of the result.

    Returns:
        dict mapping every vocabulary word to its number of occurrences in
        ``para`` (0 when absent). Keys are inserted in ascending sorted order.

    Raises:
        KeyError: if ``para`` contains a token that is not in the vocabulary.
    """
    counts = {word: 0 for word in uniqueWords(corpus)}
    for token in para.split(' '):
        counts[token] += 1
    # Keys are unique, so ordering by key alone matches ordering by (key, count).
    return {word: counts[word] for word in sorted(counts)}

In [132]:
def writeCSV(fileName):
    """Create ``fileName`` as an empty file, discarding any previous content.

    Despite the name, nothing is written: the file is only opened in ``'w'``
    mode (which truncates it) and closed again, so that later appends start
    from a clean file.
    """
    with open(fileName, 'w'):
        pass


def updateCSV(fileName, data_list):
    """Append ``data_list`` as a single row to the CSV file ``fileName``.

    Args:
        fileName: Path of the CSV file; created if it does not exist.
        data_list: Iterable of cell values for the new row. Non-string values
            are written using their ``str`` representation by ``csv.writer``.
    """
    # newline='' is required by the csv module: the writer emits its own
    # '\r\n' terminator, and without it platforms that translate '\n' (Windows)
    # end up with '\r\r\n', i.e. a spurious blank row after every record.
    with open(fileName, 'a', newline='') as fd:
        csv.writer(fd).writerow(data_list)

In [151]:
IDFfile = 'idf_csv.csv'


def computeIDF(corpus, isTopNIDF, topN=50):
    """Compute smoothed IDF scores for ``corpus`` and store them in ``IDFfile``.

    The score of a word is ``1 + ln((1 + N) / (1 + df))`` where ``N`` is the
    number of documents and ``df`` is the number of documents containing the
    word. ``IDFfile`` is truncated first, then the resulting ``{word: idf}``
    dict is appended as one CSV row holding the dict in a single cell.

    Args:
        corpus: Sequence of document strings; each is tokenised by splitting
            on single spaces.
        isTopNIDF: When truthy, keep only the ``topN`` highest-scoring words
            (ties are broken by descending word order). Otherwise keep all.
        topN: Number of words kept when ``isTopNIDF`` is truthy. Defaults to
            50, the value that was previously hard-coded.

    Returns:
        None. The result is only written to ``IDFfile``.
    """
    writeCSV(IDFfile)  # start from an empty file
    N = len(corpus)

    # Document frequency: a word counts at most once per document, hence the
    # per-document set.
    docFreq = Counter()
    for paragraph in corpus:
        docFreq.update(set(paragraph.split(' ')))

    idfDict = {
        word: 1 + math.log((1 + N) / (1 + float(df)))
        for word, df in docFreq.items()
    }

    if isTopNIDF:
        # Highest score first; equal scores fall back to descending word order.
        ranked = sorted(idfDict.items(),
                        key=lambda kv: (kv[1], kv[0]),
                        reverse=True)
        topIDFC = dict(ranked[:topN])
    else:
        topIDFC = idfDict

    updateCSV(IDFfile, [topIDFC])

In [152]:
TFfile = 'tf_csv.csv'


def computeTF(corpus, unique_set_words):
    """Write one term-frequency row per document of ``corpus`` to ``TFfile``.

    ``TFfile`` is truncated first. For every document a dict
    ``{word: count_in_document / number_of_tokens_in_document}`` is appended
    as a CSV row holding the dict in a single cell. The dict contains every
    corpus word that is also in ``unique_set_words`` (frequency 0.0 when the
    word does not occur in the document), with keys in ascending sorted order.

    Args:
        corpus: Sequence of document strings, tokenised by splitting on ' '.
        unique_set_words: Container of words to keep; only membership tests
            (``in``) are performed on it.

    Returns:
        None. The result is only written to ``TFfile``.
    """
    writeCSV(TFfile)  # start from an empty file

    # The vocabulary is identical for every document, so build and sort it
    # once instead of once per document.
    vocabulary = [
        word for word in sorted(uniqueWords(corpus))
        if word in unique_set_words
    ]

    for para in corpus:
        tokens = para.split(' ')
        counts = Counter(tokens)  # missing words read as 0 without being added
        para_len = float(len(tokens))
        # A fresh dict per document, so no row can inherit values from another.
        tfDict = {word: counts[word] / para_len for word in vocabulary}
        updateCSV(TFfile, [tfDict])

In [153]:
TFIDFfile = 'tfidf_csv.csv'


def computeTFIDF(tfFile, idFile):
    """Combine stored TF rows with stored IDF scores and write ``TFIDFfile``.

    ``TFIDFfile`` is truncated first. The IDF dict is parsed from the first
    cell of the last non-empty row of ``idFile`` and printed. Then, for every
    non-empty row of ``tfFile``, a dict ``{word: tf * idf[word]}`` restricted
    to words present in the IDF dict is appended to ``TFIDFfile`` as one CSV
    row holding the dict in a single cell.

    Args:
        tfFile: Path of the CSV file holding one TF dict per row.
        idFile: Path of the CSV file holding the IDF dict.

    Returns:
        None. Results are printed and written to ``TFIDFfile``.
    """
    writeCSV(TFIDFfile)  # start from an empty file
    idf = {}

    with open(idFile, 'r', newline='') as file:
        for row in csv.reader(file):
            if row:  # skip blank rows
                idf = ast.literal_eval(row[0])
    print('unique words with idf score \n ')

    for key, value in idf.items():
        print(key, '\t\t', value)
    print('*' * 100)

    # Read the file named by the tfFile argument; the global TFfile was used
    # here before, which silently ignored the parameter.
    with open(tfFile, 'r', newline='') as file:
        for row in csv.reader(file):
            if row:  # skip blank rows
                tf_val = ast.literal_eval(row[0])
                # Build each row from scratch so that words missing from this
                # document's TF dict cannot leak in from a previous row.
                tfidf = {
                    word: tf * idf[word]
                    for word, tf in tf_val.items()
                    if word in idf
                }
                updateCSV(TFIDFfile, [tfidf])

In [154]:
def transform(data_corpus, isTopNIDFC, uniqueSetWords):
    """Run the IDF, TF and TF-IDF stages, in that order, for ``data_corpus``.

    Args:
        data_corpus: Documents handed to ``computeIDF`` and ``computeTF``.
        isTopNIDFC: Forwarded to ``computeIDF`` as its ``isTopNIDF`` flag.
        uniqueSetWords: Forwarded to ``computeTF`` as the set of words to keep.

    Returns:
        None. Each stage writes its own CSV file; the last stage reads the
        files named by the module-level ``TFfile`` and ``IDFfile``.
    """
    computeIDF(corpus=data_corpus, isTopNIDF=isTopNIDFC)
    computeTF(corpus=data_corpus, unique_set_words=uniqueSetWords)
    computeTFIDF(tfFile=TFfile, idFile=IDFfile)

In [155]:
def createMatrix(tfidf_file):
    """Load the dict rows of ``tfidf_file`` into a NumPy array.

    Each non-empty CSV row is expected to hold the string representation of a
    dict in its first cell; the dict's values, in the dict's own order,
    become one row of the result.

    Args:
        tfidf_file: Path of the CSV file to read.

    Returns:
        ``numpy.ndarray`` built from the list of per-row value lists.
    """
    with open(tfidf_file, 'r') as handle:
        rows = [
            list(ast.literal_eval(record[0]).values())
            for record in csv.reader(handle)
            if len(record) != 0  # skip blank rows
        ]
    return np.array(rows)

In [156]:
def createCSRMatrix(tfidf_file):
    """Row-normalise the TF-IDF matrix, print its CSR form, return it dense.

    The matrix loaded by ``createMatrix`` is L2-normalised along axis 1,
    converted to a ``csr_matrix`` and printed, followed by a separator line.

    Args:
        tfidf_file: Path of the CSV file handed to ``createMatrix``.

    Returns:
        The result of ``todense()`` on the CSR matrix, i.e. a dense matrix,
        not the sparse one, despite the function's name.
    """
    rowNormalized = normalize(createMatrix(tfidf_file), axis=1, norm='l2')
    sparseMatrix = csr_matrix(rowNormalized)

    print('csr matrix')
    print(sparseMatrix)
    print('*'*100)

    return sparseMatrix.todense()

In [157]:
def fit(tfidf_file):
    """Print the dense matrix produced by ``createCSRMatrix`` for ``tfidf_file``.

    ``createCSRMatrix`` prints the sparse form itself; this function then
    prints a ``'dense matrix'`` heading followed by the dense matrix.
    Nothing is returned.
    """
    denseMatrix = createCSRMatrix(tfidf_file)
    # One call with a newline separator emits the same text as two prints.
    print('dense matrix', denseMatrix, sep='\n')

In [158]:
# Run the pipeline on the small in-memory corpus, keeping every word
# (isTopNIDFC=False), then print the resulting matrices.
transform(
    data_corpus=corpus,
    isTopNIDFC=False,
    uniqueSetWords=uniqueWords(corpus),
)
fit(tfidf_file=TFIDFfile)


unique words with idf score 
 
is 		 1.0
this 		 1.0
document 		 1.2231435513142097
the 		 1.0
first 		 1.5108256237659907
second 		 1.916290731874155
third 		 1.916290731874155
and 		 1.916290731874155
one 		 1.916290731874155
****************************************************************************************************
csr matrix
  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149
********************************************************************************

In [159]:
# NOTE(review): this import sits in the middle of the notebook; the other
# imports are grouped in a single cell near the top.
import pickle
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# 'cleaned_strings' must come from a trusted source.
with open('cleaned_strings', 'rb') as f:
    file_corpus = pickle.load(f)
    

# len() is reported as the document count, so file_corpus is presumably a
# sequence with one entry per document — confirm against how the file was made.
print("Number of documents in corpus = ",len(file_corpus))

Number of documents in corpus =  746


In [160]:
# Run the pipeline on the corpus loaded from disk, this time restricting the
# IDF table to its top-scoring words (isTopNIDFC=True), then print the matrices.
transform(
    data_corpus=file_corpus,
    isTopNIDFC=True,
    uniqueSetWords=uniqueWords(file_corpus),
)
fit(tfidf_file=TFIDFfile)

unique words with idf score 
 
zombiez 		 6.922918004572872
zillion 		 6.922918004572872
z 		 6.922918004572872
yun 		 6.922918004572872
youtube 		 6.922918004572872
youthful 		 6.922918004572872
younger 		 6.922918004572872
yelps 		 6.922918004572872
yawn 		 6.922918004572872
yardley 		 6.922918004572872
x 		 6.922918004572872
wrote 		 6.922918004572872
writers 		 6.922918004572872
wrap 		 6.922918004572872
wow 		 6.922918004572872
woven 		 6.922918004572872
wouldnt 		 6.922918004572872
worthy 		 6.922918004572872
worthwhile 		 6.922918004572872
worthless 		 6.922918004572872
worry 		 6.922918004572872
worked 		 6.922918004572872
woo 		 6.922918004572872
wont 		 6.922918004572872
wong 		 6.922918004572872
wondered 		 6.922918004572872
woa 		 6.922918004572872
witticisms 		 6.922918004572872
within 		 6.922918004572872
wise 		 6.922918004572872
win 		 6.922918004572872
wily 		 6.922918004572872
willie 		 6.922918004572872
william 		 6.922918004572872
wild 		 6.922918004572872
wih 		 6.