In [1]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

# Task - 1:

## Step - 1: Fit Method

In [2]:

def fit(dataset):

    unique_words = set()

    for row in corpus:
        strings = row.split()
    
        for word in strings:
            unique_words.add(word)

    sorted_list_words = sorted(list(unique_words))

    vocabulary = {value:index for index, value in enumerate(sorted_list_words)}
    
    return vocabulary, sorted_list_words


In [3]:
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [4]:
vocabulary, sorted_list_words = fit(corpus)
print(vocabulary)

{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}


## Step - 2: Calculating IDF values 

In [5]:

N = len(corpus)
idf_values = []

for key, value in vocabulary.items():
    count = 0
    for row in corpus:
        if key in row:
            count = count + 1
    idf = 1 + math.log((1+N)/(1+count))
    idf_values.append(idf)


print(idf_values)              

[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [6]:
word_idf_values = {key:value for key, value in zip(sorted_list_words,idf_values)}

In [7]:
word_idf_values

{'and': 1.916290731874155,
 'document': 1.2231435513142097,
 'first': 1.5108256237659907,
 'is': 1.0,
 'one': 1.916290731874155,
 'second': 1.916290731874155,
 'the': 1.0,
 'third': 1.916290731874155,
 'this': 1.0}

## Step - 3: Transform Method

In [8]:

def transform(dataset, vocabulary):

    columns = []
    rows = []
    values = []

    for index, row in enumerate(corpus):
    
        list_of_words_in_row = row.split()
        number_of_words_in_row = len(list_of_words_in_row)
        
        word_freq = dict(Counter(row.split()))
        
        for word, idf_value in word_idf_values.items():
            for word1, dimension in vocabulary.items():
                for word2, frequency in word_freq.items():
                    if word1 == word2 and word1 == word:
                        tf_idf = (frequency/number_of_words_in_row)*idf_value
                        columns.append(dimension) 
                        rows.append(index)
                        values.append(tf_idf) 
                    else:
                        continue
                    
  
    sparse_matrix = csr_matrix((values, (rows,columns)), shape=(len(corpus),len(vocabulary)))

    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    
    return output
    


In [9]:
final_output = transform(corpus, vocabulary)

In [10]:
final_output[0]

<1x9 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [11]:
print(final_output[0])

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [12]:
print(final_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


###############################################################################

# Task - 2:

In [13]:

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ", len(corpus))

Number of documents in corpus =  746


## Step - 1: Fit Method

In [14]:
import operator

def fit(dataset):
    unique_words = set()

    for row in corpus:
        strings = row.split()
    
        for word in strings:
            unique_words.add(word)

    sorted_list_words = sorted(list(unique_words))

    vocabulary = {value:index for index, value in enumerate(sorted_list_words)}
    
    N = len(corpus)
    
    idf_values = []

    for key, value in vocabulary.items():
        count = 0
        for row in corpus:
            if key in row:
                count = count + 1
        idf = 1 + math.log((1+N)/(1+count))
        idf_values.append(idf)
        
    word_mapped_idf_values = {key:value for key, value in zip(sorted_list_words,idf_values)}
        
    sorted_d = dict(sorted(word_mapped_idf_values.items(), key=operator.itemgetter(1),reverse=True))

    top_50_word_idf_values = dict(list(sorted_d.items())[0:50])

    new_vocabulary = []

    for word1 in list(top_50_word_idf_values.keys()):
        for word2 in list(sorted_d.keys()):
            if word1 == word2:
                new_vocabulary.append(word2)
            
    new_vocab_dict = {word:index for index, word in enumerate(new_vocabulary)}
    
    return new_vocab_dict, new_vocabulary, top_50_word_idf_values

In [15]:
new_vocab, new_sorted_list_words, top_50_word_idf_values = fit(corpus)

In [16]:
new_vocab

{'aailiyah': 0,
 'abandoned': 1,
 'abroad': 2,
 'abstruse': 3,
 'academy': 4,
 'accents': 5,
 'accessible': 6,
 'acclaimed': 7,
 'accolades': 8,
 'accurately': 9,
 'achille': 10,
 'ackerman': 11,
 'adams': 12,
 'added': 13,
 'admins': 14,
 'admiration': 15,
 'admitted': 16,
 'adrift': 17,
 'adventure': 18,
 'aesthetically': 19,
 'affected': 20,
 'affleck': 21,
 'afternoon': 22,
 'agreed': 23,
 'aimless': 24,
 'aired': 25,
 'akasha': 26,
 'alert': 27,
 'alike': 28,
 'allison': 29,
 'allowing': 30,
 'alongside': 31,
 'amateurish': 32,
 'amazed': 33,
 'amazingly': 34,
 'amusing': 35,
 'amust': 36,
 'anatomist': 37,
 'angela': 38,
 'angelina': 39,
 'angry': 40,
 'anguish': 41,
 'angus': 42,
 'animals': 43,
 'animated': 44,
 'anita': 45,
 'anniversary': 46,
 'anthony': 47,
 'antithesis': 48,
 'anyway': 49}

In [17]:
top_50_word_idf_values

{'aailiyah': 6.922918004572872,
 'abandoned': 6.922918004572872,
 'abroad': 6.922918004572872,
 'abstruse': 6.922918004572872,
 'academy': 6.922918004572872,
 'accents': 6.922918004572872,
 'accessible': 6.922918004572872,
 'acclaimed': 6.922918004572872,
 'accolades': 6.922918004572872,
 'accurately': 6.922918004572872,
 'achille': 6.922918004572872,
 'ackerman': 6.922918004572872,
 'adams': 6.922918004572872,
 'added': 6.922918004572872,
 'admins': 6.922918004572872,
 'admiration': 6.922918004572872,
 'admitted': 6.922918004572872,
 'adrift': 6.922918004572872,
 'adventure': 6.922918004572872,
 'aesthetically': 6.922918004572872,
 'affected': 6.922918004572872,
 'affleck': 6.922918004572872,
 'afternoon': 6.922918004572872,
 'agreed': 6.922918004572872,
 'aimless': 6.922918004572872,
 'aired': 6.922918004572872,
 'akasha': 6.922918004572872,
 'alert': 6.922918004572872,
 'alike': 6.922918004572872,
 'allison': 6.922918004572872,
 'allowing': 6.922918004572872,
 'alongside': 6.9229180

## Step - 2: Transform Method

In [18]:

def transform(dataset, vocabulary):
    
    columns = []
    rows = []
    values = []

    for index, row in enumerate(corpus):
    
        list_of_words_in_row = row.split()
        number_of_words_in_row = len(list_of_words_in_row)
        
        word_freq = dict(Counter(row.split()))
  
        for word, idf_value in top_50_word_idf_values.items():
            for word1, dimension in new_vocab.items():
                for word2, frequency in word_freq.items():
                    if word1 == word2 and word1 == word:
                        tf_idf = (frequency/number_of_words_in_row)*idf_value
                        columns.append(dimension)  
                        rows.append(index)
                        values.append(tf_idf) 
                    
                    else:
                        continue                  
              
    sparse_matrix = csr_matrix((values, (rows,columns)), shape=(len(corpus),len(vocabulary)))
    
    top_50_output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    
    return sparse_matrix, top_50_output


In [19]:
sparse_matrix_output, top_50_final_output = transform(corpus, new_vocab)

In [20]:
print(sparse_matrix_output)

  (0, 24)	0.865364750571609
  (19, 43)	0.015769744885131828
  (68, 21)	0.38460655580960396
  (72, 23)	1.3845836009145744
  (74, 25)	0.9889882863675531
  (89, 47)	0.7692131116192079
  (135, 3)	0.009918220636923885
  (135, 9)	0.009918220636923885
  (135, 15)	0.009918220636923885
  (135, 17)	0.009918220636923885
  (135, 29)	0.009918220636923885
  (135, 32)	0.009918220636923885
  (135, 40)	0.009918220636923885
  (176, 39)	0.24724707159188827
  (192, 18)	2.307639334857624
  (193, 20)	0.3009964349814292
  (216, 2)	0.4615278669715248
  (225, 16)	0.7692131116192079
  (227, 14)	1.153819667428812
  (241, 35)	1.730729501143218
  (270, 1)	0.22331993563138297
  (290, 22)	0.865364750571609
  (341, 34)	0.9889882863675531
  (344, 33)	0.865364750571609
  (348, 8)	0.5325321541979132
  (409, 5)	2.307639334857624
  (430, 31)	0.865364750571609
  (457, 36)	1.3845836009145744
  (461, 4)	0.49449414318377655
  (461, 44)	0.49449414318377655
  (465, 30)	0.6293561822338974
  (475, 28)	0.38460655580960396
  (493, 

In [21]:
print(top_50_final_output) #normalised values between 0 and 1

  (0, 24)	1.0
  (19, 43)	1.0
  (68, 21)	1.0
  (72, 23)	1.0
  (74, 25)	1.0
  (89, 47)	1.0
  (135, 3)	0.37796447300922725
  (135, 9)	0.37796447300922725
  (135, 15)	0.37796447300922725
  (135, 17)	0.37796447300922725
  (135, 29)	0.37796447300922725
  (135, 32)	0.37796447300922725
  (135, 40)	0.37796447300922725
  (176, 39)	1.0
  (192, 18)	1.0
  (193, 20)	1.0
  (216, 2)	1.0
  (225, 16)	1.0
  (227, 14)	1.0
  (241, 35)	1.0
  (270, 1)	1.0
  (290, 22)	1.0
  (341, 34)	1.0
  (344, 33)	1.0
  (348, 8)	1.0
  (409, 5)	1.0
  (430, 31)	1.0
  (457, 36)	1.0
  (461, 4)	0.7071067811865475
  (461, 44)	0.7071067811865475
  (465, 30)	1.0
  (475, 28)	1.0
  (493, 6)	1.0
  (500, 38)	1.0
  (544, 41)	1.0
  (548, 0)	0.7071067811865475
  (548, 26)	0.7071067811865475
  (608, 12)	1.0
  (612, 10)	1.0
  (620, 37)	0.7071067811865476
  (620, 42)	0.7071067811865476
  (632, 7)	1.0
  (644, 11)	0.5773502691896257
  (644, 45)	0.5773502691896257
  (644, 46)	0.5773502691896257
  (667, 19)	1.0
  (691, 27)	1.0
  (699, 48)	1.0
  

In [22]:
top_50_final_output

<746x50 sparse matrix of type '<class 'numpy.float64'>'
	with 50 stored elements in Compressed Sparse Row format>

In [23]:
print(top_50_final_output[0].toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]


In [24]:
numpy.shape(top_50_final_output[0]) 

(1, 50)

In [25]:
print(top_50_final_output[461].toarray())

[[0.         0.         0.         0.         0.70710678 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.70710678 0.         0.         0.
  0.         0.        ]]


In [26]:
print(top_50_final_output[735].toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1.]]


###############################################################################