# Assignment

## Task-1

### Corpus

In [287]:
# Collection of string documents (toy corpus used to compare against sklearn)

corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

### SkLearn Implementation

In [288]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

In [289]:
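# sklearn feature names, they are sorted in alphabetic order by default.
# Note: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use vectorizer.get_feature_names_out() instead.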
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [290]:
# Here we will print the sklearn tfidf vectorizer idf values after applying the fit method
# After using the fit function on the corpus the vocab has 9 words in it, and each has its idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [291]:
# shape of sklearn tfidf vectorizer output after applying transform method.
skl_output.shape

(4, 9)

In [292]:
# sklearn tfidf values for first line of the above corpus.
# Here the output is a sparse matrix
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [293]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Your custom implementation

In [294]:
# Write your code here.
# Make sure it's well documented and readable, with appropriate comments.
# Compare your results with the above sklearn tfidf vectorizer
# You are not supposed to use any other library apart from the ones given below

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [295]:
def fit(data):
    """Learn the vocabulary and smoothed IDF weights from a corpus.

    Documents are tokenised with ``str.split()`` (plain whitespace splitting;
    no lower-casing or punctuation handling is performed here).

    Parameters
    ----------
    data : list of str
        The corpus, one string per document.

    Returns
    -------
    tuple(dict, dict)
        ``(vocab, idf)`` where ``vocab`` maps every unique word to its column
        index (words are indexed in sorted order) and ``idf`` maps every word
        to ``1 + ln((1 + N) / (1 + df))``, with ``N`` the number of documents
        and ``df`` the number of documents containing the word.
    str
        The message ``"input must be a list of strings"`` when ``data`` is not
        a list. This sentinel return is kept so existing callers behave the
        same; note that unpacking it into two names will fail.
    """
    if not isinstance(data, list):
        return "input must be a list of strings"

    # Document frequency in a single pass over the corpus. Converting each
    # document to a set makes a word count once per document, however many
    # times it repeats inside that document.
    doc_freq = Counter()
    for doc in data:
        doc_freq.update(set(doc.split()))

    n_docs = len(data)
    unq_words = sorted(doc_freq)  # sorted unique words define the column order

    # Smoothed IDF: the +1 in numerator and denominator avoids division by
    # zero, the leading 1 keeps words present in every document from getting
    # a zero weight.
    idf = {
        word: 1 + math.log((n_docs + 1) / (doc_freq[word] + 1))
        for word in unq_words
    }
    vocab = {word: index for index, word in enumerate(unq_words)}

    return vocab, idf

In [296]:
vocab , idf = fit(corpus)
print(vocab.keys()) #list of sorted unique words

dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])


In [297]:
idf.keys() # list of sorted unique words

dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])

In [298]:
#list of IDF values corresponding to unique words
idf.values() 

dict_values([1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0])

In [299]:
def trans(data, vocab, idf):
    """Build the (un-normalised) TF-IDF sparse matrix for a corpus.

    Parameters
    ----------
    data : list of str
        Documents to transform; each one is tokenised with ``str.split()``.
    vocab : dict
        Maps a word to its column index in the output matrix.
    idf : dict
        Maps a word to its IDF weight; it is looked up for every word found
        in ``vocab``.

    Returns
    -------
    scipy.sparse.csr_matrix or None
        A matrix of shape ``(len(data), len(vocab))`` whose entry for a
        document/word pair is ``(count / tokens_in_document) * idf[word]``.
        Words that are not in ``vocab`` are ignored. When ``data`` is not a
        list a message is printed and ``None`` is returned.
    """
    if not isinstance(data, list):
        print("you need to pass list of strings")
        return None

    # Coordinate-format triplets: weights[k] lives at (row_ids[k], col_ids[k]).
    row_ids, col_ids, weights = [], [], []

    for doc_id, doc in enumerate(data):
        tokens = doc.split()
        n_tokens = len(tokens)  # document length, the denominator of the term frequency

        # Counter gives {word: occurrences in this document}.
        for token, count in Counter(tokens).items():
            # -1 is the "not in vocabulary" sentinel.
            column = vocab.get(token, -1)
            if column == -1:
                continue

            row_ids.append(doc_id)
            col_ids.append(column)
            # term frequency times inverse document frequency
            weights.append(count / n_tokens * idf[token])

    return csr_matrix((weights, (row_ids, col_ids)), shape=(len(data), len(vocab)))

In [300]:
tf_idf = trans(corpus,vocab, idf) #returns a sparse matrix of tfidf

In [301]:
tf_idf.shape # shape of the tf_idf

(4, 9)

In [302]:
print(tf_idf[0])  # sparse matrix before normalization for first line in the corpus

  (0, 1)	0.24462871026284194
  (0, 2)	0.3021651247531982
  (0, 3)	0.2
  (0, 6)	0.2
  (0, 8)	0.2


In [303]:
norm_tfidf = normalize(tf_idf) # normalizing the sparse matrix

In [304]:
print(norm_tfidf[0]) # normalized tfidf sparse matrix for first line in corpus

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [305]:
print(skl_output[0]) # scikit learn output of first line in corpus

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


## Task-2

In [306]:
# Below is the code to load the cleaned_strings pickle file provided
# Here corpus is of list type
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# 'cleaned_strings' from a trusted source (here: the file provided with the assignment).

import pickle
with open('cleaned_strings', 'rb') as f:
    corpus_1 = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus_1))

Number of documents in corpus =  746


In [307]:
def fit_50(data, top_n=50):
    """Learn a vocabulary restricted to the words with the highest IDF.

    Documents are tokenised with ``str.split()``. The IDF of every unique word
    is computed as ``1 + ln((1 + N) / (1 + df))`` (``N`` documents, ``df``
    documents containing the word) and only the ``top_n`` highest-scoring
    words are kept. Ties are broken alphabetically, because the words are
    scored in sorted order and the ranking sort is stable.

    Parameters
    ----------
    data : list of str
        The corpus, one string per document.
    top_n : int, optional
        Maximum number of words to keep (default 50). If the corpus has fewer
        unique words, all of them are returned.

    Returns
    -------
    tuple(dict, dict)
        ``(vocab, top_idf)`` where ``top_idf`` maps each selected word to its
        IDF (in descending-IDF order) and ``vocab`` maps the same words to
        their column index ``0 .. len(top_idf) - 1`` in that order.
    str
        The message ``"input must be a list of strings"`` when ``data`` is not
        a list. This sentinel return is kept so existing callers behave the
        same; note that unpacking it into two names will fail.
    """
    if not isinstance(data, list):
        return "input must be a list of strings"

    # Document frequency in a single pass; set() counts a word once per document.
    doc_freq = Counter()
    for doc in data:
        doc_freq.update(set(doc.split()))

    n_docs = len(data)

    # IDF for every word, inserted in alphabetical order so that the stable
    # sort below resolves equal IDF values alphabetically.
    idf = {
        word: 1 + math.log((n_docs + 1) / (doc_freq[word] + 1))
        for word in sorted(doc_freq)
    }

    # Keep the top_n words with the largest IDF.
    ranked = sorted(idf.items(), key=lambda item: item[1], reverse=True)[:top_n]
    top_idf = dict(ranked)

    # Column index of each selected word follows the ranking order.
    vocab = {word: index for index, word in enumerate(top_idf)}

    return vocab, top_idf

In [308]:
vocab , idf = fit_50(corpus_1) # dicts with vocab and idf values of top 50 idf valued words

In [309]:
vocab

{'aailiyah': 0,
 'abandoned': 1,
 'abroad': 2,
 'abstruse': 3,
 'academy': 4,
 'accents': 5,
 'accessible': 6,
 'acclaimed': 7,
 'accolades': 8,
 'accurate': 9,
 'accurately': 10,
 'achille': 11,
 'ackerman': 12,
 'actions': 13,
 'adams': 14,
 'add': 15,
 'added': 16,
 'admins': 17,
 'admiration': 18,
 'admitted': 19,
 'adrift': 20,
 'adventure': 21,
 'aesthetically': 22,
 'affected': 23,
 'affleck': 24,
 'afternoon': 25,
 'aged': 26,
 'ages': 27,
 'agree': 28,
 'agreed': 29,
 'aimless': 30,
 'aired': 31,
 'akasha': 32,
 'akin': 33,
 'alert': 34,
 'alike': 35,
 'allison': 36,
 'allow': 37,
 'allowing': 38,
 'alongside': 39,
 'amateurish': 40,
 'amaze': 41,
 'amazed': 42,
 'amazingly': 43,
 'amusing': 44,
 'amust': 45,
 'anatomist': 46,
 'angel': 47,
 'angela': 48,
 'angelina': 49}

In [310]:
idf

{'aailiyah': 6.922918004572872,
 'abandoned': 6.922918004572872,
 'abroad': 6.922918004572872,
 'abstruse': 6.922918004572872,
 'academy': 6.922918004572872,
 'accents': 6.922918004572872,
 'accessible': 6.922918004572872,
 'acclaimed': 6.922918004572872,
 'accolades': 6.922918004572872,
 'accurate': 6.922918004572872,
 'accurately': 6.922918004572872,
 'achille': 6.922918004572872,
 'ackerman': 6.922918004572872,
 'actions': 6.922918004572872,
 'adams': 6.922918004572872,
 'add': 6.922918004572872,
 'added': 6.922918004572872,
 'admins': 6.922918004572872,
 'admiration': 6.922918004572872,
 'admitted': 6.922918004572872,
 'adrift': 6.922918004572872,
 'adventure': 6.922918004572872,
 'aesthetically': 6.922918004572872,
 'affected': 6.922918004572872,
 'affleck': 6.922918004572872,
 'afternoon': 6.922918004572872,
 'aged': 6.922918004572872,
 'ages': 6.922918004572872,
 'agree': 6.922918004572872,
 'agreed': 6.922918004572872,
 'aimless': 6.922918004572872,
 'aired': 6.922918004572872,

In [311]:
tf_idf_1 = trans(corpus_1, vocab,idf) # returns sparse matrix of coulmn size 50

In [312]:
print(tf_idf_1[0]) # sparse vector of first doc in corpus before normalization

  (0, 30)	0.865364750571609


In [313]:
corpus_1[0] #first doc in corpus

'slow moving aimless movie distressed drifting young man'

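The first document contains only one word, "aimless", that is present in the 50-word vocabulary, so its TF-IDF row has a single non-zero entry (column 30).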

In [314]:
norm_tfidf = normalize(tf_idf_1) # normalizing the sparse matrix

In [315]:
print(norm_tfidf[0].toarray()) # prints norm_sparse veector of first doc with 50 features

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0.]]


<p><center><b>End of the Document</b></center></p>