<a href="https://colab.research.google.com/github/poojashah19/Data-Science/blob/main/Assignment%205/TFIDF_vectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task-1

In [None]:
# Toy corpus for Task-1: four short sentences, each treated as one document.
corpus = [
     'this is the first document',
     'this document is the second document',
     'and this is the third one',
     'is this the first document',
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from pandas import DataFrame

def document_matrix(list, vectorizer):
    """Fit ``vectorizer`` on the documents and return the result as a DataFrame.

    Parameters
    ----------
    list :
        The documents, passed unchanged to ``vectorizer.fit_transform``.
        The name shadows the built-in ``list``; it is kept so that existing
        keyword callers keep working.
    vectorizer :
        Object exposing ``fit_transform`` and either ``get_feature_names_out``
        or the legacy ``get_feature_names``.

    Returns
    -------
    pandas.DataFrame
        Dense copy of the fitted matrix with one column per feature name.
    """
    doc_matrix = vectorizer.fit_transform(list)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and only fall back on old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    return DataFrame(doc_matrix.toarray(), columns = feature_names)

# Default-configured scikit-learn vectorizers, used as the reference
# implementation that the hand-written code is compared against.
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

In [None]:
## Print how many times each vocabulary word appears in each document
count_output = document_matrix(corpus, count_vectorizer)
print(count_output)

   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         2      0   1    0       1    1      0     1
2    1         0      0   1    1       0    1      1     1
3    0         1      1   1    0       0    1      0     1


In [None]:
## Print the TF-IDF weight of each vocabulary word in each document
tfidf_output = document_matrix(corpus, tfidf_vectorizer)
print(tfidf_output)

        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so convert it to a plain list to keep the printed form.
print(tfidf_vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# IDF weights learned by the fitted vectorizer (one value per vocabulary term).
print(tfidf_vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# (rows, columns) of the TF-IDF DataFrame: one row per document, one column per word.
tfidf_output.shape

(4, 9)

In [None]:
# transform() reuses the already-fitted vectorizer; the sparse result prints as
# "(row, column)  value" entries.
skl_output = tfidf_vectorizer.transform(corpus)
print(skl_output)

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 3)	0.281088674033753
  (1, 1)	0.6876235979836938
  (2, 8)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 0)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045


In [None]:
# Sparse TF-IDF row of the document at index 3.
print(skl_output[3])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# Dense view of the TF-IDF row of the document at index 0.
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


In [None]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy as np

In [None]:
def get_unique_words(data):
  """Collect the distinct words of a list of sentences in sorted order.

  Each sentence is split on single spaces and tokens shorter than two
  characters are dropped.

  Returns the sorted list of remaining tokens. When ``data`` is not a list,
  a hint is printed and ``None`` is returned.
  """
  if not isinstance(data, list):
    print('pass list of sentences')
    return None

  kept_tokens = {
      token
      for sentence in data
      for token in sentence.split(' ')
      if len(token) >= 2
  }
  return sorted(kept_tokens)


def get_vocab(unique_words):
  """Map every word to its position in ``unique_words``.

  If a word occurs more than once, the position of its last occurrence wins.
  """
  word_to_index = {}
  for position, word in enumerate(unique_words):
    word_to_index[word] = position
  return word_to_index


def transform(corpus, vocab):
  """Build the sparse term-count matrix of ``corpus`` over ``vocab``.

  Each document is split on whitespace. Tokens shorter than two characters,
  and tokens that ``vocab`` does not map to a column, are ignored.

  Returns a ``csr_matrix`` of shape ``(len(corpus), len(vocab))`` holding the
  per-document token counts. When ``corpus`` is not a list, a hint is printed
  and ``None`` is returned.
  """
  if not isinstance(corpus, list):
    print('pass a list of strings')
    return None

  row_ids, col_ids, counts = [], [], []
  for doc_id, sentence in enumerate(tqdm(corpus)):
    for token, occurrences in Counter(sentence.split()).items():
      if len(token) < 2:
        continue
      # -1 is the "not in vocabulary" sentinel.
      column = vocab.get(token, -1)
      if column == -1:
        continue
      row_ids.append(doc_id)
      col_ids.append(column)
      counts.append(occurrences)

  matrix_shape = (len(corpus), len(vocab))
  return csr_matrix((counts, (row_ids, col_ids)), shape = matrix_shape)


def get_freq(corpus, unique_words):
  """Count how often each word of ``unique_words`` occurs in the whole corpus.

  Parameters
  ----------
  corpus : iterable of str
      Documents; each is split on single spaces.
  unique_words : iterable of str
      Words to report on.

  Returns
  -------
  dict
      ``{word: total occurrences}`` in the order of ``unique_words``; words
      that never occur map to 0.
  """
  # One pass over the corpus instead of a full list.count() scan per word.
  token_counts = Counter(
      token for sentence in corpus for token in sentence.split(' ')
  )
  return {word: token_counts[word] for word in unique_words}


def find_in_str(str, word):
  """Tell whether ``word`` equals one of the space-separated tokens of ``str``.

  Only whole tokens match; a substring of a longer token does not.
  """
  return word in str.split(' ')


def compute_tfidf(corpus, unique_words, transform_output):
  """Compute un-normalised TF-IDF weights from a dense term-count matrix.

  For every document ``i`` and word ``j`` with a positive count:

  * TF  = count / sum of the positive counts in document ``i``
  * IDF = ln((N + 1) / (df + 1)) + 1, where ``N = len(corpus)`` and ``df`` is
    the value returned by ``get_idf(corpus, word)``

  Parameters
  ----------
  corpus : sequence of str
      The documents; used for ``N`` and handed to ``get_idf``.
  unique_words : sequence of str
      Vocabulary; position ``j`` names column ``j`` of ``transform_output``.
  transform_output : 2-D indexable
      ``transform_output[i][j]`` is the count of word ``j`` in document ``i``.

  Returns
  -------
  scipy.sparse.csr_matrix
      Shape ``(len(corpus), len(unique_words))``; rows are not normalised.

  Note: a later cell of this notebook defines another ``compute_tfidf`` with
  a different signature, which replaces this one once that cell has run.
  """
  n_docs = len(corpus)
  n_words = len(unique_words)
  # IDF depends only on the word, so look it up once per column instead of
  # rescanning the corpus for every (document, word) cell.
  idf_by_column = {}

  rows = []
  columns = []
  values = []

  for i in range(n_docs):
    doc_counts = transform_output[i]
    total = sum(doc_counts[j] for j in range(n_words) if doc_counts[j] > 0)

    for j in range(n_words):
      term_count = doc_counts[j]
      if term_count > 0:
        if j not in idf_by_column:
          doc_freq = get_idf(corpus, unique_words[j])
          idf_by_column[j] = math.log((n_docs + 1) / float(doc_freq + 1)) + 1
        rows.append(i)
        columns.append(j)
        values.append(term_count / total * idf_by_column[j])
  return csr_matrix((values, (rows, columns)), shape = (n_docs, n_words))


def get_idf(corpus, word):
  """Count the documents of ``corpus`` that contain ``word`` as a whole token.

  Despite the name, the result is a document frequency (an integer count),
  not an inverse document frequency; the caller applies the logarithm.
  """
  return sum(
      1
      for position in range(len(corpus))
      if find_in_str(corpus[position], word)
  )


In [None]:
# Build the vocabulary and the raw term-count matrix with the hand-written helpers.
unique_words = get_unique_words(corpus)
vocab = get_vocab(unique_words)
frequency_of_words = get_freq(corpus, unique_words)
sparse_matrix = transform(corpus, vocab)
# Reuse the sparse result instead of running transform() (and its progress bar) a second time.
transform_output = sparse_matrix.toarray()
print("\n")
tf_idf = compute_tfidf(corpus, unique_words, transform_output)
# L2-normalise each row so the values are comparable with TfidfVectorizer's output.
# The former second print argument `6` only printed a literal "6" after the matrix.
print(normalize(tf_idf, norm = 'l2'))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]



  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149 6





In [None]:
# Re-run the scikit-learn reference so its output can be compared side by side
# with the hand-written result printed above.
tfidf_output = document_matrix(corpus, tfidf_vectorizer)
print(tfidf_output)

        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


## Observation:
*   The list of unique words and their frequencies across the entire corpus is the same as the one calculated using TfidfVectorizer.
*   The vocabulary (the unique words) used by the custom transform is the same as the `get_feature_names` output of scikit-learn's TfidfVectorizer.
*   The custom transform output matrix is the same as `count_output`, which we calculated previously using scikit-learn's CountVectorizer.
*   The shape of the custom transform matrix matches the shape of the matrix produced by TfidfVectorizer's transform method.
*   The IDF values of all unique words in the corpus match the values computed by scikit-learn's TfidfVectorizer (its `idf_` attribute).
*   The TF-IDF values, calculated by multiplying TF × IDF for each word and then L2-normalizing each row, match the values produced by TfidfVectorizer.




# Task-2

In [None]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open 'cleaned_strings' if it comes from a trusted source.
with open('cleaned_strings', 'rb') as f:
  corpus = pickle.load(f)

print("Number of documents in corpus = ",len(corpus))

# Fit the shared TfidfVectorizer on the Task-2 corpus. This rebinds `corpus`
# and `tfidf_output`, which held the Task-1 data until now.
tfidf_output = document_matrix(corpus, tfidf_vectorizer)
# Keep only the last 500 rows of the TF-IDF frame for the steps below.
data = tfidf_output[-500:]
print(data)
# Show the rows whose weight in the column at position 78 is non-zero.
# NOTE(review): 78 is an unexplained magic index — confirm which word it is meant to select.
print( tfidf_output[(tfidf_output.iloc[:, 78] != 0)] ) 

Number of documents in corpus =  746
     aailiyah  abandoned  ability  abroad  ...  yun  zillion  zombie  zombiez
246       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
247       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
248       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
249       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
250       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
..        ...        ...      ...     ...  ...  ...      ...     ...      ...
741       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
742       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
743       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
744       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
745       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0

[500 rows x 2886 columns]


In [None]:
def compute_topidf(corpus, top_n=200):
    """Return the ``top_n`` columns of ``corpus`` with the highest IDF.

    The IDF of a column is ``ln((N + 1) / (df + 1)) + 1``, where ``N`` is the
    number of rows and ``df`` the number of rows whose value in that column
    is greater than zero.

    Parameters
    ----------
    corpus : pandas.DataFrame
        One row per document, one column per word.
    top_n : int or None, default 200
        How many entries to keep; ``None`` keeps all of them. The default
        matches the previously hard-coded limit.

    Returns
    -------
    dict
        ``{column: idf}`` ordered by descending IDF; ties keep the original
        column order because the sort is stable.
    """
    n_docs = len(corpus)
    # One vectorised pass instead of filtering the whole frame once per column.
    doc_freq = (corpus > 0).sum(axis=0)
    idf = {
        col: math.log((n_docs + 1) / (int(freq) + 1)) + 1
        for col, freq in doc_freq.items()
    }
    ranked = sorted(idf.items(), key = lambda item: item[1], reverse=True)
    return dict(ranked[:top_n])


def compute_tfidf(corpus, top50_vocab):
  """Compute un-normalised TF-IDF weights restricted to selected columns.

  Only columns whose name is in ``top50_vocab`` take part. For every row
  ``i`` and selected column ``j`` with a positive value:

  * TF  = value / sum of the positive selected values in row ``i``
  * IDF = ln((N + 1) / (df + 1)) + 1, where ``N = len(corpus)`` and ``df`` is
    the number of rows with a positive value in column ``j``

  Parameters
  ----------
  corpus : pandas.DataFrame
      One row per document, one column per word.
  top50_vocab : container of column names
      Columns to keep; only membership (``name in top50_vocab``) is used.

  Returns
  -------
  scipy.sparse.csr_matrix
      Shape ``(len(corpus), len(corpus.columns))``; unselected columns stay
      empty and rows are not normalised.

  Note: this redefines the three-argument ``compute_tfidf`` from Task-1,
  which is no longer reachable once this cell has run.
  """
  n_docs = len(corpus)
  n_cols = len(corpus.columns)
  selected = [j for j, name in enumerate(corpus.columns) if name in top50_vocab]

  # Pull the selected columns out once; per-cell .iloc lookups are slow.
  weights = corpus.iloc[:, selected].to_numpy()

  # IDF depends only on the column, so compute it once per selected column.
  idf = [
      math.log((n_docs + 1) / (int((weights[:, k] > 0).sum()) + 1)) + 1
      for k in range(len(selected))
  ]

  rows = []
  columns = []
  values = []
  for i in range(n_docs):
    # The same "> 0" test drives both the row total and the emitted entries,
    # so a row can never divide by a zero total.
    positive = [
        (k, weights[i, k]) for k in range(len(selected)) if weights[i, k] > 0
    ]
    total = sum(value for _, value in positive)
    for k, value in positive:
      rows.append(i)
      columns.append(selected[k])
      values.append(value / total * idf[k])
  return csr_matrix((values, (rows, columns)), shape = (n_docs, n_cols))

In [None]:
# The vocabulary of the Task-2 frame is simply its column index.
unique_words = data.columns
vocab = get_vocab(unique_words)
# Pick the columns with the highest IDF (the helper's slice limit decides how many).
top50_vocab = compute_topidf(data)
print(top50_vocab)

# Un-normalised TF-IDF restricted to the selected columns, then the L2-normalised version.
tf_idf = compute_tfidf(data, top50_vocab)
print(tf_idf)
print(normalize(tf_idf, norm = 'l2'))

{'abroad': 7.2166061010848646, 'abstruse': 7.2166061010848646, 'accurately': 7.2166061010848646, 'actions': 7.2166061010848646, 'admins': 7.2166061010848646, 'admiration': 7.2166061010848646, 'admitted': 7.2166061010848646, 'adrift': 7.2166061010848646, 'adventure': 7.2166061010848646, 'affected': 7.2166061010848646, 'affleck': 7.2166061010848646, 'agreed': 7.2166061010848646, 'aimless': 7.2166061010848646, 'aired': 7.2166061010848646, 'akin': 7.2166061010848646, 'allison': 7.2166061010848646, 'amateurish': 7.2166061010848646, 'amaze': 7.2166061010848646, 'amusing': 7.2166061010848646, 'angel': 7.2166061010848646, 'angelina': 7.2166061010848646, 'angles': 7.2166061010848646, 'angry': 7.2166061010848646, 'animals': 7.2166061010848646, 'anthony': 7.2166061010848646, 'appealing': 7.2166061010848646, 'appears': 7.2166061010848646, 'applause': 7.2166061010848646, 'apt': 7.2166061010848646, 'array': 7.2166061010848646, 'articulated': 7.2166061010848646, 'artiness': 7.2166061010848646, 'artis

In [None]:
# `data` is a DataFrame of TF-IDF weights; iterating over a DataFrame yields its
# column names, so fitting on it treated every vocabulary word as a one-word
# document (hence the identical idf_ values). Fit on the raw text of the same
# last-500 documents instead.
# NOTE(review): assumes `corpus` is still the sequence of strings loaded from
# 'cleaned_strings' — confirm.
documents_tail = corpus[-500:]
# NOTE(review): per the scikit-learn docs, max_features keeps the terms with the
# highest corpus term frequency, not the highest IDF — confirm this is intended.
vectorizer = TfidfVectorizer(max_features=50)
vectorizer.fit(documents_tail)
new_output = vectorizer.transform(documents_tail)
print(vectorizer.idf_)
print(vectorizer.vocabulary_)

[8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826
 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826
 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826
 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826
 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826
 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826 8.274826
 8.274826 8.274826]
{'aailiyah': 0, 'portrayed': 1, 'portraying': 2, 'positive': 3, 'possible': 4, 'possibly': 5, 'post': 6, 'potted': 7, 'power': 8, 'powerful': 9, 'powerhouse': 10, 'practical': 11, 'practically': 12, 'pray': 13, 'precisely': 14, 'predict': 15, 'predictable': 16, 'predictably': 17, 'prejudice': 18, 'prelude': 19, 'premise': 20, 'prepared': 21, 'presence': 22, 'presents': 23, 'preservation': 24, 'president': 25, 'pretentious': 26, 'pretext': 27, 'pretty': 28, 'previous': 29, 'primal': 30, 'primary': 31, 'probably': 32, 'problem': 33, 'prob

In [None]:
# Pass the raw documents, not the `data` DataFrame: a DataFrame iterates as its
# column names, which made every word its own document and produced an identity
# matrix.
# NOTE(review): assumes `corpus` is still the sequence of strings loaded from
# 'cleaned_strings' — confirm.
tfidf_output = document_matrix(corpus[-500:], tfidf_vectorizer)
print(tfidf_output)

      aailiyah  abandoned  ability  abroad  ...  yun  zillion  zombie  zombiez
0          1.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
1          0.0        1.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
2          0.0        0.0      1.0     0.0  ...  0.0      0.0     0.0      0.0
3          0.0        0.0      0.0     1.0  ...  0.0      0.0     0.0      0.0
4          0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
...        ...        ...      ...     ...  ...  ...      ...     ...      ...
2881       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      0.0
2882       0.0        0.0      0.0     0.0  ...  1.0      0.0     0.0      0.0
2883       0.0        0.0      0.0     0.0  ...  0.0      1.0     0.0      0.0
2884       0.0        0.0      0.0     0.0  ...  0.0      0.0     1.0      0.0
2885       0.0        0.0      0.0     0.0  ...  0.0      0.0     0.0      1.0

[2886 rows x 2886 columns]
