In [10]:
import math
from collections import Counter

from tqdm import tqdm

from preprocessing import get_df, preprocess

In [5]:
def compute_tf(text):
    tf_text = Counter(text)

    for i in tf_text:
        tf_text[i] /= float(len(text))

    return tf_text

def compute_idf(word, corpus):
    weight = sum([1.0 for document in corpus if word in document])
    relative_weight = math.log10(len(corpus)/weight)
    return relative_weight

def compute_tfidf(corpus):
    documents_list = []

    for text in tqdm(corpus):
        tf_idf_dictionary = {}
        idf_dictionary = {}

        computed_tf = compute_tf(text)

        for word in computed_tf:
            if word not in idf_dictionary:
                idf_dictionary[word] = compute_idf(word, corpus)

            tf_idf_dictionary[word] = computed_tf[word] * idf_dictionary[word]

        documents_list.append(tf_idf_dictionary)

    return documents_list


In [6]:
df = get_df()
preprocessed = preprocess(df.text)

In [7]:
tfidf = compute_tfidf(preprocessed)

100%|██████████| 5572/5572 [03:09<00:00, 29.38it/s]


In [9]:
tfidf


[{'go': 0.06338275046576704,
  'until': 0.1102213020758542,
  'jurong': 0.17838148132152026,
  'point': 0.1287913534568429,
  'crazy': 0.12380395581303273,
  'available': 0.12104243452838101,
  'only': 0.06997853826565151,
  'in': 0.04034694886487773,
  'bugis': 0.1413266598746801,
  'n': 0.0786947344090329,
  'great': 0.08213437184199941,
  'world': 0.10607129370828754,
  'la': 0.13813871751131757,
  'e': 0.08881892930814926,
  'buffet': 0.16404671962323544,
  'cine': 0.13813871751131757,
  'there': 0.0735530468300702,
  'got': 0.06618977383613349,
  'amore': 0.17838148132152026,
  'wat': 0.08486777525744622,
  'nannannan': 0.01898481392547502},
 {'ok': 0.1920872238529117,
  'lar': 0.31636174152995294,
  'joking': 0.43529158620227243,
  'wif': 0.333005394254444,
  'u': 0.1189610294273582,
  'oni': 0.4669842647188947,
  'nannannan': 0.056954441776425065},
 {'free': 0.048264073562113353,
  'entry': 0.16443332908512642,
  'in': 0.029216756074566634,
  'a': 0.023309422108419683,
  'wkly':