In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the full text of Claude.txt. The context manager guarantees the handle
# is closed even if read() raises, which the manual open/close pair did not.
with open('Claude.txt', 'r') as file:
    claude = file.read()

In [3]:
# Load the full text of Ida.txt. The context manager guarantees the handle
# is closed even if read() raises, which the manual open/close pair did not.
with open("Ida.txt", 'r') as file:
    ida = file.read()

In [4]:
# Load the full text of Youyou_Claire.txt. The context manager guarantees the
# handle is closed even if read() raises, which the manual open/close pair did not.
with open('Youyou_Claire.txt', 'r') as file:
    yc = file.read()

In [6]:
# Corpus to vectorize: one string per source file, in the order they were loaded.
# Row i of every matrix built from `docs` below corresponds to docs[i].
docs = [claude, ida, yc]

# First create CountVectorizer to count frequency of words

`CountVectorizer` (CV) does the following by default:

- lowercases your text (set `lowercase=False` if you don't want lowercasing)
- uses utf-8 encoding
- performs tokenization (converts raw text to smaller units of text)
- uses word-level tokenization (meaning each word is treated as a separate token)
- ignores single characters during tokenization (say goodbye to words like 'a' and 'I')

In [7]:
# Build the vectorizer with default settings, then learn the vocabulary from
# the corpus and produce the document-term count matrix in a single call.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)

In [9]:
# Shape of the count matrix: (number of documents, number of vocabulary terms).
word_count_vector.shape

(3, 1676)

Note that the 3 rows correspond to the 3 documents in the initial `docs` list; the 1676 columns are the distinct words in the learned vocabulary.

# Compute IDF values

In [12]:
# Fit on the raw counts so the transformer learns one IDF weight per vocabulary term.
# The bare fit() call is left as the last expression so the cell displays the
# fitted transformer's settings.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

# Make a DataFrame to see the IDF weight of each word

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on either.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per vocabulary term, holding the IDF weight learned by the transformer.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=['idf_weights'])

# Ascending sort: the lowest weights (terms present in every document) come first.
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
these,1.000000
happen,1.000000
many,1.000000
happened,1.000000
go,1.000000
...,...
ha,1.693147
gym,1.693147
guys,1.693147
handle,1.693147


# Compute TF-IDF score per doc

If you are scoring a new set of docs with the trained model, run `cv.transform` followed by `tfidf_transformer.transform` to create the new vectors.
Since we are using the same docs here, just run `tfidf_transformer.transform` on the existing count matrix to get the TF-IDF scores.

In [16]:
# Re-weight the existing count matrix with the fitted transformer. The corpus is
# unchanged, so the counts from cv.fit_transform are reused rather than recomputed.
tfidf_vector = tfidf_transformer.transform(word_count_vector)

In [17]:
# Row 0 holds the scores for the first entry of `docs`.
first_doc_vectors = tfidf_vector[0]

In [23]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on either.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Transpose the single-row sparse vector into a column so each vocabulary term
# gets its own row. toarray() yields a plain ndarray instead of the np.matrix
# that todense() returns.
first_doc_df = pd.DataFrame(
    first_doc_vectors.T.toarray(),
    index=feature_names,
    columns=['tfidf scores'],
)

# Highest-scoring terms for the first document come first.
first_doc_df.sort_values(by=['tfidf scores'], ascending=False)

Unnamed: 0,tfidf scores
it,0.411463
of,0.381289
the,0.301740
you,0.263336
and,0.230419
...,...
helping,0.000000
helpful,0.000000
help,0.000000
hear,0.000000


# Use `TfidfVectorizer`, which does it all in one go (same `fit` / `transform` calls)

In [25]:
# Single-step alternative to the CountVectorizer + TfidfTransformer pair used above.
# TfidfVectorizer is imported with the other scikit-learn names in the notebook's
# first cell, so the notebook's dependencies are all declared in one place.
tfidf_vectorizer = TfidfVectorizer()

In [26]:
# Fit the vectorizer on the corpus; the value returned by fit() is what the
# following cells call transform() and the feature-name lookup on.
tfidf_vec = tfidf_vectorizer.fit(docs) 

In [27]:
# TF-IDF matrix for the same corpus the vectorizer was fitted on.
vectors = tfidf_vec.transform(docs)

In [30]:
# Display the sparse matrix summary (shape, dtype, stored elements, storage format).
vectors

<3x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 2493 stored elements in Compressed Sparse Row format>

In [31]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on either.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vec.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vec.get_feature_names()

# Column of TF-IDF scores for the first document, one row per vocabulary term.
# toarray() yields a plain ndarray instead of the np.matrix that todense() returns.
test_df_doc1 = pd.DataFrame(
    vectors[0].T.toarray(),
    index=tfidf_feature_names,
    columns=['tfidf scores'],
)

In [32]:
# Rank the first document's terms from highest to lowest TF-IDF score.
test_df_doc1.sort_values('tfidf scores', ascending=False)

Unnamed: 0,tfidf scores
it,0.411463
of,0.381289
the,0.301740
you,0.263336
and,0.230419
...,...
helping,0.000000
helpful,0.000000
help,0.000000
hear,0.000000
