In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
text = ['The quick brown fox jumped over the lazy dog.', 'The dog', 'The fox']

In [3]:
vectorizer = TfidfVectorizer()

In [5]:
vectorizer.fit(text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [6]:
print(vectorizer.vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


In [7]:
print(vectorizer.idf_)

[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


In [8]:
print(dict(zip(vectorizer.get_feature_names(),vectorizer.idf_)))

{'brown': 1.6931471805599454, 'dog': 1.2876820724517808, 'fox': 1.2876820724517808, 'jumped': 1.6931471805599454, 'lazy': 1.6931471805599454, 'over': 1.6931471805599454, 'quick': 1.6931471805599454, 'the': 1.0}


In [9]:
rr = dict(zip(vectorizer.get_feature_names(),vectorizer.idf_))
rr

{'brown': 1.6931471805599454,
 'dog': 1.2876820724517808,
 'fox': 1.2876820724517808,
 'jumped': 1.6931471805599454,
 'lazy': 1.6931471805599454,
 'over': 1.6931471805599454,
 'quick': 1.6931471805599454,
 'the': 1.0}

In [11]:
token_weight = pd.DataFrame(rr.items(), columns = ['word','weight'])
token_weight

Unnamed: 0,word,weight
0,brown,1.693147
1,dog,1.287682
2,fox,1.287682
3,jumped,1.693147
4,lazy,1.693147
5,over,1.693147
6,quick,1.693147
7,the,1.0


In [12]:
token_weight = token_weight.sort_values(by='weight', ascending=False)
token_weight

Unnamed: 0,word,weight
0,brown,1.693147
3,jumped,1.693147
4,lazy,1.693147
5,over,1.693147
6,quick,1.693147
1,dog,1.287682
2,fox,1.287682
7,the,1.0


In [14]:
vector = vectorizer.transform([text[0]])

In [15]:
print(vector.shape)
print(vector.toarray())

(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


In [16]:
new1 = vectorizer.transform(text)
new1.toarray()

array([[0.36388646, 0.27674503, 0.27674503, 0.36388646, 0.36388646,
        0.36388646, 0.36388646, 0.42983441],
       [0.        , 0.78980693, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.61335554],
       [0.        , 0.        , 0.78980693, 0.        , 0.        ,
        0.        , 0.        , 0.61335554]])

In [17]:
max_val1=new1.max(axis=0).toarray().ravel()
max_val1

array([0.36388646, 0.78980693, 0.78980693, 0.36388646, 0.36388646,
       0.36388646, 0.36388646, 0.61335554])

In [18]:
sort_by_tfidf=max_val1.argsort()
sort_by_tfidf

array([0, 3, 4, 5, 6, 7, 1, 2])

In [19]:
feature_names = np.array(vectorizer.get_feature_names())

In [20]:
print('lowest tfidf:]\n{}'.format(feature_names[sort_by_tfidf[3:]]))
print('highest tfidf:]\n{}'.format(feature_names[sort_by_tfidf[-3:]]))

lowest tfidf:]
['over' 'quick' 'the' 'dog' 'fox']
highest tfidf:]
['the' 'dog' 'fox']


In [21]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/movies_metadata1.csv')
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [22]:
overview = data['overview']
overview.head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [23]:
overview.isnull().any()

True

In [24]:
overview.shape

(45466,)

In [25]:
from sklearn.impute import SimpleImputer

In [26]:
imputer = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value='')

In [27]:
overview1 = imputer.fit_transform(np.array(overview.values).reshape(-1,1))
overview1

array([["Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."],
       ["When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."],
       ["A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interest

In [28]:
pd.isnull(overview1).any()

False

In [29]:
overview1 = overview1[:,0]
overview1.shape

(45466,)

In [30]:
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf = True,
    strip_accents="unicode",
    analyzer='word',
    token_pattern=r'\w{2,}',
    ngram_range=(1,1),
    max_features=30000
)

In [32]:
word_vectorizer.fit(overview1)
word_features = word_vectorizer.transform(overview1)

In [33]:
print(word_features.shape)

(45466, 30000)


In [34]:
print(word_vectorizer.vocabulary_)
print(len(word_vectorizer.vocabulary_))

30000


In [35]:
print(word_vectorizer.idf_)
print(len(word_vectorizer.idf_))

[ 9.15979271  5.97534908  9.23983542 ... 10.62612978  9.9329826
 10.33844771]
30000


In [39]:
wd = dict(zip(word_vectorizer.get_feature_names(),word_vectorizer.idf_))
wd

{'00': 9.159792709613278,
 '000': 5.97534908116656,
 '007': 9.239835417286814,
 '01': 9.7788319180195,
 '10': 6.007714365668592,
 '100': 6.9456185739632845,
 '1000': 8.834370309178649,
 '1001': 10.338447705954923,
 '100th': 9.932982597846758,
 '101': 9.326846794276443,
 '108': 9.932982597846758,
 '10th': 8.780303087908372,
 '11': 6.446627407844296,
 '11th': 8.834370309178649,
 '12': 6.145012241088591,
 '120': 8.952153344835033,
 '125': 9.527517489738594,
 '12th': 8.589247851145664,
 '13': 6.687789464661185,
 '130': 9.932982597846758,
 '13th': 8.39253755689961,
 '14': 6.741135445366477,
 '14th': 8.780303087908372,
 '15': 6.195312979563391,
 '150': 8.39253755689961,
 '1500': 10.115304154640713,
 '15th': 8.546688236726869,
 '16': 6.6007780876715545,
 '160': 10.115304154640713,
 '16mm': 8.952153344835033,
 '16th': 7.773498348493386,
 '17': 6.509806309465828,
 '1700s': 10.115304154640713,
 '175': 10.115304154640713,
 '17th': 7.647204623169094,
 '18': 6.482995052015172,
 '1800': 8.9521533448

In [40]:
token_weight1 = pd.DataFrame(wd.items(), columns = ['word','weight'])
token_weight1

Unnamed: 0,word,weight
0,00,9.159793
1,000,5.975349
2,007,9.239835
3,01,9.778832
4,10,6.007714
...,...,...
29995,не,9.422157
29996,но,9.778832
29997,он,10.626130
29998,по,9.932983


In [41]:
vector1 = word_vectorizer.transform([overview1[0]])

In [42]:
new2 = word_vectorizer.transform(overview1)
new2.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
max_val2=new2.max(axis=0).toarray().ravel()
max_val2

array([0.3512821 , 0.45009702, 0.35054216, ..., 0.52491707, 0.5091935 ,
       0.5872349 ])

In [44]:
sort_by_tfidf1=max_val2.argsort()
sort_by_tfidf1

array([15218,  1573,  2305, ...,  2114, 29985, 19393])

In [45]:
feature_names1 = np.array(word_vectorizer.get_feature_names())

In [46]:
print('lowest tfidf:]\n{}'.format(feature_names1[sort_by_tfidf1[10:]]))
print('highest tfidf:]\n{}'.format(feature_names1[sort_by_tfidf1[-10:]]))

lowest tfidf:]
['circulating' 'claymation' 'mok' ... 'available' 'τους' 'overview']
highest tfidf:]
['juho' 'film' 'directed' 'uuno' 'beast' 'released' 'documentary'
 'available' 'τους' 'overview']
