In [10]:
import pandas as pd
import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
df = pd.read_csv('data/preprocessed_data.csv')

In [12]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,original_title,overview,genre_ids,genre_names,description_corrected,description_tokenized,description_corrected_stemming,description_corrected_lemmatize
0,0,0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]","Drama, Crime",imprisoned 1940s double murder wife lo...,imprisoned 1940s double murder wife lo...,imprison 1940 doubl murder wife lover upstand ...,imprisoned 1940s double murder wife lover upst...
1,1,1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]","Drama, Crime",spanning years 1945 1955 chronicle fictio...,spanning years 1945 1955 chronicle fictio...,span year 1945 1955 chronicl fiction italianam...,spanning year 1945 1955 chronicle fictional it...
2,2,2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]","Drama, Crime",continuing saga corleone crime family you...,continuing saga corleone crime family you...,continu saga corleon crime famili young vito c...,continuing saga corleone crime family young vi...
3,3,3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]","Drama, History, War",true story businessman oskar schindler save...,true story businessman oskar schindler save...,true stori businessman oskar schindler save th...,true story businessman oskar schindler saved t...
4,4,4,12 Angry Men,The defense and the prosecution have rested an...,[18],Drama,defense prosecution rested jury filing ...,defense prosecution rested jury filing ...,defens prosecut rest juri file juri room decid...,defense prosecution rested jury filing jury ro...
...,...,...,...,...,...,...,...,...,...,...
9995,9995,9995,The Last Airbender,"The story follows the adventures of Aang, a yo...","[28, 12, 14]","Action, Adventure, Fantasy",story follows adventures aang young succes...,story follows adventures aang young succes...,stori follow adventur aang young successor lon...,story follows adventure aang young successor l...
9996,9996,9996,From Dusk Till Dawn 2: Texas Blood Money,A bank-robbing gang of misfits heads to Mexico...,"[80, 28, 27, 53]","Crime, Action, Horror, Thriller",bankrobbing gang misfits heads mexico blu...,bankrobbing gang misfits heads mexico blu...,bankrob gang misfit head mexico blueprint perf...,bankrobbing gang misfit head mexico blueprint ...
9997,9997,9997,Cage Dive,Three friends from California are filming an a...,"[27, 18, 53]","Horror, Drama, Thriller",three friends california filming audition t...,three friends california filming audition t...,three friend california film audit tape extrem...,three friend california filming audition tape ...
9998,9998,9998,Street Fighter,Col. Guile and various other martial arts hero...,"[28, 12, 35, 53]","Action, Adventure, Comedy, Thriller",col guile various martial arts heroes fight ...,col guile various martial arts heroes fight ...,col guil variou martial art hero fight tyranni...,col guile various martial art hero fight tyran...


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Unnamed: 0.1                     10000 non-null  int64 
 1   Unnamed: 0                       10000 non-null  int64 
 2   original_title                   10000 non-null  object
 3   overview                         9999 non-null   object
 4   genre_ids                        10000 non-null  object
 5   genre_names                      9997 non-null   object
 6   description_corrected            9999 non-null   object
 7   description_tokenized            9999 non-null   object
 8   description_corrected_stemming   9999 non-null   object
 9   description_corrected_lemmatize  9999 non-null   object
dtypes: int64(2), object(8)
memory usage: 781.4+ KB


## Cleaning Corpus

In [15]:
nlp = spacy.load('en_core_web_sm')


df['description_tokenized'] = df['description_corrected'].astype(str).apply(nlp)

df['description_tokenized']

0       (imprisoned,   , 1940s,   , double, murder,   ...
1       (spanning,  , years, 1945,  , 1955,  , chronic...
2       (  , continuing, saga,   , corleone, crime, fa...
3       ( , true, story,   , businessman, oskar, schin...
4       ( , defense,   , prosecution,  , rested,   , j...
                              ...                        
9995    ( , story, follows,  , adventures,  , aang,  ,...
9996    ( , bankrobbing, gang,  , misfits, heads,  , m...
9997    (three, friends,  , california,  , filming,  ,...
9998    (col, guile,  , various,  , martial, arts, her...
9999    ( , unexpected, pregnancy, takes,  , terrifyin...
Name: description_tokenized, Length: 10000, dtype: object

In [16]:
def cleaned_corpus(doc):
    return [token.text for token in doc if token.text.strip() != '']

df['description_corrected'] = df['description_tokenized'].apply(cleaned_corpus)

df['description_corrected']

0       [imprisoned, 1940s, double, murder, wife, love...
1       [spanning, years, 1945, 1955, chronicle, ficti...
2       [continuing, saga, corleone, crime, family, yo...
3       [true, story, businessman, oskar, schindler, s...
4       [defense, prosecution, rested, jury, filing, j...
                              ...                        
9995    [story, follows, adventures, aang, young, succ...
9996    [bankrobbing, gang, misfits, heads, mexico, bl...
9997    [three, friends, california, filming, audition...
9998    [col, guile, various, martial, arts, heroes, f...
9999    [unexpected, pregnancy, takes, terrifying, tur...
Name: description_corrected, Length: 10000, dtype: object

### How many words are there in the entire corpus?

In [17]:
word_count = df['description_corrected'].apply(lambda x: len(str(x).split())).sum()

word_count

264981

### Total number of unique words (vocabulary)?

In [18]:
unique_word_count = df['description_corrected'].apply(lambda x: len(str(x).split())).unique().sum()

unique_word_count

4421

## Bag of Words

In [19]:
df['description_corpus'] = df['description_corrected'].apply(
    lambda tokens: " ".join(tokens) if isinstance(tokens, list) else ""
)

In [20]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['description_corpus'])

In [21]:
df['description_corpus']

0       imprisoned 1940s double murder wife lover upst...
1       spanning years 1945 1955 chronicle fictional i...
2       continuing saga corleone crime family young vi...
3       true story businessman oskar schindler saved t...
4       defense prosecution rested jury filing jury ro...
                              ...                        
9995    story follows adventures aang young successor ...
9996    bankrobbing gang misfits heads mexico blueprin...
9997    three friends california filming audition tape...
9998    col guile various martial arts heroes fight ty...
9999    unexpected pregnancy takes terrifying turn new...
Name: description_corpus, Length: 10000, dtype: object

In [22]:
# Displaying the Vocabulary and Frequency Counts
print(list(vectorizer.vocabulary_)[:50])

['imprisoned', '1940s', 'double', 'murder', 'wife', 'lover', 'upstanding', 'banker', 'andy', 'dufresne', 'begins', 'new', 'life', 'shawshank', 'prison', 'puts', 'accounting', 'skills', 'work', 'amoral', 'warden', 'long', 'stretch', 'comes', 'admired', 'inmates', 'including', 'older', 'prisoner', 'named', 'red', 'integrity', 'unquenchable', 'sense', 'hope', 'spanning', 'years', '1945', '1955', 'chronicle', 'fictional', 'italianamerican', 'corleone', 'crime', 'family', 'organized', 'patriarch', 'vito', 'barely', 'survives']


In [23]:
print("Feature Names:", vectorizer.get_feature_names_out())

Feature Names: ['00' '006' '007' ... 'éric' 'öztürk' 'ʻohana']


In [24]:
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

bow_df

Unnamed: 0,00,006,007,0s,10,100,1000,10000,100000,10000000,...,ángel,ángela,æon,édouard,émigré,émile,émilie,éric,öztürk,ʻohana
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
import numpy as np
# Analyzing Word Frequencies
word_counts = np.sum(X.toarray(), axis=0)

In [26]:
word_freq = dict(zip(vectorizer.get_feature_names_out(), word_counts))
print("Word Frequencies: ")
word_freq

Word Frequencies: 


{'00': 2,
 '006': 1,
 '007': 5,
 '0s': 1,
 '10': 32,
 '100': 13,
 '1000': 6,
 '10000': 5,
 '100000': 2,
 '10000000': 1,
 '1000foot': 1,
 '1000page': 1,
 '1000th': 1,
 '1001': 1,
 '100acrewood': 1,
 '100foot': 1,
 '100th': 3,
 '101': 2,
 '10191': 1,
 '101yearold': 1,
 '1021': 1,
 '103': 1,
 '108yearold': 1,
 '10day': 1,
 '10million': 1,
 '10round': 1,
 '10s': 1,
 '10th': 4,
 '10year': 3,
 '10yearold': 16,
 '11': 9,
 '111': 3,
 '1114pm': 1,
 '1123': 1,
 '1138': 1,
 '117': 4,
 '118': 2,
 '118th': 1,
 '119': 1,
 '11th': 3,
 '11year': 2,
 '11yearold': 16,
 '11yearolds': 1,
 '12': 26,
 '120': 2,
 '1200': 3,
 '1215': 1,
 '1250': 1,
 '125000': 1,
 '12foot': 1,
 '12hour': 2,
 '12person': 1,
 '12step': 1,
 '12th': 5,
 '12thcentury': 1,
 '12year': 1,
 '12yearold': 23,
 '13': 23,
 '130': 1,
 '130000': 1,
 '1331': 1,
 '1347': 1,
 '13th': 2,
 '13week': 1,
 '13yearold': 14,
 '14': 14,
 '140': 1,
 '1408': 1,
 '1415': 1,
 '142': 1,
 '1429': 1,
 '1492': 2,
 '14th': 4,
 '14thcentury': 1,
 '14year': 1,
 '

## Bag of Bi-gram And Bag of Tri-gram

In [27]:
cv = CountVectorizer(ngram_range=(2,3))

X = cv.fit_transform(df['description_corpus'])

In [28]:
# vocab
print(list(cv.vocabulary_.items())[:20])

[('imprisoned 1940s', 196850), ('1940s double', 1242), ('double murder', 109766), ('murder wife', 264237), ('wife lover', 433097), ('lover upstanding', 238113), ('upstanding banker', 417521), ('banker andy', 32110), ('andy dufresne', 17741), ('dufresne begins', 113126), ('begins new', 37891), ('new life', 272842), ('life shawshank', 228410), ('shawshank prison', 352043), ('prison puts', 308166), ('puts accounting', 313626), ('accounting skills', 5729), ('skills work', 358262), ('work amoral', 438177), ('amoral warden', 16896)]


In [29]:
# vocab
print(list(cv.vocabulary_.items())[-20:])

[('guile various martial', 174840), ('various martial arts', 419786), ('martial arts heroes', 247267), ('arts heroes fight', 24255), ('heroes fight tyranny', 183564), ('fight tyranny dictator', 142397), ('tyranny dictator bison', 411503), ('dictator bison cohorts', 103169), ('unexpected pregnancy', 413905), ('pregnancy takes', 305683), ('turn newlyweds', 408431), ('newlyweds zach', 274057), ('zach samantha', 447051), ('samantha mccall', 338390), ('unexpected pregnancy takes', 413906), ('pregnancy takes terrifying', 305684), ('terrifying turn newlyweds', 391379), ('turn newlyweds zach', 408432), ('newlyweds zach samantha', 274058), ('zach samantha mccall', 447052)]


In [30]:
len(cv.vocabulary_.items())

447685

## TF-IDF

In [31]:
tfidf = TfidfVectorizer()
tfidf.fit_transform(df['description_corpus']).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
print(tfidf.idf_) # idf is fixed for each doc

[9.11182808 9.51729319 8.4186809  ... 9.51729319 9.51729319 9.11182808]


In [33]:
print(tfidf.get_feature_names_out())

['00' '006' '007' ... 'éric' 'öztürk' 'ʻohana']


In [34]:
# vocab
print(list(tfidf.vocabulary_.items())[:20])

[('imprisoned', 13628), ('1940s', 205), ('double', 8137), ('murder', 18274), ('wife', 29806), ('lover', 16278), ('upstanding', 28796), ('banker', 2540), ('andy', 1492), ('dufresne', 8393), ('begins', 2867), ('new', 18691), ('life', 15870), ('shawshank', 24521), ('prison', 21276), ('puts', 21662), ('accounting', 722), ('skills', 24989), ('work', 30042), ('amoral', 1414)]
