In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk

### Merging all the datasets

In [14]:
# Read the six per-year tag files in chronological order, then the base dataset.
# The frames keep their original names (df1..df7) because later cells preview
# and concatenate them by name.
yearly_csv_paths = (
    "preprocessing/data_2018.csv",
    "preprocessing/data_2019.csv",
    "preprocessing/data_2020.csv",
    "preprocessing/data_2021.csv",
    "preprocessing/data_2022.csv",
    "preprocessing/data_2023.csv",
)
df1, df2, df3, df4, df5, df6 = map(pd.read_csv, yearly_csv_paths)
df7 = pd.read_csv("preprocessing/data.csv")

In [4]:
df1.head()

Unnamed: 0,movie_id,title,tags
0,406563,Insidious: The Last Key,Parapsychologist Elise Rainier and her team tr...
1,426258,The Strange Ones,Mysterious events surround the travels of two ...
2,468210,Sweet Country,"In 1929, an Australian Aboriginal stockman kil..."
3,399035,The Commuter,"A businessman, on his daily commute home, gets..."
4,442064,Proud Mary,Mary is a hit woman working for an organized c...


In [5]:
df2.head()

Unnamed: 0,movie_id,title,tags
0,522681,Escape Room,Six strangers find themselves in circumstances...
1,561362,Rust Creek,When an overachieving college senior makes a w...
2,567738,American Hangman,An unidentified man posts a live feed on socia...
3,508763,A Dog's Way Home,"The adventure of Bella, a dog who embarks on a..."
4,440472,The Upside,Phillip is a wealthy quadriplegic who needs a ...


In [6]:
df3.head()

Unnamed: 0,movie_id,title,tags
0,443791,Underwater,After an earthquake destroys their underwater ...
1,526019,Like a Boss,Two female friends with very different ideals ...
2,527534,The Murder of Nicole Brown Simpson,In 1994 Nicole Brown Simpson and her friend Ro...
3,662844,Angels Fallen,After the tragic loss of his wife battling the...
4,38700,Bad Boys for Life,Marcus and Mike are forced to confront new thr...


In [7]:
df4.head()

Unnamed: 0,movie_id,title,tags
0,628534,The White Tiger,An ambitious Indian driver uses his wit and cu...
1,741228,Locked Down,"During a COVID-19 lockdown, sparring couple Li..."
2,532865,The Dig,"As WWII looms, a wealthy widow hires an amateu..."
3,775996,Outside the Wire,"In the near future, a drone pilot is sent into..."
4,634528,The Marksman,Jim Hanson’s quiet life is suddenly disturbed ...


In [15]:
df7.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [16]:
# Stack the base dataset and the yearly files into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# source frame keeps its own 0-based labels, so labels repeat across the merged
# frame and a label read from `data.index` no longer identifies one row -- nor
# the matching row of any positional array (count vectors, similarity matrix)
# derived from `data`.
data = pd.concat([df7, df1, df2, df3, df4, df5, df6], ignore_index=True)

In [17]:
data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [18]:
len(data)

7131

In [19]:
# Persist the merged frame to disk; index=False keeps the row index out of the CSV.
data.to_csv("main_data.csv", index = False)

In [23]:
data['tags'][ : 4]

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
Name: tags, dtype: object

### Converting all the tags to lower case

In [24]:
# Lower-case every tag string with the vectorised string accessor rather than a
# per-row Python lambda. Unlike `x.lower()` inside `apply`, `.str.lower()` does
# not raise on a missing value: a NaN tag simply stays NaN.
data['tags'] = data['tags'].str.lower()

In [25]:
data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


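### Performing stemming on the dataset

- Reduce inflected words to a common stem, e.g. running, runs -> run (the Porter stemmer only strips suffixes, so an irregular form such as "ran" is left unchanged).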

In [26]:
from nltk.stem.porter import PorterStemmer

In [27]:
# Single shared Porter stemmer instance; `stem()` below calls `ps.stem` on each word.
ps = PorterStemmer()

In [28]:
def stem(text):
    """Return `text` with each whitespace-separated token replaced by its stem.

    Tokens are produced by `str.split()` with no arguments, so runs of
    whitespace collapse and punctuation stays attached to its word. The
    stemmed tokens are re-joined with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [29]:
# Replace each tag string with its stemmed form, one row at a time.
data['tags'] = data['tags'].map(stem)

In [31]:
data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


### Finding similarity between movies using "Bag of Words" vectors and cosine similarity

In [32]:
# Bag-of-words vectoriser: the vocabulary is capped at 5000 terms and uses
# scikit-learn's built-in English stop-word list.
# NOTE(review): the tags are stemmed before this runs, so stemmed forms of stop
# words (e.g. "has" -> "ha") may no longer match the stop list -- confirm
# whether that is intended.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [38]:
# Learn the vocabulary from the stemmed tags and count term occurrences per
# movie, then expand the result into a dense array (one row per row of `data`).
tag_counts = cv.fit_transform(data['tags'])
vectors = tag_counts.toarray()

In [41]:
vectors.shape

(7131, 5000)

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
# Pairwise cosine similarity between every pair of rows in `vectors`;
# row/column i of the result corresponds to the i-th row of `vectors`.
similarity = cosine_similarity(vectors)

In [53]:
def recommend(movie, top_n=5):
    """Print the titles of the `top_n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``data['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        How many recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``data`` has the given title.

    Notes
    -----
    Reads the notebook-level ``data`` frame and ``similarity`` matrix, whose
    rows are aligned by position (row i of ``similarity`` belongs to the i-th
    row of ``data``).
    """
    # Look the title up by row *position*, not by index label: `similarity` is a
    # plain array, and the labels of `data` are not guaranteed to be 0..n-1.
    matches = np.flatnonzero((data['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in data['title']")
    movie_index = int(matches[0])

    distances = similarity[movie_index]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])

    # Drop the query movie explicitly instead of assuming it sorts first; another
    # row with an identical tag vector can tie with it at the top.
    movies_list = [pair for pair in ranked if pair[0] != movie_index][:top_n]

    titles = data['title']
    for position, _score in movies_list:
        print(titles.iloc[position])

In [54]:
recommend("Avatar")

Aliens vs Predator: Requiem
Independence Day
Falcon Rising
Titan A.E.
Battle: Los Angeles


In [56]:
recommend("Wrong Turn")

The Turning
Book of Shadows: Blair Witch 2
Brahms: The Boy II
The Frozen
Devil


In [57]:
# Write the full pairwise similarity matrix to disk in NumPy's .npy format.
np.save('cosine_similarity.npy', similarity)