In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
import nltk
from rake_nltk  import Rake


In [5]:
# Load the TV-show table from a CSV located in the current working directory.
df = pd.read_csv("tvs_clean_df.csv")

In [6]:
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,overview,number_of_episodes,number_of_seasons,vote_average,vote_count,popularity
0,0,1,Pride,The theme is strength and gallantry.Haru Saton...,11.0,1,8.192,13,6.77
1,1,2,Clerks,The continuing adventures of store clerks Dant...,6.0,1,6.897,78,10.62
2,2,3,The Message,The Message was a surreal comedy series which ...,6.0,1,8.5,2,0.707
3,3,4,The Amazing Mrs Pritchard,Supermarket manager Ros Pritchard decides to s...,6.0,1,7.3,3,3.452
4,4,5,La Job,La Job is a French Canadian comedy television ...,12.0,1,0.72,166,5.942


In [7]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85422 entries, 0 to 85421
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          85422 non-null  int64  
 1   id                  85422 non-null  int64  
 2   name                85422 non-null  object 
 3   overview            85422 non-null  object 
 4   number_of_episodes  85422 non-null  float64
 5   number_of_seasons   85422 non-null  int64  
 6   vote_average        85422 non-null  float64
 7   vote_count          85422 non-null  int64  
 8   popularity          85422 non-null  float64
dtypes: float64(3), int64(4), object(2)
memory usage: 5.9+ MB


In [8]:
# Download the NLTK "stopwords" and "punkt" resources; nothing is re-fetched
# when the packages are already up to date.
# NOTE(review): presumably these are what rake_nltk's Rake relies on for
# stop-word removal and tokenisation — confirm against the rake-nltk docs.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rezag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rezag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Build a space-separated bag of keywords for every show from its overview.
# A single Rake instance is reused for all rows: constructing it is
# loop-invariant work, and every extract_keywords_from_text() call rebuilds
# the word-degree table from scratch, so no state carries over between rows.
r = Rake()


def _overview_to_keywords(overview):
    """Return the RAKE content words of ``overview`` joined by single spaces."""
    r.extract_keywords_from_text(str(overview))
    # Join the words directly instead of round-tripping through str(list):
    # the repr-and-replace approach strips apostrophes inside tokens and
    # leaves stray double quotes around any token that contains one.
    return " ".join(r.get_word_degrees())


df['keywords'] = df['overview'].map(_overview_to_keywords)

# Drop the "Unnamed: 0" column and the raw overview text now that the
# keywords column replaces it.
df = df.drop(columns=["Unnamed: 0", "overview"])

In [10]:
# Preview the frame after the keyword-extraction step.
df.head()

Unnamed: 0,id,name,number_of_episodes,number_of_seasons,vote_average,vote_count,popularity,keywords
0,1,Pride,11.0,1,8.192,13,6.77,theme strength gallantry haru satonaka captain...
1,2,Clerks,6.0,1,6.897,78,10.62,continuing adventures store clerks dante randa...
2,3,The Message,6.0,1,8.5,2,0.707,message surreal comedy series spoofs current p...
3,4,The Amazing Mrs Pritchard,6.0,1,7.3,3,3.452,supermarket manager ros pritchard decides stan...
4,5,La Job,12.0,1,0.72,166,5.942,la job french canadian comedy television serie...


In [11]:
# This block needs quite a lot of memory
count = CountVectorizer()
count_matrix = count.fit_transform(df["keywords"])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.04279605, 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.05976143, 0.        ,
        0.        ],
       ...,
       [0.04279605, 0.        , 0.05976143, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [12]:
# Show names keyed by row label; used to translate a title into a row of
# the similarity matrix.
indices = pd.Series(df["name"])
indices.iloc[:20]

0                                 Pride
1                                Clerks
2                           The Message
3             The Amazing Mrs Pritchard
4                                La Job
5     Strange Days at Blake Holsey High
6                                  Bugs
7                               Te Kaea
8                            Match Game
9                     All in Good Faith
10           Strictly Sex with Dr. Drew
11                         Han Wu Da Di
12           How do you like Wednesday?
13                      Shuriken School
14          Mister Rogers' Neighborhood
15                Sidewalks: Video Nite
16                Mighty Truck of Stuff
17                          W*A*L*T*E*R
18                   Planet of the Apes
19                             Wildboyz
Name: name, dtype: object

In [15]:
def recommendations(Title, cosine_sim = cosine_sim, top_n = 10):
    """Return the names of the shows most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Exact show name to look up in ``indices``. When several rows share
        the name, the first one is used.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix. Row/column ``k`` is expected to describe
        the ``k``-th row of ``df``. Defaults to the matrix that existed when
        this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list
        Up to ``top_n`` show names, most similar first. Ties keep dataset
        order.

    Raises
    ------
    IndexError
        If no show is named ``Title``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Work with row positions rather than index labels, because cosine_sim
    # is addressed by position.
    matches = np.flatnonzero((indices == Title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no show named {Title!r} in the dataset")
    idx = matches[0]

    # Similarity of the queried show to every show, keyed by row position.
    score_series = pd.Series(cosine_sim[idx])

    # Remove the queried show explicitly instead of assuming it sorts first:
    # another show with an identical keyword set ties at 1.0, and a show with
    # no keywords has similarity 0 to itself, so skipping "the first entry"
    # could drop a genuine match and recommend the show to itself.
    ranked = score_series.drop(idx).sort_values(ascending = False, kind = "stable")

    top_positions = list(ranked.index[:top_n])
    return df['name'].iloc[top_positions].tolist()

In [16]:
# Example query: shows whose overview keywords are closest to this title.
recommendations('Planet of the Apes')

['Chen Gong',
 'A Voyage to the Future',
 'Return To The Planet Of The Apes',
 'Once a Thief',
 'Untitled Alien Series',
 'Transformers: Go!',
 '3 Body Problem',
 'Harper Valley P.T.A.',
 'Transistor Love Story',
 'Jan from the Other Star']

In [17]:
# Second example query. The lookup uses the first row whose name matches exactly.
recommendations('Squid Game')

['The Last Shot',
 'Strange Hill High',
 '화이팅',
 'Phoenix Drop High',
 'WHAT / IF',
 'Thank You, 5',
 'Top Chef Amateurs',
 'Blockbusters',
 'Barmageddon',
 'Bride vs. Bride']