In [126]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
df = pd.read_csv("imdb_tvshows.csv")

In [7]:
df.head()

Unnamed: 0,Title,About,EpisodeDuration(in Minutes),Genres,Actors,Rating,Votes,Years
0,The Family Man,A working man from the National Investigation ...,42.0,"Action, Comedy, Drama","Manoj Bajpayee, Samantha Akkineni, Priyamani, ...",8.8,57632,2019–
1,Lucifer,Lucifer Morningstar has decided he's had enoug...,45.0,"Crime, Drama, Fantasy","Tom Ellis, Lauren German, Lesley-Ann Brandt, K...",8.1,252826,2016–
2,The Handmaid's Tale,"Set in a dystopian future, a woman is forced t...",42.0,"Drama, Sci-Fi, Thriller","Elisabeth Moss, Yvonne Strahovski, Joseph Fien...",8.4,187007,2017–
3,StartUp,"A desperate banker, a Haitian-American gang lo...",60.0,"Crime, Thriller","Adam Brody, Edi Gathegi, Otmara Marrero, Krist...",8.0,18165,2016–2018
4,Game of Thrones,Nine noble families fight for control over the...,44.0,"Action, Adventure, Drama","Emilia Clarke, Peter Dinklage, Kit Harington, ...",9.3,1823966,2011–2019


In [8]:
print(df.columns)

Index(['Title', 'About', 'EpisodeDuration(in Minutes)', 'Genres', 'Actors',
       'Rating', 'Votes', 'Years'],
      dtype='object')


In [155]:
#We only want to keep any featurs that we can use to build tags
#Keep Title, About, Genres, Actors
#Drop numerical feaures - so episode duration, rating, votes, and years
df_new = df.drop(columns = ['EpisodeDuration(in Minutes)', 'Votes', 'Years'])

In [156]:
df_new

Unnamed: 0,Title,About,Genres,Actors,Rating
0,The Family Man,A working man from the National Investigation ...,"Action, Comedy, Drama","Manoj Bajpayee, Samantha Akkineni, Priyamani, ...",8.8
1,Lucifer,Lucifer Morningstar has decided he's had enoug...,"Crime, Drama, Fantasy","Tom Ellis, Lauren German, Lesley-Ann Brandt, K...",8.1
2,The Handmaid's Tale,"Set in a dystopian future, a woman is forced t...","Drama, Sci-Fi, Thriller","Elisabeth Moss, Yvonne Strahovski, Joseph Fien...",8.4
3,StartUp,"A desperate banker, a Haitian-American gang lo...","Crime, Thriller","Adam Brody, Edi Gathegi, Otmara Marrero, Krist...",8.0
4,Game of Thrones,Nine noble families fight for control over the...,"Action, Adventure, Drama","Emilia Clarke, Peter Dinklage, Kit Harington, ...",9.3
...,...,...,...,...,...
2995,Panic,A successful defense lawyer at a boutique firm...,"Comedy, Drama","Christopher Cox, Olivia DiNino, Benjamin Hsieh...",9.3
2996,Ryan Hansen Solves Crimes on Television,A comedy centered around six 30-something frie...,"Action, Comedy, Crime","Ryan Hansen, Aly Michalka, Samira Wiley, Noell...",7.2
2997,Doubt,Romantic anthology web series revolving around...,Drama,"Katherine Heigl, Dulé Hill, Laverne Cox, Dream...",5.6
2998,Friends with Better Lives,,Comedy,"James Van Der Beek, Majandra Delfino, Zoe List...",7.0


In [144]:
#checking for empty cells per feature
df_new.isnull().sum()

Title      0
About     14
Genres     0
Actors     6
Rating     0
dtype: int64

In [145]:
#Since the number of empty cells is so low I shall drop these rows
df_new.dropna(inplace = True)

In [146]:
#Checking for duplicate rows
df_new.duplicated().sum()

0

In [147]:
df_new = df_new.reset_index().drop(columns = ['index'])

In [148]:
df_new

Unnamed: 0,Title,About,Genres,Actors,Rating
0,The Family Man,A working man from the National Investigation ...,"Action, Comedy, Drama","Manoj Bajpayee, Samantha Akkineni, Priyamani, ...",8.8
1,Lucifer,Lucifer Morningstar has decided he's had enoug...,"Crime, Drama, Fantasy","Tom Ellis, Lauren German, Lesley-Ann Brandt, K...",8.1
2,The Handmaid's Tale,"Set in a dystopian future, a woman is forced t...","Drama, Sci-Fi, Thriller","Elisabeth Moss, Yvonne Strahovski, Joseph Fien...",8.4
3,StartUp,"A desperate banker, a Haitian-American gang lo...","Crime, Thriller","Adam Brody, Edi Gathegi, Otmara Marrero, Krist...",8.0
4,Game of Thrones,Nine noble families fight for control over the...,"Action, Adventure, Drama","Emilia Clarke, Peter Dinklage, Kit Harington, ...",9.3
...,...,...,...,...,...
2975,The Beautiful Life: TBL,"Meet Sam, a lonely teenager struggling through...",Drama,"Mischa Barton, Benjamin Hollingsworth, Sara Pa...",5.7
2976,Star Trek: New Voyages,In a world where Hollywood actors can partner ...,"Action, Adventure, Sci-Fi","Charles Root, John M. Kelley, James Cawley, Je...",6.7
2977,Panic,A successful defense lawyer at a boutique firm...,"Comedy, Drama","Christopher Cox, Olivia DiNino, Benjamin Hsieh...",9.3
2978,Ryan Hansen Solves Crimes on Television,A comedy centered around six 30-something frie...,"Action, Comedy, Crime","Ryan Hansen, Aly Michalka, Samira Wiley, Noell...",7.2


## Creating tags

In [149]:
df_new['About'] = df_new['About'].apply(lambda x:x.split()).apply(lambda x: [i.replace(",", "") for i in x])
df_new['Genres'] = df_new['Genres'].apply(lambda x:x.split()).apply(lambda x: [i.replace(",", "") for i in x])
df_new['Actors'] = df_new['Actors'].apply(lambda x:x.split(",")).apply(lambda x: [i.replace(",", "") for i in x])

In [96]:
df_new

Unnamed: 0,Title,About,Genres,Actors
0,The Family Man,"[A, working, man, from, the, National, Investi...","[Action, Comedy, Drama]","[Manoj Bajpayee, Samantha Akkineni, Priyaman..."
1,Lucifer,"[Lucifer, Morningstar, has, decided, he's, had...","[Crime, Drama, Fantasy]","[Tom Ellis, Lauren German, Lesley-Ann Brandt..."
2,The Handmaid's Tale,"[Set, in, a, dystopian, future, a, woman, is, ...","[Drama, Sci-Fi, Thriller]","[Elisabeth Moss, Yvonne Strahovski, Joseph F..."
3,StartUp,"[A, desperate, banker, a, Haitian-American, ga...","[Crime, Thriller]","[Adam Brody, Edi Gathegi, Otmara Marrero, K..."
4,Game of Thrones,"[Nine, noble, families, fight, for, control, o...","[Action, Adventure, Drama]","[Emilia Clarke, Peter Dinklage, Kit Haringto..."
...,...,...,...,...
2975,The Beautiful Life: TBL,"[Meet, Sam, a, lonely, teenager, struggling, t...",[Drama],"[Mischa Barton, Benjamin Hollingsworth, Sara..."
2976,Star Trek: New Voyages,"[In, a, world, where, Hollywood, actors, can, ...","[Action, Adventure, Sci-Fi]","[Charles Root, John M. Kelley, James Cawley,..."
2977,Panic,"[A, successful, defense, lawyer, at, a, boutiq...","[Comedy, Drama]","[Christopher Cox, Olivia DiNino, Benjamin Hs..."
2978,Ryan Hansen Solves Crimes on Television,"[A, comedy, centered, around, six, 30-somethin...","[Action, Comedy, Crime]","[Ryan Hansen, Aly Michalka, Samira Wiley, N..."


In [97]:
df_new['About'] = df_new['About'].apply(lambda x:[i.replace(" ", "") for i in x])
df_new['Genres'] = df_new['Genres'].apply(lambda x:[i.replace(" ", "") for i in x])
df_new['Actors'] = df_new['Actors'].apply(lambda x:[i.replace(" ", "") for i in x])

In [98]:
df_new

Unnamed: 0,Title,About,Genres,Actors
0,The Family Man,"[A, working, man, from, the, National, Investi...","[Action, Comedy, Drama]","[ManojBajpayee, SamanthaAkkineni, Priyamani, S..."
1,Lucifer,"[Lucifer, Morningstar, has, decided, he's, had...","[Crime, Drama, Fantasy]","[TomEllis, LaurenGerman, Lesley-AnnBrandt, Kev..."
2,The Handmaid's Tale,"[Set, in, a, dystopian, future, a, woman, is, ...","[Drama, Sci-Fi, Thriller]","[ElisabethMoss, YvonneStrahovski, JosephFienne..."
3,StartUp,"[A, desperate, banker, a, Haitian-American, ga...","[Crime, Thriller]","[AdamBrody, EdiGathegi, OtmaraMarrero, Kristen..."
4,Game of Thrones,"[Nine, noble, families, fight, for, control, o...","[Action, Adventure, Drama]","[EmiliaClarke, PeterDinklage, KitHarington, Le..."
...,...,...,...,...
2975,The Beautiful Life: TBL,"[Meet, Sam, a, lonely, teenager, struggling, t...",[Drama],"[MischaBarton, BenjaminHollingsworth, SaraPaxt..."
2976,Star Trek: New Voyages,"[In, a, world, where, Hollywood, actors, can, ...","[Action, Adventure, Sci-Fi]","[CharlesRoot, JohnM.Kelley, JamesCawley, JeffM..."
2977,Panic,"[A, successful, defense, lawyer, at, a, boutiq...","[Comedy, Drama]","[ChristopherCox, OliviaDiNino, BenjaminHsieh, ..."
2978,Ryan Hansen Solves Crimes on Television,"[A, comedy, centered, around, six, 30-somethin...","[Action, Comedy, Crime]","[RyanHansen, AlyMichalka, SamiraWiley, NoelleE..."


In [99]:
df_new['tags'] = df_new['About'] + df_new['Genres'] + df_new['Actors']

In [100]:
df_new

Unnamed: 0,Title,About,Genres,Actors,tags
0,The Family Man,"[A, working, man, from, the, National, Investi...","[Action, Comedy, Drama]","[ManojBajpayee, SamanthaAkkineni, Priyamani, S...","[A, working, man, from, the, National, Investi..."
1,Lucifer,"[Lucifer, Morningstar, has, decided, he's, had...","[Crime, Drama, Fantasy]","[TomEllis, LaurenGerman, Lesley-AnnBrandt, Kev...","[Lucifer, Morningstar, has, decided, he's, had..."
2,The Handmaid's Tale,"[Set, in, a, dystopian, future, a, woman, is, ...","[Drama, Sci-Fi, Thriller]","[ElisabethMoss, YvonneStrahovski, JosephFienne...","[Set, in, a, dystopian, future, a, woman, is, ..."
3,StartUp,"[A, desperate, banker, a, Haitian-American, ga...","[Crime, Thriller]","[AdamBrody, EdiGathegi, OtmaraMarrero, Kristen...","[A, desperate, banker, a, Haitian-American, ga..."
4,Game of Thrones,"[Nine, noble, families, fight, for, control, o...","[Action, Adventure, Drama]","[EmiliaClarke, PeterDinklage, KitHarington, Le...","[Nine, noble, families, fight, for, control, o..."
...,...,...,...,...,...
2975,The Beautiful Life: TBL,"[Meet, Sam, a, lonely, teenager, struggling, t...",[Drama],"[MischaBarton, BenjaminHollingsworth, SaraPaxt...","[Meet, Sam, a, lonely, teenager, struggling, t..."
2976,Star Trek: New Voyages,"[In, a, world, where, Hollywood, actors, can, ...","[Action, Adventure, Sci-Fi]","[CharlesRoot, JohnM.Kelley, JamesCawley, JeffM...","[In, a, world, where, Hollywood, actors, can, ..."
2977,Panic,"[A, successful, defense, lawyer, at, a, boutiq...","[Comedy, Drama]","[ChristopherCox, OliviaDiNino, BenjaminHsieh, ...","[A, successful, defense, lawyer, at, a, boutiq..."
2978,Ryan Hansen Solves Crimes on Television,"[A, comedy, centered, around, six, 30-somethin...","[Action, Comedy, Crime]","[RyanHansen, AlyMichalka, SamiraWiley, NoelleE...","[A, comedy, centered, around, six, 30-somethin..."


In [101]:
df_final = df_new[['Title', 'tags']]

In [157]:
df_final['Rating'] = df_new['Rating']
df_final['About'] = df_new['About']
df_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Rating'] = df_new['Rating']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['About'] = df_new['About']


Unnamed: 0,Title,tags,Rating,About
0,The Family Man,a work man from the nation investig agenc tri ...,8.8,A working man from the National Investigation ...
1,Lucifer,lucif morningstar ha decid he' had enough of b...,8.1,Lucifer Morningstar has decided he's had enoug...
2,The Handmaid's Tale,set in a dystopian futur a woman is forc to li...,8.4,"Set in a dystopian future, a woman is forced t..."
3,StartUp,a desper banker a haitian-american gang lord a...,8.0,"A desperate banker, a Haitian-American gang lo..."
4,Game of Thrones,nine nobl famili fight for control over the la...,9.3,Nine noble families fight for control over the...
...,...,...,...,...
2975,The Beautiful Life: TBL,meet sam a lone teenag struggl through colleg ...,6.4,Ayvalik is brought into play by the mob from a...
2976,Star Trek: New Voyages,in a world where hollywood actor can partner u...,8.0,"The early life of Bobby McCallister, a progres..."
2977,Panic,a success defens lawyer at a boutiqu firm beco...,6.4,"A relationship-advice guru, upon learning that..."
2978,Ryan Hansen Solves Crimes on Television,a comedi center around six 30-someth friend wh...,7.9,"When a young couple inherit a farm, they are d..."


In [105]:
df_final['tags'] = df_final['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['tags'] = df_final['tags'].apply(lambda x: " ".join(x))


In [109]:
df_final['tags'] = df_final['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['tags'] = df_final['tags'].apply(lambda x: x.lower())


In [110]:
df_final.head()

Unnamed: 0,Title,tags
0,The Family Man,a working man from the national investigation ...
1,Lucifer,lucifer morningstar has decided he's had enoug...
2,The Handmaid's Tale,set in a dystopian future a woman is forced to...
3,StartUp,a desperate banker a haitian-american gang lor...
4,Game of Thrones,nine noble families fight for control over the...


## Using Vectorizer and Cosine similarity to find similairity between tv shows

In [123]:
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [116]:
#vectorizing our tags
vectors = cv.fit_transform(df_final['tags']).toarray()

In [117]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [118]:
#getting our most frequently used words
cv.get_feature_names()

['10',
 '100',
 '11',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '16',
 '16th',
 '17',
 '17th',
 '18',
 '1800s',
 '1871',
 '19',
 '1900s',
 '1905',
 '1920s',
 '1940s',
 '1943',
 '1945',
 '1950s',
 '1960s',
 '1969',
 '1970s',
 '1980',
 '1980s',
 '1984',
 '1990',
 '1990s',
 '1996',
 '19th',
 '20',
 '2001',
 '2015',
 '2016',
 '20s',
 '20th',
 '21',
 '21st',
 '24',
 '25',
 '26',
 '28',
 '30',
 '300',
 '30s',
 '40',
 '40s',
 '50',
 '60',
 '800',
 '80s',
 '8th',
 '90',
 '90s',
 'aaron',
 'aarondismuke',
 'aaronpaul',
 'aaronpedersen',
 'aasifmandvi',
 'abandoned',
 'abbiecornish',
 'abby',
 'abbytrott',
 'abc',
 'abiding',
 'abigailspencer',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abolitionist',
 'abroad',
 'absence',
 'absurd',
 'abuse',
 'academy',
 'accept',
 'accepts',
 'accident',
 'accidentally',
 'acclaimed',
 'accompanied',
 'account',
 'accountant',
 'accounts',
 'accusations',
 'accused',
 'acero',
 'achieve',
 'acosta',
 'acquired',
 'acre',
 'act',
 'acting',
 'action',
 'acti

In [121]:
#applying stemming to get rid of words that are similar to one another and replace them with the same names
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [122]:
df_final['tags'] = df_final['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['tags'] = df_final['tags'].apply(stem)


In [124]:
#vectorizing our tags
vectors = cv.fit_transform(df_final['tags']).toarray()

In [125]:
#getting our most frequently used words
cv.get_feature_names()

['10',
 '100',
 '10000',
 '11',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '16',
 '16th',
 '17',
 '17th',
 '18',
 '1800',
 '1871',
 '19',
 '1905',
 '1920',
 '1930',
 '1940s',
 '1943',
 '1945',
 '1950',
 '1950s',
 '1960',
 '1960s',
 '1969',
 '1970',
 '1970s',
 '1980',
 '1980s',
 '1984',
 '1990',
 '1990s',
 '1996',
 '19th',
 '20',
 '2000',
 '2001',
 '2005',
 '2016',
 '20th',
 '21st',
 '23',
 '24',
 '25',
 '26',
 '28',
 '30',
 '300',
 '40',
 '50',
 '60',
 '80',
 '800',
 '8th',
 '90',
 'aaron',
 'aarondismuk',
 'aaronpaul',
 'aaronpedersen',
 'aaronstanford',
 'aasifmandvi',
 'abandon',
 'abbi',
 'abbiecornish',
 'abbijacobson',
 'abbydonnelli',
 'abbytrott',
 'abc',
 'abduct',
 'abigailspenc',
 'abil',
 'abilities',
 'abl',
 'aboard',
 'abolitionist',
 'abov',
 'absenc',
 'absorb',
 'absurd',
 'abus',
 'academi',
 'academy',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'account',
 'accus',
 'acero',
 'achiev',
 'acquir',
 'act',
 'action',
 'activ',
 'activities',
 '

In [130]:
#using cosine similarity as a way to evaluate similarity and rank similar tv shows
similarity = cosine_similarity(vectors)

## Recommending top 5 shows in similairty

In [151]:
def recommend(show):
    show_idx = df_final[df_final['Title'] == show].index[0]
    distances = similarity[show_idx]
    shows_list  = sorted(list(enumerate(distances)), reverse = True, key = lambda x:x[1])[1:6]
    for i in shows_list:
        print(df_final['Title'][i[0]])

In [152]:
recommend('Nature Cat')

Bravest Warriors
Black Lagoon
PJ Masks
DuckTales
Mar de plástico


In [158]:
import pickle
pickle.dump(df_final, open('shows.pkl', 'wb'))

In [141]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))