# Netflix Movie Data



This dataset contains more than 8,500 Netflix movies and TV shows, including cast members, duration, and genre. It contains titles added as recently as late September 2021.

In [1]:
import pandas as pd 
# Load the raw catalogue; the first CSV column is used as the row index.
netflix_data = pd.read_csv('data/netflix_dataset.csv', index_col=0)
# Preview the first five rows.
netflix_data.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


[Source of dataset](https://www.kaggle.com/shivamb/netflix-shows).

In [2]:
# Column labels available in the loaded frame.
netflix_data.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [2]:
# (number of rows, number of columns) of the loaded frame.
netflix_data.shape

(8807, 11)

In [4]:
# Show the third description in full. The index holds string labels, so the
# row is selected explicitly by position with .iloc rather than relying on
# the integer-key fallback of Series[...] (deprecated in recent pandas).
netflix_data.description.iloc[2]

'To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.'

In [2]:
# Keep only the two text columns used for the description-based model, as an
# independent copy so that columns added later leave `netflix_data` untouched.
netflix_data_copy = netflix_data[["title", "description"]].copy()
netflix_data_copy.head()

Unnamed: 0_level_0,title,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s1,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
s2,Blood & Water,"After crossing paths at a party, a Cape Town t..."
s3,Ganglands,To protect his family from a powerful drug lor...
s4,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo..."
s5,Kota Factory,In a city of coaching centers known to train I...


In [3]:
import re

def cleaning(s):
    """Normalise raw text before tokenisation.

    The argument is coerced to ``str`` and lower-cased, then every character
    that is not an ASCII letter is replaced by one space. Replacement is
    one-for-one, so runs of spaces are not collapsed.

    Returns the normalised string.
    """
    lowered = str(s).lower()
    return re.sub('[^a-zA-Z]', ' ', lowered)

In [4]:
# Add a normalised version of each description as a new `clean_desc` column.
netflix_data_copy['clean_desc'] = netflix_data_copy['description'].apply(cleaning)

In [5]:
# Compare the raw and normalised descriptions side by side.
netflix_data_copy.head()

Unnamed: 0_level_0,title,description,clean_desc
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
s1,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",as her father nears the end of his life filmm...
s2,Blood & Water,"After crossing paths at a party, a Cape Town t...",after crossing paths at a party a cape town t...
s3,Ganglands,To protect his family from a powerful drug lor...,to protect his family from a powerful drug lor...
s4,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",feuds flirtations and toilet talk go down amo...
s5,Kota Factory,In a city of coaching centers known to train I...,in a city of coaching centers known to train i...


In [6]:
import nltk

In [8]:
# Fetch the stop-word lists read later through `stopwords.words('english')`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Fetch the Punkt tokenizer data needed by `word_tokenize`.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [8]:
# Build the stop-word set once, outside the lambda, so it is not recreated
# for every token of every description.
english_stopwords = set(stopwords.words('english'))

# Split each normalised description into a list of word tokens ...
netflix_data_copy['clean_desc'] = netflix_data_copy['clean_desc'].apply(word_tokenize)
# ... and keep only the tokens that are not English stop words.
netflix_data_copy['clean_desc'] = netflix_data_copy['clean_desc'].apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)

In [9]:
# Glue each token list back into one space-separated string.
netflix_data_copy['clean_desc'] = netflix_data_copy['clean_desc'].apply(' '.join)

In [10]:
# Inspect the descriptions after stop-word removal.
netflix_data_copy.head()

Unnamed: 0_level_0,title,description,clean_desc
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
s1,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",father nears end life filmmaker kirsten johnso...
s2,Blood & Water,"After crossing paths at a party, a Cape Town t...",crossing paths party cape town teen sets prove...
s3,Ganglands,To protect his family from a powerful drug lor...,protect family powerful drug lord skilled thie...
s4,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",feuds flirtations toilet talk go among incarce...
s5,Kota Factory,In a city of coaching centers known to train I...,city coaching centers known train india finest...


In [11]:
# Show the third cleaned description in full. The index holds string labels,
# so the row is selected explicitly by position with .iloc rather than
# relying on the integer-key fallback of Series[...].
netflix_data_copy.clean_desc.iloc[2]

'protect family powerful drug lord skilled thief mehdi expert team robbers pulled violent deadly turf war'

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Drop terms found in fewer than 2 descriptions or in more than 70% of them.
tfidf = TfidfVectorizer(min_df = 2, max_df = 0.7)

# Learn the vocabulary and build the document-term TF-IDF matrix.
X = tfidf.fit_transform(netflix_data_copy['clean_desc'])

# print(tfidf.get_feature_names())

In [17]:
import pandas as pd

In [14]:
# `get_feature_names()` is gone from current scikit-learn; prefer the
# replacement and fall back only when running on an older release.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_feature_names = tfidf.get_feature_names_out()
else:
    tfidf_feature_names = tfidf.get_feature_names()

# Dense table: one row per description, one column per vocabulary term.
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf_feature_names)

# Label the rows with the show titles instead of positions.
tfidf_df.index = netflix_data_copy['title']

tfidf_df.head()



Unnamed: 0_level_0,aamir,aaron,abandoned,abandons,abbey,abduct,abducted,abduction,abductors,abducts,...,zero,zion,zip,zodiac,zoe,zombie,zombies,zone,zoo,zoom
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood & Water,0.0,0.0,0.0,0.0,0.0,0.0,0.285528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ganglands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jailbirds New Orleans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kota Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Persist the TF-IDF table (titles are written as the first CSV column).
tfidf_df.to_csv("data/tfidf_data.csv")

In [16]:
# Read the saved table back, using the first column again as the row index.
tfidf_data = pd.read_csv("data/tfidf_data.csv", index_col=0)
tfidf_data.head()

Unnamed: 0_level_0,aamir,aaron,abandoned,abandons,abbey,abduct,abducted,abduction,abductors,abducts,...,zero,zion,zip,zodiac,zoe,zombie,zombies,zone,zoo,zoom
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood & Water,0.0,0.0,0.0,0.0,0.0,0.0,0.285528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ganglands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jailbirds New Orleans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kota Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import pandas as pd

In [6]:
# Reload the TF-IDF table from Feather and restore `title` as the row index.
# The Feather file is produced by the `df.to_feather(...)` cell near the end
# of this notebook, so it has to exist on disk before this cell is run.
tfidf_feather = pd.read_feather("data/tfidf_data.feather").set_index('title')
tfidf_feather.head()

Unnamed: 0_level_0,aamir,aaron,abandoned,abandons,abbey,abduct,abducted,abduction,abductors,abducts,...,zero,zion,zip,zodiac,zoe,zombie,zombies,zone,zoo,zoom
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood & Water,0.0,0.0,0.0,0.0,0.0,0.0,0.285528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ganglands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jailbirds New Orleans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kota Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
import pickle

# Save the distinct titles to disk for later reuse.
with open("movie_list.pickle", "wb") as pickle_file:
    pickle.dump(netflix_data['title'].unique(), pickle_file)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Cosine similarity between every pair of rows (titles) in the TF-IDF table.
cosine_similarity_array = cosine_similarity(tfidf_df)

In [20]:
# Label both axes of the square similarity matrix with the titles.
cosine_similarity_df = pd.DataFrame(
    cosine_similarity_array,
    index=tfidf_df.index,
    columns=tfidf_df.index,
)

cosine_similarity_df.head()

title,Dick Johnson Is Dead,Blood & Water,Ganglands,Jailbirds New Orleans,Kota Factory,Midnight Mass,My Little Pony: A New Generation,Sankofa,The Great British Baking Show,The Starling,...,Zak Storm,Zed Plus,Zenda,Zindagi Gulzar Hai,Zinzana,Zodiac,Zombie Dumb,Zombieland,Zoom,Zubaan
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,1.0,0.0,0.0,0.0,0.017282,0.0,0.0,0.0,0.047362,0.019817,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015987,0.0
Blood & Water,0.0,1.0,0.0,0.0,0.0,0.031762,0.05597,0.0,0.0,0.0,...,0.037436,0.033547,0.111135,0.0,0.033345,0.0,0.038512,0.0,0.0,0.0
Ganglands,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022301
Jailbirds New Orleans,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015281,0.0
Kota Factory,0.017282,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.018924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043587,0.071094,0.0


In [21]:
# Should be square: one row and one column per title.
cosine_similarity_df.shape

(8807, 8807)

In [22]:
# Similarity of every title to 'Chhota Bheem', strongest match first.
cosine_similarity_series = cosine_similarity_df.loc['Chhota Bheem']
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

# Skip the top-ranked entry (position 0) and print the ten that follow.
print(ordered_similarities.iloc[1:11])

title
Little Singham                    0.225491
Tayo the Little Bus               0.221442
Bo on the Go!                     0.209406
Yoko and His Friends              0.191241
Chhota Bheem in African Safari    0.170218
Power Rangers Ninja Steel         0.169837
Mighty Raju                       0.156825
Cinema Bandi                      0.146526
Uma Maheswara Ugra Roopasya       0.143172
The Letter Reader                 0.139759
Name: Chhota Bheem, dtype: float64


In [23]:
# Titles the example user liked; their TF-IDF rows form the user profile.
list_of_movies_enjoyed = ["Blood & Water", "Chhota Bheem", "Mighty Raju"]

# Pull the TF-IDF rows for those titles, in the order listed.
movie_enjoyed_df = tfidf_df.reindex(list_of_movies_enjoyed)

movie_enjoyed_df.head()

Unnamed: 0_level_0,aamir,aaron,abandoned,abandons,abbey,abduct,abducted,abduction,abductors,abducts,...,zero,zion,zip,zodiac,zoe,zombie,zombies,zone,zoo,zoom
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Blood & Water,0.0,0.0,0.0,0.0,0.0,0.0,0.285528,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chhota Bheem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mighty Raju,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Column-wise mean of the enjoyed titles' TF-IDF rows: one weight per term.
user_prof = movie_enjoyed_df.mean()

print(user_prof)

aamir        0.0
aaron        0.0
abandoned    0.0
abandons     0.0
abbey        0.0
            ... 
zombie       0.0
zombies      0.0
zone         0.0
zoo          0.0
zoom         0.0
Length: 9904, dtype: float64


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# Candidates: every title except the ones already marked as enjoyed.
tfidf_subset_df = tfidf_df.drop(index=list_of_movies_enjoyed)

# Score each candidate against the user profile, passed as a single row
# vector, then store the scores as one labelled column.
similarity_array = cosine_similarity(
    user_prof.to_numpy().reshape(1, -1),
    tfidf_subset_df,
)
similarity_df = pd.DataFrame(
    similarity_array.T,
    index=tfidf_subset_df.index,
    columns=["similarity_score"],
)

# Highest-scoring candidates first.
sorted_similarity_df = similarity_df.sort_values("similarity_score", ascending=False)

In [27]:
# Top 15 recommendations for the user profile.
sorted_similarity_df.head(15)

Unnamed: 0_level_0,similarity_score
title,Unnamed: 1_level_1
Power Rangers Ninja Steel,0.203721
Legacies,0.182585
Bo on the Go!,0.158947
Break Ke Baad,0.156441
Little Singham,0.143889
Dinotrux Supercharged,0.14297
Magi: Adventure of Sinbad,0.134618
Race to Witch Mountain,0.134294
Tayo the Little Bus,0.121649
Cyborg 009: Call of Justice,0.120927


In [28]:
# Genre-based model: only the title and its comma-separated genre string are needed.
netflix_data_genre = netflix_data[['title', 'listed_in']]

netflix_data_genre.head()

Unnamed: 0_level_0,title,listed_in
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1
s1,Dick Johnson Is Dead,Documentaries
s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries"
s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act..."
s4,Jailbirds New Orleans,"Docuseries, Reality TV"
s5,Kota Factory,"International TV Shows, Romantic TV Shows, TV ..."


In [29]:
# One 0/1 indicator column per genre; genres are separated by ', ' in `listed_in`.
dummies = netflix_data_genre['listed_in'].str.get_dummies(sep=', ')

# Place the indicator columns next to the title and original genre string.
netflix_data_genre_dummy = pd.concat([netflix_data_genre, dummies], axis=1)

netflix_data_genre_dummy.head()

Unnamed: 0_level_0,title,listed_in,Action & Adventure,Anime Features,Anime Series,British TV Shows,Children & Family Movies,Classic & Cult TV,Classic Movies,Comedies,...,TV Action & Adventure,TV Comedies,TV Dramas,TV Horror,TV Mysteries,TV Sci-Fi & Fantasy,TV Shows,TV Thrillers,Teen TV Shows,Thrillers
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s1,Dick Johnson Is Dead,Documentaries,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries",0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
s4,Jailbirds New Orleans,"Docuseries, Reality TV",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
s5,Kota Factory,"International TV Shows, Romantic TV Shows, TV ...",0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [30]:
# Remove the raw genre string now that it has been one-hot encoded.
# Reassigning instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without raising once the column is already gone.
netflix_data_genre_dummy = netflix_data_genre_dummy.drop(columns='listed_in', errors='ignore')

In [31]:
# Index the indicator table by title so that only the genre columns remain as data.
# NOTE(review): re-running this cell on its own fails, because `title` is no
# longer a column after the first run.
netflix_data_genre_dummy = netflix_data_genre_dummy.set_index('title')

netflix_data_genre_dummy.head()

Unnamed: 0_level_0,Action & Adventure,Anime Features,Anime Series,British TV Shows,Children & Family Movies,Classic & Cult TV,Classic Movies,Comedies,Crime TV Shows,Cult Movies,...,TV Action & Adventure,TV Comedies,TV Dramas,TV Horror,TV Mysteries,TV Sci-Fi & Fantasy,TV Shows,TV Thrillers,Teen TV Shows,Thrillers
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Blood & Water,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
Ganglands,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
Jailbirds New Orleans,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Kota Factory,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [32]:
from scipy.spatial.distance import pdist, squareform

In [33]:
# Pairwise Jaccard distance between the genre indicator rows (condensed form).
jaccard_distances = pdist(netflix_data_genre_dummy, metric='jaccard')

# Expand to a square matrix and turn distance into similarity.
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Label both axes with the titles.
jaccard_similarity_df = pd.DataFrame(
    jaccard_similarity_array,
    index=netflix_data_genre_dummy.index,
    columns=netflix_data_genre_dummy.index,
)

In [34]:
# Preview the title-by-title genre similarity matrix.
jaccard_similarity_df.head()

title,Dick Johnson Is Dead,Blood & Water,Ganglands,Jailbirds New Orleans,Kota Factory,Midnight Mass,My Little Pony: A New Generation,Sankofa,The Great British Baking Show,The Starling,...,Zak Storm,Zed Plus,Zenda,Zindagi Gulzar Hai,Zinzana,Zodiac,Zombie Dumb,Zombieland,Zoom,Zubaan
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blood & Water,0.0,1.0,0.2,0.0,0.2,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
Ganglands,0.0,0.2,1.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
Jailbirds New Orleans,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kota Factory,0.0,0.2,0.2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.2,0.0,0.0,0.0


In [37]:
# Genre similarity of every title to 'Blood & Water'.
jaccard_similarity_series = jaccard_similarity_df.loc['Blood & Water']

# Rank from most to least similar; this rebinds `ordered_similarities`.
ordered_similarities = jaccard_similarity_series.sort_values(ascending=False)

In [38]:
# Top 20 by genre overlap; the query title itself is not excluded here.
ordered_similarities.head(20)

title
46                   1.0
Kissing Game         1.0
Re:Mind              1.0
Osmosis              1.0
Secret City          1.0
Blood & Water        1.0
Tabula Rasa          1.0
Into the Night       1.0
Switched             1.0
We Are the Wave      1.0
Between              1.0
More to Say          1.0
To the Lake          1.0
The Rain             1.0
Disappearance        1.0
Girl from Nowhere    1.0
The Ghost Bride      1.0
Jinn                 1.0
On Children          1.0
Open Your Eyes       1.0
Name: Blood & Water, dtype: float64

In [1]:
import pandas as pd
# Read the saved TF-IDF CSV without index_col, so the frame keeps a default
# integer index (as the preview below shows).
df = pd.read_csv("data/tfidf_data.csv")
df.head()

Unnamed: 0,title,aamir,aaron,abandoned,abandons,abbey,abduct,abducted,abduction,abductors,...,zero,zion,zip,zodiac,zoe,zombie,zombies,zone,zoo,zoom
0,Dick Johnson Is Dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Blood & Water,0.0,0.0,0.0,0.0,0.0,0.0,0.285528,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ganglands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Jailbirds New Orleans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Kota Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Write the same table in Feather format; an earlier cell reads this file back.
df.to_feather("data/tfidf_data.feather")

In [4]:
# Round-trip check: reload the Feather file and preview it.
df1 = pd.read_feather("data/tfidf_data.feather")
df1.head()

Unnamed: 0,title,aamir,aaron,abandoned,abandons,abbey,abduct,abducted,abduction,abductors,...,zero,zion,zip,zodiac,zoe,zombie,zombies,zone,zoo,zoom
0,Dick Johnson Is Dead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Blood & Water,0.0,0.0,0.0,0.0,0.0,0.0,0.285528,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ganglands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Jailbirds New Orleans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Kota Factory,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
