In [1]:
# pandas: load and manipulate the data as a DataFrame
# warnings: suppress unwanted/unnecessary warning messages
# CountVectorizer: convert text into numeric token-count vectors that a machine can work with
# cosine_similarity: measure how similar each pair of text vectors is

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the Netflix titles CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook only runs where this exact file exists.
df = pd.read_csv(filepath_or_buffer="D:\\practice_data_set\\netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# Check the shape of the DataFrame as (rows, columns).
df.shape

(8807, 12)

In [5]:
# Summarise the dataset: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [6]:
# Keep only the columns needed for the recommender.
# .copy() makes df1 an independent DataFrame, so later modifications of df1
# (filling missing values, adding a column) act on its own data rather than
# on a slice of df, which is the situation pandas reports with
# SettingWithCopyWarning.
df1 = df[['type','title','director','cast','description']].copy()
df1.head()

Unnamed: 0,type,title,director,cast,description
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,,,"Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",In a city of coaching centers known to train I...


In [7]:
# Count the rows of df1 that are exact duplicates of an earlier row.
df1.duplicated().sum()

0

In [8]:
# Count missing values per column: isnull() gives a boolean mask and sum()
# counts the True entries in each column.
df1.isnull().sum()

type              0
title             0
director       2634
cast            825
description       0
dtype: int64

In [9]:
# Replace missing values (director, cast) with a single space so that every
# text column holds a string before the columns are concatenated.
# The filler is whitespace only, so it contributes no tokens when the text is
# later vectorized with CountVectorizer's default tokenizer.
# Reassigning (instead of inplace=True) avoids mutating a frame that was
# derived by slicing another DataFrame.
df1 = df1.fillna(' ')

In [10]:
# Confirm that no missing values remain in any column.
df1.isnull().sum()

type           0
title          0
director       0
cast           0
description    0
dtype: int64

In [11]:
# Join the text columns into one space-separated string per title;
# this combined text is what gets vectorized and compared later on.
text_columns = ['title', 'director', 'cast', 'description']
combined_features = df1[text_columns[0]]
for column in text_columns[1:]:
    combined_features = combined_features + ' ' + df1[column]
combined_features

0       Dick Johnson Is Dead Kirsten Johnson   As her ...
1       Blood & Water   Ama Qamata, Khosi Ngema, Gail ...
2       Ganglands Julien Leclercq Sami Bouajila, Tracy...
3       Jailbirds New Orleans     Feuds, flirtations a...
4       Kota Factory   Mayur More, Jitendra Kumar, Ran...
                              ...                        
8802    Zodiac David Fincher Mark Ruffalo, Jake Gyllen...
8803    Zombie Dumb     While living alone in a spooky...
8804    Zombieland Ruben Fleischer Jesse Eisenberg, Wo...
8805    Zoom Peter Hewitt Tim Allen, Courteney Cox, Ch...
8806    Zubaan Mozez Singh Vicky Kaushal, Sarah-Jane D...
Length: 8807, dtype: object

In [12]:
# Inspect the full combined text of the row labelled 0 (the first title).
combined_features[0]

'Dick Johnson Is Dead Kirsten Johnson   As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [13]:
# Attach the combined text to df1 as a new 'combined_features' column
# and preview the result.
df1 = df1.assign(combined_features=combined_features)
df1.head()

Unnamed: 0,type,title,director,cast,description,combined_features
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead Kirsten Johnson As her ...
1,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","After crossing paths at a party, a Cape Town t...","Blood & Water Ama Qamata, Khosi Ngema, Gail ..."
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",To protect his family from a powerful drug lor...,"Ganglands Julien Leclercq Sami Bouajila, Tracy..."
3,TV Show,Jailbirds New Orleans,,,"Feuds, flirtations and toilet talk go down amo...","Jailbirds New Orleans Feuds, flirtations a..."
4,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",In a city of coaching centers known to train I...,"Kota Factory Mayur More, Jitendra Kumar, Ran..."


In [14]:
# Reduce the frame to the show type and the combined text used for vectorizing.
df2 = df1.loc[:, ['type', 'combined_features']]
df2.head()

Unnamed: 0,type,combined_features
0,Movie,Dick Johnson Is Dead Kirsten Johnson As her ...
1,TV Show,"Blood & Water Ama Qamata, Khosi Ngema, Gail ..."
2,TV Show,"Ganglands Julien Leclercq Sami Bouajila, Tracy..."
3,TV Show,"Jailbirds New Orleans Feuds, flirtations a..."
4,TV Show,"Kota Factory Mayur More, Jitendra Kumar, Ran..."


In [15]:
# Instantiate a CountVectorizer with its default settings.
cv = CountVectorizer()

In [16]:
# Learn the vocabulary from the combined text and build the token-count
# matrix (one row per title, one column per vocabulary token).
# NOTE(review): .toarray() converts the result into a dense array, which can
# use a lot of memory for a large vocabulary; cosine_similarity is documented
# to accept sparse input, so the dense conversion may be unnecessary.
# Confirm before changing.
vectors = cv.fit_transform(df2['combined_features']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Inspect the token-count vector of the first title.
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
# List the feature names (vocabulary tokens) learned by the vectorizer.
cv.get_feature_names_out()

array(['000', '007', '009', ..., '잡는다', '최강전사', '탄생'], dtype=object)

In [19]:
# Compute the pairwise cosine similarity between all title vectors:
# entry [i, j] is the similarity between the texts of rows i and j.
similarities = cosine_similarity(vectors)

In [20]:
# Preview the similarity matrix.
similarities

array([[1.        , 0.05345225, 0.16699314, ..., 0.09548198, 0.08989331,
        0.21295885],
       [0.05345225, 1.        , 0.01673655, ..., 0.01913898, 0.        ,
        0.        ],
       [0.16699314, 0.01673655, 1.        , ..., 0.08968971, 0.04222003,
        0.22004401],
       ...,
       [0.09548198, 0.01913898, 0.08968971, ..., 1.        , 0.09656091,
        0.0457509 ],
       [0.08989331, 0.        , 0.04222003, ..., 0.09656091, 1.        ,
        0.04307305],
       [0.21295885, 0.        , 0.22004401, ..., 0.0457509 , 0.04307305,
        1.        ]])

In [21]:
# The matrix is square: one row and one column per title.
similarities.shape

(8807, 8807)

In [22]:
# Similarity of the title in row 1 to every title in the dataset.
similarities[1]

array([0.05345225, 1.        , 0.01673655, ..., 0.01913898, 0.        ,
       0.        ])

In [23]:
# Look up the index of the first row whose title is 'Blood & Water'.
sorted_match = df1.index[df1['title'] == 'Blood & Water'][0]
sorted_match

1

In [24]:
# Index of the first row whose type is 'TV Show'.
# NOTE: the name `show_type` is reused by a function defined in a later cell,
# which replaces this value once that cell runs.
show_type = df1.index[df1['type'] == 'TV Show'][0]
show_type

1

In [25]:
# similarities[0] holds the similarity of every title to the title in row 0.
# enumerate() pairs each score with its row position, sorting by score in
# descending order ranks the closest matches first, and the slice keeps the top 6.
sorted(enumerate(similarities[0]), key=lambda pair: pair[1], reverse=True)[:6]

[(0, 1.0),
 (5233, 0.4427188724235731),
 (7015, 0.42459591394742013),
 (6660, 0.38245085529570716),
 (854, 0.3597091616898077),
 (4125, 0.3544587784792833)]

In [26]:
# Recommend the Movies/TV shows closest to a given title.
def reccomended(movie, top_n=6):
    """Print the titles most similar to ``movie``.

    Similarities are read from the precomputed ``similarities`` matrix, whose
    rows and columns follow the row order of ``df1``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df1['title']``. If the title occurs more
        than once, the first occurrence is used.
    top_n : int, default 6
        Number of titles to print. The queried title itself is part of the
        ranking (it has similarity 1.0 with itself), so the default normally
        prints the title followed by its five closest matches.

    Raises
    ------
    IndexError
        If ``movie`` is not present in ``df1['title']``.
    """
    titles = df1["title"]

    # Work with row *positions* rather than index labels: both `similarities`
    # and `.iloc` are positional, so this stays correct even if df1 does not
    # carry the default 0..n-1 index.
    positions = (titles == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"title {movie!r} not found in df1['title']")
    movie_index = positions[0]

    dist = similarities[movie_index]
    # Rank every title by its similarity to `movie`, highest first.
    movie_list = sorted(enumerate(dist), key=lambda x: x[1], reverse=True)[:top_n]

    for position, _score in movie_list:
        print(titles.iloc[position])

In [27]:
# Example: print the titles most similar to 'Blood & Water'.
reccomended('Blood & Water')

Blood & Water
Mom
The Parkers
Veronica
Beiimaan Love
The Boy Who Cried Werewolf


In [28]:
# Given a title, report its show type (e.g. 'Movie' or 'TV Show').
def show_type(movie):
    """Print the ``type`` of the title ``movie``.

    The type is looked up for the title that is passed in, not for a row
    index computed elsewhere in the notebook.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df1['title']``. If the title occurs more
        than once, the type of the first occurrence is printed.

    Raises
    ------
    IndexError
        If ``movie`` is not present in ``df1['title']``.
    """
    types = df1.loc[df1["title"] == movie, "type"]
    if types.empty:
        raise IndexError(f"title {movie!r} not found in df1['title']")
    print(types.iloc[0])

In [29]:
# Example: print the show type of 'Blood & Water'.
show_type('Blood & Water')

TV Show


In [30]:
# Combined function: given a title, print both its recommendations and its show type.
def combined_show_function(show):
    """Print the titles most similar to ``show``, followed by ``show``'s type.

    Parameters
    ----------
    show : str
        Exact title to look up in ``df1['title']``. If the title occurs more
        than once, the first occurrence is used.

    Raises
    ------
    IndexError
        If ``show`` is not present in ``df1['title']`` (raised by the
        recommendation step).
    """
    # Recommendations: reuse the existing helper instead of duplicating its body.
    reccomended(show)

    # Show type: look it up for *this* title. It must not come from the
    # module-level `sorted_match`, which is fixed to one particular title and
    # would report that title's type for every input.
    types = df1.loc[df1["title"] == show, "type"]
    print(types.iloc[0])

# Example usage: recommendations and show type for 'Blood & Water'.
combined_show_function('Blood & Water')

Blood & Water
Mom
The Parkers
Veronica
Beiimaan Love
The Boy Who Cried Werewolf
TV Show
