In [53]:
# In this project, I possess proficiency in comprehensive pandas operations, adeptly handling missing data, proficient Python coding, robust model development, insightful data visualization, and meticulous exploratory data analysis.

In [1]:
# Import pandas to deal with dataframe 
# Import warnings to avoid unwanted/unneccesary instructios
# Import CountVectorizer for transform a data in understandable to machines
# Import cosine similarity for to find relation between internal sentences
# Import difflib for just a extract feature names

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import difflib

#  Basic Pandas operation

In [3]:
# read a data frame to work with data 
df = pd.read_csv("D:\\practice_data_set\\netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# check dimensionality/shape of data frame
df.shape

(8807, 12)

In [5]:
# check information about data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [6]:
# Create a new dataframe collect only required sample
df1 = df[['type','title','director','cast','description']]
df1.head()

Unnamed: 0,type,title,director,cast,description
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,,,"Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",In a city of coaching centers known to train I...


In [7]:
# Check how many values in our dataframe are duplicated
df1.duplicated().sum()

0

In [8]:
# to find a null values in dataframe and avoiding a boolean use a sum function 
df1.isnull().sum()

type              0
title             0
director       2634
cast            825
description       0
dtype: int64

In [9]:
# with the help of fillna fill a blank values, here just enter a empty string because in count vectorizer not calculate space
df1.fillna(' ',inplace=True)

In [10]:
df1.isnull().sum()

type           0
title          0
director       0
cast           0
description    0
dtype: int64

# New DataFrame

In [11]:
# here i can create a new dataframe for apply countvectorizer algorithm.
# Combined a features find correct relations in between sentences.
combined_features = df1['title'] +' '+ df1['director'] +' '+ df1['cast'] +' '+ df1['description']
combined_features

0       Dick Johnson Is Dead Kirsten Johnson   As her ...
1       Blood & Water   Ama Qamata, Khosi Ngema, Gail ...
2       Ganglands Julien Leclercq Sami Bouajila, Tracy...
3       Jailbirds New Orleans     Feuds, flirtations a...
4       Kota Factory   Mayur More, Jitendra Kumar, Ran...
                              ...                        
8802    Zodiac David Fincher Mark Ruffalo, Jake Gyllen...
8803    Zombie Dumb     While living alone in a spooky...
8804    Zombieland Ruben Fleischer Jesse Eisenberg, Wo...
8805    Zoom Peter Hewitt Tim Allen, Courteney Cox, Ch...
8806    Zubaan Mozez Singh Vicky Kaushal, Sarah-Jane D...
Length: 8807, dtype: object

In [12]:
# access only zeroth index sentence
combined_features[0]

'Dick Johnson Is Dead Kirsten Johnson   As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.'

In [13]:
df1['combined_features'] = combined_features
df1.head()

Unnamed: 0,type,title,director,cast,description,combined_features
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,"As her father nears the end of his life, filmm...",Dick Johnson Is Dead Kirsten Johnson As her ...
1,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","After crossing paths at a party, a Cape Town t...","Blood & Water Ama Qamata, Khosi Ngema, Gail ..."
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",To protect his family from a powerful drug lor...,"Ganglands Julien Leclercq Sami Bouajila, Tracy..."
3,TV Show,Jailbirds New Orleans,,,"Feuds, flirtations and toilet talk go down amo...","Jailbirds New Orleans Feuds, flirtations a..."
4,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",In a city of coaching centers known to train I...,"Kota Factory Mayur More, Jitendra Kumar, Ran..."


In [14]:
df2 = df1[['type','combined_features']]
df2.head()

Unnamed: 0,type,combined_features
0,Movie,Dick Johnson Is Dead Kirsten Johnson As her ...
1,TV Show,"Blood & Water Ama Qamata, Khosi Ngema, Gail ..."
2,TV Show,"Ganglands Julien Leclercq Sami Bouajila, Tracy..."
3,TV Show,"Jailbirds New Orleans Feuds, flirtations a..."
4,TV Show,"Kota Factory Mayur More, Jitendra Kumar, Ran..."


# CountVectorizer Model

In [15]:
# Build a model of count vectorizer
cv = CountVectorizer()

In [16]:
cv.fit_transform(df2['combined_features'])

<8807x53215 sparse matrix of type '<class 'numpy.int64'>'
	with 350550 stored elements in Compressed Sparse Row format>

In [17]:
vectors = cv.fit_transform(df2['combined_features']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
# Here just access zeroth index for a understanding
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
# access feature names/column names
cv.get_feature_names_out()

array(['000', '007', '009', ..., '잡는다', '최강전사', '탄생'], dtype=object)

In [20]:
# to find cosine similarities of vectors because find a similarity between sentences with the help of cosine similarity
similarities = cosine_similarity(vectors)

In [21]:
similarities

array([[1.        , 0.05345225, 0.16699314, ..., 0.09548198, 0.08989331,
        0.21295885],
       [0.05345225, 1.        , 0.01673655, ..., 0.01913898, 0.        ,
        0.        ],
       [0.16699314, 0.01673655, 1.        , ..., 0.08968971, 0.04222003,
        0.22004401],
       ...,
       [0.09548198, 0.01913898, 0.08968971, ..., 1.        , 0.09656091,
        0.0457509 ],
       [0.08989331, 0.        , 0.04222003, ..., 0.09656091, 1.        ,
        0.04307305],
       [0.21295885, 0.        , 0.22004401, ..., 0.0457509 , 0.04307305,
        1.        ]])

In [22]:
similarities.shape

(8807, 8807)

In [23]:
similarities[1]

array([0.05345225, 1.        , 0.01673655, ..., 0.01913898, 0.        ,
       0.        ])

In [24]:
# Access title from data frame
df1['title']

0        Dick Johnson Is Dead
1               Blood & Water
2                   Ganglands
3       Jailbirds New Orleans
4                Kota Factory
                ...          
8802                   Zodiac
8803              Zombie Dumb
8804               Zombieland
8805                     Zoom
8806                   Zubaan
Name: title, Length: 8807, dtype: object

In [25]:
# Title are converted into an list for to find a best_match
Show_list = df1['title'].to_list()
Show_list

['Dick Johnson Is Dead',
 'Blood & Water',
 'Ganglands',
 'Jailbirds New Orleans',
 'Kota Factory',
 'Midnight Mass',
 'My Little Pony: A New Generation',
 'Sankofa',
 'The Great British Baking Show',
 'The Starling',
 'Vendetta: Truth, Lies and The Mafia',
 'Bangkok Breaking',
 'Je Suis Karl',
 'Confessions of an Invisible Girl',
 'Crime Stories: India Detectives',
 'Dear White People',
 "Europe's Most Dangerous Man: Otto Skorzeny in Spain",
 'Falsa identidad',
 'Intrusion',
 'Jaguar',
 'Monsters Inside: The 24 Faces of Billy Milligan',
 'Resurrection: Ertugrul',
 'Avvai Shanmughi',
 'Go! Go! Cory Carson: Chrissy Takes the Wheel',
 'Jeans',
 'Love on the Spectrum',
 'Minsara Kanavu',
 'Grown Ups',
 'Dark Skies',
 'Paranoia',
 'Ankahi Kahaniya',
 'Chicago Party Aunt',
 'Sex Education',
 'Squid Game',
 'Tayo and Little Wizards',
 'The Father Who Moves Mountains',
 'The Stronghold',
 'Angry Birds',
 'Birth of the Dragon',
 'Chhota Bheem',
 'He-Man and the Masters of the Universe',
 'Jaws

# Work with title

In [26]:
Show_name = 'Blood & Water'

In [27]:
best_match = difflib.get_close_matches(Show_name,Show_list)
best_match

['Blood & Water', 'Blood Father', 'Blood Pact']

In [28]:
close_match = best_match[0]
close_match

'Blood & Water'

In [29]:
index_of_movie = df1[df1['title'] == close_match]
index_of_movie

Unnamed: 0,type,title,director,cast,description,combined_features
1,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","After crossing paths at a party, a Cape Town t...","Blood & Water Ama Qamata, Khosi Ngema, Gail ..."


In [30]:
# access index of according to title of Movie and TV Shows using that index we find a similarity/relations in between their 
title_index = df1[df1['title']=='Blood & Water'].index[0]
title_index

1

In [31]:
# Check similarity according to index
similarities[1]

array([0.05345225, 1.        , 0.01673655, ..., 0.01913898, 0.        ,
       0.        ])

In [32]:
# it can give 100 sample that are matches to the index  
similarities[1][:100]

array([0.05345225, 1.        , 0.01673655, 0.03929887, 0.01822707,
       0.04591   , 0.04708236, 0.0348684 , 0.        , 0.08368274,
       0.02146694, 0.01380131, 0.06629935, 0.04708236, 0.        ,
       0.05020964, 0.08717101, 0.04591   , 0.06694619, 0.07377111,
       0.        , 0.0174342 , 0.03877834, 0.06332476, 0.        ,
       0.01801875, 0.05345225, 0.03194383, 0.01781742, 0.01822707,
       0.        , 0.06388766, 0.06761234, 0.01517942, 0.        ,
       0.06446584, 0.01657484, 0.01889822, 0.0334731 , 0.02020305,
       0.02760262, 0.01707469, 0.03524537, 0.08287419, 0.05020964,
       0.01889822, 0.04834938, 0.01673655, 0.04925318, 0.        ,
       0.05286805, 0.02857143, 0.01150109, 0.06121334, 0.02877772,
       0.040996  , 0.03060667, 0.03011693, 0.02562839, 0.01328032,
       0.0518563 , 0.01482499, 0.02760262, 0.03060667, 0.        ,
       0.02112886, 0.0225877 , 0.03380617, 0.        , 0.020498  ,
       0.0404061 , 0.01517942, 0.0311211 , 0.0360375 , 0.     

In [33]:
# here, it can access one by one value and append in list
match_list = list(enumerate(similarities[title_index]))
match_list

[(0, 0.05345224838248488),
 (1, 0.9999999999999983),
 (2, 0.016736548175114458),
 (3, 0.03929887459459296),
 (4, 0.01822706541441223),
 (5, 0.04591000258371299),
 (6, 0.04708236154307583),
 (7, 0.03486840218772033),
 (8, 0.0),
 (9, 0.08368274087557229),
 (10, 0.021466939537054593),
 (11, 0.013801311186847081),
 (12, 0.06629935441317959),
 (13, 0.04708236154307583),
 (14, 0.0),
 (15, 0.05020964452534338),
 (16, 0.08717100546930083),
 (17, 0.04591000258371299),
 (18, 0.06694619270045783),
 (19, 0.07377111135633176),
 (20, 0.0),
 (21, 0.017434201093860166),
 (22, 0.038778336716474064),
 (23, 0.06332475868613036),
 (24, 0.0),
 (25, 0.01801874925391118),
 (26, 0.05345224838248488),
 (27, 0.031943828249996996),
 (28, 0.017817416127494958),
 (29, 0.01822706541441223),
 (30, 0.0),
 (31, 0.06388765649999399),
 (32, 0.06761234037828133),
 (33, 0.015179418517972908),
 (34, 0.0),
 (35, 0.06446583712203043),
 (36, 0.016574838603294898),
 (37, 0.01889822365046136),
 (38, 0.033473096350228916),
 (39,

In [34]:
sorted(match_list,reverse = True)

[(8806, 0.0),
 (8805, 0.0),
 (8804, 0.019138975058773818),
 (8803, 0.026082026547865053),
 (8802, 0.0),
 (8801, 0.03779644730092272),
 (8800, 0.01725163898355886),
 (8799, 0.03733266964732787),
 (8798, 0.01822706541441223),
 (8797, 0.05175491695067658),
 (8796, 0.0283695078181321),
 (8795, 0.018666334823663935),
 (8794, 0.0),
 (8793, 0.044136741475237475),
 (8792, 0.019138975058773818),
 (8791, 0.0836827408755723),
 (8790, 0.03283545516515593),
 (8789, 0.016417727582577965),
 (8788, 0.03414938883812553),
 (8787, 0.033149677206589796),
 (8786, 0.09498713802919552),
 (8785, 0.01822706541441223),
 (8784, 0.0),
 (8783, 0.07859774918918593),
 (8782, 0.09035079029052512),
 (8781, 0.031943828249996996),
 (8780, 0.0),
 (8779, 0.0),
 (8778, 0.0),
 (8777, 0.07559289460184544),
 (8776, 0.033806170189140665),
 (8775, 0.019138975058773818),
 (8774, 0.03138824102871722),
 (8773, 0.05976143046671968),
 (8772, 0.04749356901459776),
 (8771, 0.02898855178262242),
 (8770, 0.03138824102871722),
 (8769, 0.

In [35]:
# Here use itemgetter for sortting value withindex
sorted_match = sorted(match_list,key = itemgetter(1,0),reverse = True)

In [36]:
sorted_match

[(1, 0.9999999999999983),
 (4692, 0.1781741612749496),
 (1922, 0.1569078098447415),
 (6277, 0.1493306785893115),
 (5025, 0.1493306785893115),
 (3323, 0.14602041508114227),
 (1403, 0.14374722712498647),
 (6405, 0.1414213562373095),
 (679, 0.14098147537004826),
 (6788, 0.13944333775567924),
 (3516, 0.13389238540091566),
 (6052, 0.13259870882635919),
 (3493, 0.13259870882635919),
 (3492, 0.13259870882635919),
 (1162, 0.13259870882635919),
 (4483, 0.13134182066062372),
 (4202, 0.1301200097264711),
 (2960, 0.1301200097264711),
 (1260, 0.12893167424406085),
 (895, 0.12880163722232754),
 (3584, 0.12857142857142856),
 (1111, 0.12777531299998798),
 (7552, 0.1275894579008856),
 (5583, 0.1267731382092775),
 (8202, 0.12677313820927746),
 (3220, 0.12677313820927746),
 (6650, 0.12613124477737825),
 (6994, 0.12472191289246472),
 (5324, 0.12472191289246472),
 (8338, 0.12448440881789545),
 (5641, 0.12448440881789545),
 (2904, 0.12344267996967352),
 (1825, 0.12344267996967351),
 (8746, 0.123358790948792

In [37]:
# sorts and extracts the top 6 index-similarity pairs from a similarity list, based on similarity values, in descending order.
sorted(list(enumerate(similarities[0])),key=(lambda x : x[1]),reverse = True)[0:6]

[(0, 1.0),
 (5233, 0.4427188724235731),
 (7015, 0.42459591394742013),
 (6660, 0.38245085529570716),
 (854, 0.3597091616898077),
 (4125, 0.3544587784792833)]

In [38]:
# Create a function reccomend five movie/tvshows according to their similarity

In [39]:
# create a function for closest Movies/Tv shows
def reccomended(movie):
    movie_index = df1[df1["title"] == movie].index[0]
    dist = similarities[movie_index]
    movie_list = sorted(list(enumerate(dist)),key=(lambda x : x[1]),reverse = True)[0:6]

    for i in movie_list:
        #print(i[0])
         print(df1.iloc[i[0]].title)

In [40]:
reccomended('Blood & Water')

Blood & Water
Mom
The Parkers
Veronica
Beiimaan Love
The Boy Who Cried Werewolf


# Code for a type

In [41]:
# access index of show type
type_index = df1[df1['type'] == 'TV Show'].index[0]
type_index

1

In [42]:
# Code for a show type 
shows = df1[df1['type'] == 'TV Show']
movies = df1[df1['type'] == 'Movie']
movie_index = df1[df1["title"] == 'Blood & Water'].index[0]

if movie_index in shows.index:
    print('TV Show')
elif movie_index in movies.index:
    print('Movie')
else:
    None

TV Show


In [43]:
# Create a function for identification of shows_type give a title they can give it's type
def show_type_fun(show_name):
    shows = df1[df1['type'] == 'TV Show']
    movies = df1[df1['type'] == 'Movie']
    movie_index = df1[df1["title"] == show_name].index[0]
    if movie_index in shows.index:
        print('TV Show')
    elif movie_index in movies.index:
        print('Movie')
    else:
        None

In [44]:
show_type_fun('Blood & Water')

TV Show


In [45]:
show_type_fun('Dick Johnson Is Dead')

Movie


In [46]:
show_type_fun('Kota Factory')	

TV Show


# Combination of two function

In [47]:
# combined functions for if we pass a title they can give to both of you i.e. Recommendation as well as show type of that title
def combined_show_function(show):
    # Code from reccomendation
    movie_index = df1[df1["title"] == show].index[0]
    dist = similarities[movie_index]
    movie_list = sorted(list(enumerate(dist)), key=(lambda x: x[1]), reverse=True)[0:6]

    for i in movie_list:
        print(df1.iloc[i[0]].title)

    # Code from show_type
    shows = df1[df1['type'] == 'TV Show']
    movies = df1[df1['type'] == 'Movie']
    movie_index = df1[df1["title"] == show].index[0]
    if movie_index in shows.index:
        print('TV Show')
    elif movie_index in movies.index:
        print('Movie')
    else:
        None

In [48]:
combined_show_function('Blood & Water')

Blood & Water
Mom
The Parkers
Veronica
Beiimaan Love
The Boy Who Cried Werewolf
TV Show


In [49]:
combined_show_function('Dick Johnson Is Dead')

Dick Johnson Is Dead
The Death and Life of Marsha P. Johnson
How to Be a Player
Earth's Natural Wonders: Life at the Extremes
Creating an Army of the Dead
The Last of the Schmucks
Movie


In [50]:
combined_show_function('Kota Factory')

Kota Factory
Muqaddar ka Faisla
Duniya
Prague
Krish Trish and Baltiboy: Battle of Wits
Jada
TV Show


In [51]:
combined_show_function('Sankofa')

Sankofa
The Alcàsser Murders
In Family I Trust
Motu Patlu: King of Kings
Terrorism Close Calls
Born in Gaza
Movie


In [52]:
combined_show_function('Jaws')

Jaws
Jaws 2
Jaws: The Revenge
The Dragon Prince
The 9th Precinct
Jackie Brown
Movie
