<a href="https://colab.research.google.com/github/pravin-nawghare/Analyzing-Netflix-Content/blob/main/Netflix_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# read the dataset
df = pd.read_csv("/content/netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# all columns in our dataset
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [4]:
# explore the datatype, total records and nan values in data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [5]:
# null values in every column
df.isnull().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


In [6]:
# dropping rows with nan values
df.dropna(subset = ['cast', 'country','rating', 'duration', 'date_added'], inplace=True)

In [7]:
# new shape of data after dropping nan values
df.shape

(7290, 12)

In [8]:
#converting date column in mysql supported date format
df['date_added'] = pd.to_datetime(df['date_added'], format='mixed', dayfirst=True)
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,2021-09-24,2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...


In [10]:
def split_duration(row):
  '''
  This function will split the column duration in two seperate columns 'seasons' and 'minutes'.
  0 is used to fill nan values. It dosen't hold any meaning
  '''
  if 'Seasons' in row['duration']:
        return pd.Series([row['duration'].split()[0], 0])
  elif 'min' in row['duration']:
        return pd.Series([0, row['duration'].split()[0]])
  else:
        return pd.Series([0, 0])

# Apply the function to the DataFrame
df[['seasons', 'minutes']] = df.apply(split_duration, axis=1)

# Convert columns to appropriate data types
df['seasons'] = df['seasons'].astype(int)
df['minutes'] = df['minutes'].astype(int)

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,seasons,minutes
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2,0
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2,0
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",0,125
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,2021-09-24,2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...,9,0
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...,0,104


In [11]:
# Fill NaN in director column with a category 'Unknown'
df.fillna({'director':'Unknown'}, inplace=True)
df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,seasons,minutes
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2,0
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2,0
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",0,125
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,2021-09-24,2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...,9,0
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...,0,104


In [12]:
# now our data don't have any null values
df.isna().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,0
date_added,0
release_year,0
rating,0
duration,0


In [13]:
def extract_cast(row):
  '''
  This function will extract the only 5 cast from the column cast .
  '''
  cast = str(row['cast']).split(', ')
  if len(cast) >= 5:
    return ', '.join(cast[:5])
  else:
    return ', '.join(cast)

# Applying the function to dataframe
df['top_5_cast'] = df.apply(extract_cast, axis=1)

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,seasons,minutes,top_5_cast
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2,0,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2,0,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K..."
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",0,125,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,2021-09-24,2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...,9,0,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho..."
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...,0,104,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T..."


In [14]:
df.reset_index(inplace=True)

In [15]:
# finding the maximum length of string value in listed_in column for definig length of
# datatype in MySql
df['listed_in'].str.len().max()

79

In [16]:
# finding the index of maximum length record
df['listed_in'].str.len().idxmax()

463

In [17]:
# records with listed_in length more than 70
df[df['listed_in'].str.len() > 70]

Unnamed: 0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,seasons,minutes,top_5_cast
254,401,s402,TV Show,Sky Rojo,Unknown,"Verónica Sánchez, Miguel Ángel Silvestre, Asie...",Spain,2021-07-23,2021,TV-MA,2 Seasons,"International TV Shows, Spanish-Language TV Sh...",A fatal turn of events at a brothel sends thre...,2,0,"Verónica Sánchez, Miguel Ángel Silvestre, Asie..."
463,744,s745,TV Show,Locombianos,Unknown,"Freddy Beltrán, Pamela Ospina, Diego Camargo, ...",Colombia,2021-06-10,2021,TV-MA,1 Season,"International TV Shows, Spanish-Language TV Sh...",Four of Colombia's funniest and bawdiest comed...,0,0,"Freddy Beltrán, Pamela Ospina, Diego Camargo, ..."
2317,2971,s2972,TV Show,Diablero,Unknown,"Christopher Von Uckermann, Horacio García Roja...",Mexico,2020-01-31,2020,TV-MA,2 Seasons,"International TV Shows, Spanish-Language TV Sh...","When a young girl goes missing in a big city, ...",2,0,"Christopher Von Uckermann, Horacio García Roja..."
2545,3232,s3233,TV Show,Zona Rosa,Unknown,"Manu NNa, Ana Julia Yeyé, Ray Contreras, Pablo...",Mexico,2019-11-26,2019,TV-MA,1 Season,"International TV Shows, Spanish-Language TV Sh...",An assortment of talent takes the stage for a ...,0,0,"Manu NNa, Ana Julia Yeyé, Ray Contreras, Pablo..."
2841,3587,s3588,TV Show,Victim Number 8,Unknown,"César Mateo, María de Nati, Verónika Moral, Iñ...",Spain,2019-08-16,2018,TV-MA,1 Season,"International TV Shows, Spanish-Language TV Sh...",No one can be trusted after a terrorist bombin...,0,0,"César Mateo, María de Nati, Verónika Moral, Iñ..."
6967,8424,s8425,TV Show,The Ministry of Time,Unknown,"Rodolfo Sancho, Aura Garrido, Nacho Fresneda, ...",Spain,2018-01-28,2017,TV-MA,3 Seasons,"International TV Shows, Spanish-Language TV Sh...",Three officers from different eras work for a ...,3,0,"Rodolfo Sancho, Aura Garrido, Nacho Fresneda, ..."


In [18]:
# drop columns which are not required
df.drop(columns=['cast', 'duration', 'index'], inplace=True)
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,listed_in,description,seasons,minutes,top_5_cast
0,s2,TV Show,Blood & Water,Unknown,South Africa,2021-09-24,2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2,0,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban..."
1,s5,TV Show,Kota Factory,Unknown,India,2021-09-24,2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2,0,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K..."
2,s8,Movie,Sankofa,Haile Gerima,"United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",0,125,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D..."
3,s9,TV Show,The Great British Baking Show,Andy Devonshire,United Kingdom,2021-09-24,2021,TV-14,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...,9,0,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho..."
4,s10,Movie,The Starling,Theodore Melfi,United States,2021-09-24,2021,PG-13,"Comedies, Dramas",A woman adjusting to life after a loss contend...,0,104,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T..."


In [19]:
# saving the new file as csv for further analysis in MySql
df.to_csv('/content/netflix_cleaned.csv')