In [1]:
import pandas as pd

In [2]:
# Read the Netflix catalogue from the CSV file in the working directory.
df_netflix = pd.read_csv(filepath_or_buffer='netflix.csv')

In [3]:
# Checking the DataFrame's Shape:
# shape is a (row_count, column_count) tuple.
df_netflix.shape

(8790, 10)

In [4]:
#Printing DataFrame Information:
# info() writes its summary to the output itself and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line.
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB
None


In [5]:
#Checking for Missing Values:
# Per-column count of null entries.
# NOTE(review): placeholder text such as 'Not Given' is an ordinary string and
# is not counted as missing here - confirm whether it should be.
df_netflix.isna().sum()

show_id         0
type            0
title           0
director        0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
dtype: int64

In [6]:
#Checking Data Types:
# Lists the dtype of every column as loaded from the CSV, before any conversion.
df_netflix.dtypes

show_id         object
type            object
title           object
director        object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
dtype: object

In [7]:
#Counting Duplicates:
# Number of rows that repeat an earlier (type, title, release_year) combination.
# Nothing is removed here; this only counts.
df_netflix.duplicated(subset=["type", "title", "release_year"]).sum()

3

In [8]:
#Selecting Every Row That Belongs to a Duplicate Group:
# keep=False flags all members of a group (first occurrence included), so both
# copies of each repeated (type, title, release_year) key are selected.
duplicate = df_netflix.loc[
    df_netflix.duplicated(subset=["type", "title", "release_year"], keep=False)
]

In [9]:
# Display the rows currently held in `duplicate` for inspection.
duplicate


Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
393,s3997,TV Show,9-Feb,Not Given,Pakistan,3/20/2019,2018,TV-14,1 Season,"International TV Shows, TV Dramas"
537,s5965,TV Show,9-Feb,Not Given,Pakistan,3/20/2019,2018,TV-14,1 Season,"International TV Shows, TV Dramas"
2925,s3963,Movie,15-Aug,Swapnaneel Jayakar,India,3/29/2019,2019,TV-14,124 min,"Comedies, Dramas, Independent Movies"
3285,s4523,Movie,22-Jul,Paul Greengrass,Norway,10/10/2018,2018,R,144 min,"Dramas, Thrillers"
4260,s5966,Movie,22-Jul,Paul Greengrass,Norway,10/10/2018,2018,R,144 min,"Dramas, Thrillers"
4261,s5967,Movie,15-Aug,Swapnaneel Jayakar,India,3/29/2019,2019,TV-14,124 min,"Comedies, Dramas, Independent Movies"


In [10]:
# Despite its name, rows_keep is True for the *repeated* rows (every occurrence
# after the first of a key) and False for the first occurrence of each key.
rows_keep = duplicate.duplicated(subset=["type", "title", "release_year"], keep='first')
rows_keep

393     False
537      True
2925    False
3285    False
4260     True
4261     True
dtype: bool

In [11]:
# Index labels of the rows where rows_keep is False, i.e. the first occurrence
# of each duplicated key; these are the labels scheduled for removal.
first_occurrences = duplicate.loc[~rows_keep]
indexes_dropped = first_occurrences.index
indexes_dropped

Int64Index([393, 2925, 3285], dtype='int64')

In [12]:
# Remove repeated (type, title, release_year) rows, keeping the last occurrence
# of each key. drop_duplicates leaves exactly one row per key and can be re-run
# safely, whereas dropping a stored list of index labels raises KeyError once
# those labels are gone.
df_netflix = df_netflix.drop_duplicates(subset=["type", "title", "release_year"], keep="last")

In [13]:
# Re-run the duplicate selection on the cleaned frame; an empty result means no
# (type, title, release_year) key occurs more than once any longer.
duplicate = df_netflix.loc[
    df_netflix.duplicated(subset=["type", "title", "release_year"], keep=False)
]
duplicate

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in


In [14]:
#Converting date_added to DateTime:
# date_added is loaded as text in month/day/year form (e.g. "3/20/2019"), so it
# is parsed with an explicit format rather than letting pandas guess.
df_netflix['date_added'] = pd.to_datetime(df_netflix['date_added'], format='%m/%d/%Y')
# release_year is already an integer year column and is left as it is:
# converting it to datetime and reading .dt.year back yields the same values
# (and would change the dtype to int32 on pandas 2.x).

In [15]:
# Display the frame after de-duplication and date parsing; pandas abbreviates
# long frames in the rendered output.
df_netflix

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,2021-09-25,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,2021-09-24,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,2021-09-22,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,2021-09-24,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"
...,...,...,...,...,...,...,...,...,...,...
8785,s8797,TV Show,Yunus Emre,Not Given,Turkey,2017-01-17,2016,TV-PG,2 Seasons,"International TV Shows, TV Dramas"
8786,s8798,TV Show,Zak Storm,Not Given,United States,2018-09-13,2016,TV-Y7,3 Seasons,Kids' TV
8787,s8801,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,2016-12-15,2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
8788,s8784,TV Show,Yoko,Not Given,Pakistan,2018-06-23,2016,TV-Y,1 Season,Kids' TV


In [16]:
# Confirm the conversion: date_added should now report a datetime64 dtype.
df_netflix.dtypes

show_id                 object
type                    object
title                   object
director                object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
dtype: object

In [17]:
#Checking DataFrame Information After Manipulations:
# info() writes the summary (row count, non-null counts, dtypes, memory usage)
# directly to the cell output.
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8787 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8787 non-null   object        
 1   type          8787 non-null   object        
 2   title         8787 non-null   object        
 3   director      8787 non-null   object        
 4   country       8787 non-null   object        
 5   date_added    8787 non-null   datetime64[ns]
 6   release_year  8787 non-null   int64         
 7   rating        8787 non-null   object        
 8   duration      8787 non-null   object        
 9   listed_in     8787 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 755.1+ KB
