In [1]:
import pandas as pd

In [2]:
# Step 1: Load dataset
df = pd.read_csv("netflix_titles.csv")

In [3]:
# Step 2: Show initial missing data summary
print("Initial Missing Data:\n", df.isnull().sum())

Initial Missing Data:
 show_id            0
type               0
title              0
director        1969
cast             570
country          476
date_added        11
release_year       0
rating            10
duration           0
listed_in          0
description        0
dtype: int64


In [4]:
# Step 3: Handle Missing Data
# Fill missing 'country' with 'Unknown'
df['country'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna('Unknown', inplace=True)


In [5]:
# Fill missing 'rating' with 'Not Rated'
df['rating'].fillna('Not Rated', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna('Not Rated', inplace=True)


In [6]:
# Drop rows where 'title' is missing (title is critical)
df.dropna(subset=['title'], inplace=True)


In [7]:
# Step 4: String Manipulation Examples

# Example 1: Convert all titles to lowercase
df['title_lower'] = df['title'].str.lower()

In [8]:
# Example 2: Extract year from 'date_added'
df['date_added'] = df['date_added'].astype(str).str.strip()
df['year_added'] = df['date_added'].str[-4:]

In [9]:
# Example 3: Check if the description contains the word 'love'
df['has_love'] = df['description'].str.contains('love', case=False, na=False)


In [10]:
# Example 4: Remove commas from 'listed_in' (genre field)
df['cleaned_genre'] = df['listed_in'].str.replace(',', '', regex=False)

In [11]:
# Example 5: Split 'director' names into lists
df['directors_list'] = df['director'].astype(str).str.split(',')

In [12]:
# Step 5: Final Preview
print("\nSample Cleaned Data:\n", df[['title', 'country', 'rating', 'title_lower', 'year_added', 'has_love']].head())


Sample Cleaned Data:
                                      title  \
0  Norm of the North: King Sized Adventure   
1               Jandino: Whatever it Takes   
2                       Transformers Prime   
3         Transformers: Robots in Disguise   
4                             #realityhigh   

                                    country    rating  \
0  United States, India, South Korea, China     TV-PG   
1                            United Kingdom     TV-MA   
2                             United States  TV-Y7-FV   
3                             United States     TV-Y7   
4                             United States     TV-14   

                               title_lower year_added  has_love  
0  norm of the north: king sized adventure       2019     False  
1               jandino: whatever it takes       2016     False  
2                       transformers prime       2018     False  
3         transformers: robots in disguise       2018     False  
4                          