1.Load the Dataset

In [1]:
import pandas as pd
df = pd.read_csv("netflix_titles.csv")
print(df.shape)
print(df.info())

(8807, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None


2.Identify missing values

In [2]:
print(df.isnull().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


3.Handle Missing Data

a) For critical columns:

In [3]:
df.dropna(subset=['director','country','rating','duration'], inplace=True)

b) For less-critical columns:

In [4]:
df['cast'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('Unknown', inplace=True)


c) Alternatively, fill missing values with a mode or placeholder:

In [5]:
df['country'].fillna(df['country'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna(df['country'].mode()[0], inplace=True)


4. Remove Duplicates

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.drop_duplicates(subset='title', keep='first', inplace=True)

5. Clean & Transform Columns

In [8]:
df['show_id'] = df['show_id'].str.replace('s','')

In [9]:
df['cast'] = df['cast'].str.split(',').str[0]

6. Standardize Date Formats

In [11]:
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), format='mixed', dayfirst=False, errors='coerce')

7. Convert Data Types

In [12]:
df['release_year'] = df['release_year'].astype(int)
# To get numeric duration:
def get_duration(x):
    return int(x.split()[0])
df['duration_int'] = df['duration'].apply(get_duration)

8. Rename Columns

In [13]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

9.  Save the Cleaned Dataset

In [14]:
df.to_csv("netflix_cleaned.csv", index=False)

In [15]:
import pandas as pd
df = pd.read_csv("/content/netflix_cleaned.csv")
print(df.shape)
print(df.info())

(5747, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5747 entries, 0 to 5746
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5747 non-null   int64 
 1   type          5747 non-null   object
 2   title         5747 non-null   object
 3   director      5747 non-null   object
 4   cast          5747 non-null   object
 5   country       5747 non-null   object
 6   date_added    5747 non-null   object
 7   release_year  5747 non-null   int64 
 8   rating        5747 non-null   object
 9   duration      5747 non-null   object
 10  listed_in     5747 non-null   object
 11  description   5747 non-null   object
 12  duration_int  5747 non-null   int64 
dtypes: int64(3), object(10)
memory usage: 583.8+ KB
None
