In [4]:
import pandas as pd

# Step 1: Load the raw Netflix catalogue.
df = pd.read_csv("netflix_titles.csv")

# Step 2: Preview the first rows and the column summary.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Step 3: Replace missing 'country' and 'rating' values with explicit
# placeholders so these columns no longer contain NaN.
df['country'] = df['country'].fillna("Unknown")
df['rating'] = df['rating'].fillna("Unrated")

# Step 4: Clean 'date_added'.
# Strip leading/trailing whitespace before parsing.
df['date_added'] = df['date_added'].str.strip()

# Convert to datetime (pandas infers the format); values that cannot be
# parsed become NaT instead of raising.
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Step 5: Extract the year from the cleaned date.
# Rows whose date is NaT have no year, which would otherwise force the whole
# column to float (e.g. 2021.0). The nullable Int64 dtype keeps whole-number
# years and represents the gaps as <NA>.
df['year_added'] = df['date_added'].dt.year.astype('Int64')

# Persist the cleaned frame: the later cell reads 'netflix_cleaned.csv', so
# this write must run for the notebook to work from a fresh kernel.
df.to_csv("netflix_cleaned.csv", index=False)


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [2]:
import pandas as pd

# Load the cleaned catalogue, discard rows lacking a country or a genre list,
# split both comma-separated fields into lists, and expand them so that every
# (country, genre) pairing of a title gets its own row.
df = (
    pd.read_csv('netflix_cleaned.csv')
    .dropna(subset=['country', 'listed_in'])
    .assign(
        country=lambda frame: frame['country'].str.split(', '),
        genre=lambda frame: frame['listed_in'].str.split(', '),
    )
    .explode('country')
    .explode('genre')
    # Trim any commas/spaces still attached to the individual values.
    .assign(
        country=lambda frame: frame['country'].str.strip(', '),
        genre=lambda frame: frame['genre'].str.strip(', '),
    )
)

# Keep only rows where neither value ended up as an empty string.
df = df[~(df['country'].eq('') | df['genre'].eq(''))]

# Write the normalized table without the (now repeated) index.
df.to_csv('netflix_normalized_1.csv', index=False)
