<a href="https://colab.research.google.com/github/solaiprakashv/ELEVATELABS/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Import libraries
import pandas as pd
from google.colab import files

# Step 2: Upload the dataset
uploaded = files.upload()

# After upload, the filename will be the key of 'uploaded'
file_path = list(uploaded.keys())[0]

# Step 3: Load dataset
df = pd.read_csv(file_path)

# 1. Identify missing values
print("Missing Values Before Cleaning:")
print(df.isnull().sum())

# 2. Remove duplicate rows
df_cleaned = df.drop_duplicates()

# 3. Standardize text values (convert country names to lowercase)
df_cleaned['country'] = df_cleaned['country'].str.lower()

# 4. Convert date formats to consistent type
df_cleaned['date_added'] = pd.to_datetime(
    df_cleaned['date_added'], errors='coerce', format='%B %d, %Y'
)

# 5. Rename column headers (lowercase, replace spaces with underscores)
df_cleaned.columns = df_cleaned.columns.str.strip().str.lower().str.replace(' ', '_')

# 6. Fix data types (release_year as int)
df_cleaned['release_year'] = df_cleaned['release_year'].astype('Int64')

# Optional: Fill missing values
df_cleaned['country'] = df_cleaned['country'].fillna('unknown')
df_cleaned['director'] = df_cleaned['director'].fillna('unknown')
df_cleaned['cast'] = df_cleaned['cast'].fillna('unknown')
df_cleaned['rating'] = df_cleaned['rating'].fillna('Not Rated')
df_cleaned['duration'] = df_cleaned['duration'].fillna('unknown')

# Check missing values after cleaning
print("\nMissing Values After Cleaning:")
print(df_cleaned.isnull().sum())

# Display cleaned dataset info
print("\nCleaned Dataset Info:")
print(df_cleaned.info())

# Step 4: Save cleaned dataset
output_path = "netflix_titles_cleaned.csv"
df_cleaned.to_csv(output_path, index=False)

# Step 5: Download cleaned dataset
files.download(output_path)
print(f"\n✅ Cleaned dataset saved and ready to download: {output_path}")


Saving netflix_titles.csv to netflix_titles.csv
Missing Values Before Cleaning:
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

Missing Values After Cleaning:
show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      98
release_year     0
rating           0
duration         0
listed_in        0
description      0
dtype: int64

Cleaned Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Cleaned dataset saved and ready to download: netflix_titles_cleaned.csv
