# Indian Movies Dataset Cleaning 

## 1. Load Library and Data

In [2]:
import pandas as pd

In [4]:
# Read the raw Indian movies dataset into a DataFrame.
# The CSV is loaded from the absolute path below, so the file must exist at
# exactly this location (it is not resolved relative to the notebook).
raw_csv_path = '/home/nineleaps/Documents/Python_Training/Final_test/movies_data/raw_data/indian_movies.csv'
df = pd.read_csv(raw_csv_path)

## 2. Initial Data Inspection

In [5]:
# Overview of the raw data before any cleaning: sample rows, schema, and
# per-column missing-value counts.
print("--- Initial Data ---")

# Preview the leading rows of the frame
print("\nFirst 5 rows (Initial):")
initial_preview = df.head()
print(initial_preview)

# info() writes its summary itself, so it is called directly rather than
# being wrapped in print()
print("\nInitial DataFrame Info:")
df.info()

# Count NaNs per column, then list the columns with the most gaps first
print("\nInitial Missing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

--- Initial Data ---

First 5 rows (Initial):
   Unnamed: 0 MovieID                       Title  \
0           7  MOV008              Sarkar (Tamil)   
1           9  MOV010                 2.0 (Tamil)   
2          21  MOV022            K.G.F: Chapter 1   
3          23  MOV024  Avengers: Endgame (Dubbed)   
4          25  MOV026                    Pailwaan   

                      Director                   Genre  ReleaseYear  \
0             A. R. Murugadoss         Action Thriller       2005.0   
1                   S. Shankar  Science Fiction Action       2018.0   
2               Prashanth Neel            Action Drama       2018.0   
3  Anthony Russo, Joseph Russo               Superhero       2019.0   
4                   S. Krishna           Sports Action       2019.0   

   Budget (Crores)  BoxOffice (Crores)  Rating  Duration (minutes)  \
0              NaN                 NaN     7.5               173.0   
1            550.0               800.0     6.9               147.0  

## 3. Data Cleaning Steps

### 3.1 Drop Redundant Column

In [6]:
# Remove the 'Unnamed: 0' column when it is present; otherwise just report
# that there is nothing to drop.
has_unnamed_column = 'Unnamed: 0' in df.columns
if not has_unnamed_column:
    print("\n1. 'Unnamed: 0' column not found.")
else:
    print("\n1. Dropping 'Unnamed: 0' column...")
    # drop() hands back a new DataFrame, which is rebound to df
    df = df.drop(columns='Unnamed: 0')


1. Dropping 'Unnamed: 0' column...


### 3.2 Impute Missing Values

In [7]:
# Fill missing values in the numeric columns with each column's median.
print("\n2. Imputing missing values with median...")
# Columns that receive median imputation
cols_to_impute = ['Budget (Crores)', 'BoxOffice (Crores)', 'Rating', 'Duration (minutes)', 'ReleaseYear']
for col in cols_to_impute:
    # Nothing to do for columns that are absent or already complete
    if col not in df.columns or not df[col].isna().any():
        continue
    # Median of the observed (non-missing) values in this column
    median_val = df[col].median()
    # A column with no observed values yields a NaN median; filling with it
    # would change nothing, so report the skip instead of claiming success.
    if pd.isna(median_val):
        print(f"   - Skipped '{col}': no non-missing values to compute a median from.")
        continue
    # Assign the filled Series back rather than using inplace=True
    df[col] = df[col].fillna(median_val)
    print(f"   - Missing values in '{col}' imputed with median: {median_val}")


2. Imputing missing values with median...
   - Missing values in 'Budget (Crores)' imputed with median: 25.0
   - Missing values in 'BoxOffice (Crores)' imputed with median: 47.5
   - Missing values in 'Rating' imputed with median: 7.1
   - Missing values in 'Duration (minutes)' imputed with median: 156.0
   - Missing values in 'ReleaseYear' imputed with median: 2019.0


### 3.3 Convert Data Types

In [8]:
# Cast the year and duration columns to 64-bit integers (run after imputation,
# since a column that still holds NaN cannot be cast to an integer dtype).
print("\n3. Converting data types...")
# Columns expected to hold whole numbers
int_columns = ('ReleaseYear', 'Duration (minutes)')
try:
    for int_col in int_columns:
        # Skip columns that are absent or already int64
        if int_col in df.columns and df[int_col].dtype != 'int64':
            # Cast to 'int64' explicitly so the result always matches the
            # guard above; plain `int` can resolve to a narrower type on
            # some platforms.
            # NOTE(review): a fractional value (e.g. a median of 156.5) would
            # lose its fractional part in this cast — confirm that is acceptable.
            df[int_col] = df[int_col].astype('int64')
            print(f"   - '{int_col}' converted to integer.")
except Exception as e:
    # The first failure aborts the remaining conversions and is reported here
    print(f"   - Error during type conversion: {e}")


3. Converting data types...
   - 'ReleaseYear' converted to integer.
   - 'Duration (minutes)' converted to integer.


### 3.4 Handle Duplicates

In [13]:
# Collapse rows that share the same 'Title' and report how many were dropped.
print("\n4. Handling duplicate rows based on title...")
initial_rows = len(df)

# Keep one row per distinct 'Title' value (drop_duplicates keeps the first
# occurrence by default)
df = df.drop_duplicates(subset=['Title'])
rows_after_duplicates = len(df)
duplicates_removed = initial_rows - rows_after_duplicates

if duplicates_removed <= 0:
    print("   - No duplicate rows found based on title.")
else:
    print(f"   - Removed {duplicates_removed} duplicate row(s) based on title.")


4. Handling duplicate rows based on title...
   - Removed 1 duplicate row(s) based on title.


## 4. Verification

In [14]:
# Post-cleaning checks: remaining NaNs, schema, sample rows, and final shape.
print("\n--- Cleaned Data ---")

# Per-column count of values that are still missing
print("\nMissing values after cleaning:")
remaining_missing = df.isna().sum()
print(remaining_missing)

# info() writes its summary itself, so it is called directly
print("\nCleaned DataFrame Info:")
df.info()

# Preview the leading rows of the cleaned frame
print("\nFirst 5 rows (Cleaned):")
cleaned_preview = df.head()
print(cleaned_preview)

# (rows, columns) of the cleaned frame
print("\nShape of the cleaned DataFrame (rows, columns):")
cleaned_shape = df.shape
print(cleaned_shape)


--- Cleaned Data ---

Missing values after cleaning:
MovieID               0
Title                 0
Director              0
Genre                 0
ReleaseYear           0
Budget (Crores)       0
BoxOffice (Crores)    0
Rating                0
Duration (minutes)    0
LeadActor             0
LeadActress           0
Language              0
ProductionCompany     0
dtype: int64

Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 17 entries, 0 to 17
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MovieID             17 non-null     object 
 1   Title               17 non-null     object 
 2   Director            17 non-null     object 
 3   Genre               17 non-null     object 
 4   ReleaseYear         17 non-null     int64  
 5   Budget (Crores)     17 non-null     float64
 6   BoxOffice (Crores)  17 non-null     float64
 7   Rating              17 non-null     float64
 8   Dur

## 5. Save Cleaned Data 

In [15]:
# Write the cleaned frame to CSV at the absolute path below; index=False
# leaves the row index out of the file.
cleaned_csv_path = '/home/nineleaps/Documents/Python_Training/Final_test/movies_data/cleaned_data_csv/cleaned_indian_movies_simple_detailed.csv'
df.to_csv(cleaned_csv_path, index=False)