**Imports**

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

**Load Dataset**

In [4]:
# Load the dataset with encoding to prevent UnicodeDecodeError
df = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')

# Display basic info and head (optional for debugging)
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Ro

**Data Cleaning**

In [5]:
# Make a copy for cleaning
df_clean = df.copy()

# Remove rows where Rating is missing (target variable)
df_clean = df_clean[df_clean['Rating'].notna()]

# Extract numeric year from 'Year' column
df_clean['Year'] = df_clean['Year'].str.extract(r'(\d{4})')
df_clean['Year'] = pd.to_numeric(df_clean['Year'], errors='coerce')

# Extract numeric duration (in minutes)
df_clean['Duration'] = df_clean['Duration'].str.extract(r'(\d+)')
df_clean['Duration'] = pd.to_numeric(df_clean['Duration'], errors='coerce')

# Convert Votes to numeric (remove commas)
df_clean['Votes'] = df_clean['Votes'].astype(str).str.replace(',', '', regex=False)
df_clean['Votes'] = pd.to_numeric(df_clean['Votes'], errors='coerce')

# Drop rows with important missing values
df_clean = df_clean.dropna(subset=['Genre', 'Director', 'Year', 'Duration', 'Votes'])


**Feature Engineering**

In [6]:
# Feature: Director average rating
director_avg_rating = df_clean.groupby('Director')['Rating'].mean().to_dict()
df_clean['Director_Avg_Rating'] = df_clean['Director'].map(director_avg_rating)

# Feature: Genre average rating
genre_avg_rating = df_clean.groupby('Genre')['Rating'].mean().to_dict()
df_clean['Genre_Avg_Rating'] = df_clean['Genre'].map(genre_avg_rating)

# Frequency encode categorical variables
for col in ['Director', 'Genre', 'Actor 1', 'Actor 2', 'Actor 3']:
    freq_encoding = df_clean[col].value_counts().to_dict()
    df_clean[col + '_Freq'] = df_clean[col].map(freq_encoding)


**Define Features and Target**

In [7]:
# Final feature set
features = [
    'Year', 'Duration', 'Votes',
    'Director_Avg_Rating', 'Genre_Avg_Rating',
    'Director_Freq', 'Genre_Freq',
    'Actor 1_Freq', 'Actor 2_Freq', 'Actor 3_Freq'
]

# Define X and y
X = df_clean[features].fillna(0)
y = df_clean['Rating']


**Train-Test Split**

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


**Train Model**

In [10]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


**Evaluate Model**

In [11]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print("MAE :", round(mae, 3))
print("RMSE:", round(rmse, 3))
print("R²   :", round(r2, 3))


Model Performance:
MAE : 0.497
RMSE: 0.754
R²   : 0.697
