In [2]:
#Task 2

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Load dataset with latin1 encoding
try:
    df = pd.read_csv('IMDb Movies India.csv', encoding='latin1')
except Exception as e:
    print(f"Error loading file: {e}")
    exit()

# Clean 'Year': Remove parentheses and convert to float
df['Year'] = df['Year'].str.replace(r'[()]', '', regex=True).astype(float, errors='ignore')

# Clean 'Duration': Remove ' min' and convert to float
df['Duration'] = df['Duration'].str.replace(' min', '').astype(float, errors='ignore')

# Clean 'Votes': Handle non-numeric values like '$5.16M'
def clean_votes(value):
    if pd.isna(value):
        return np.nan
    try:
        value = str(value).replace(',', '').replace('$', '').replace('M', '')
        return float(value)
    except (ValueError, TypeError):
        return np.nan

df['Votes'] = df['Votes'].apply(clean_votes)

# Impute missing values with median
df['Year'].fillna(df['Year'].median(), inplace=True)
df['Duration'].fillna(df['Duration'].median(), inplace=True)
df['Votes'].fillna(df['Votes'].median(), inplace=True)
df['Rating'].fillna(df['Rating'].median(), inplace=True)

df.drop(['Name', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], axis=1, inplace=True)

df['Genre'] = df['Genre'].str.split(', ').str[0].fillna('Unknown')

df = pd.get_dummies(df, columns=['Genre'], drop_first=True)

# Define features and target, split data
X = df.drop('Rating', axis=1)
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output results
print(f"Mean Squared Error: {mse:.3f}")
print(f"R² Score: {r2:.3f}")
print("\nFeature Importance:")
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)
print(importances.round(3))

Mean Squared Error: 0.706
R² Score: 0.274

Feature Importance:
              Feature  Importance
2               Votes       0.403
0                Year       0.250
1            Duration       0.191
8   Genre_Documentary       0.031
9         Genre_Drama       0.028
6        Genre_Comedy       0.018
7         Genre_Crime       0.011
18      Genre_Romance       0.010
13       Genre_Horror       0.010
5     Genre_Biography       0.007
21     Genre_Thriller       0.007
3     Genre_Adventure       0.007
15      Genre_Musical       0.005
22      Genre_Unknown       0.005
4     Genre_Animation       0.004
10       Genre_Family       0.004
11      Genre_Fantasy       0.003
16      Genre_Mystery       0.002
12      Genre_History       0.001
23          Genre_War       0.001
19       Genre_Sci-Fi       0.001
14        Genre_Music       0.000
20        Genre_Sport       0.000
17   Genre_Reality-TV       0.000
