<a href="https://colab.research.google.com/github/priyanandini0512/movie-rating-predictor/blob/main/movie_rating_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("IMDb Movies India.csv", encoding="latin1")

# Data Cleaning and Preprocessing
## Extract numeric values
# Raw strings keep "\d" as a regex token instead of an invalid string escape;
# expand=False makes str.extract return a Series rather than a one-column frame.
df["Year"] = df["Year"].str.extract(r"(\d{4})", expand=False).astype(float)
df["Duration"] = df["Duration"].str.extract(r"(\d+)", expand=False).astype(float)
df["Votes"] = df["Votes"].str.replace(r"[^\d]", "", regex=True).astype(float)

## Fill missing values
df["Year"] = df["Year"].fillna(df["Year"].mode()[0])
df["Director"] = df["Director"].fillna("Unknown")
# Durations are imputed with the median of films from the same year; a year
# whose durations are all missing stays NaN and is dropped further down.
df["Duration"] = df.groupby("Year")["Duration"].transform(lambda x: x.fillna(x.median()))
df["Genre"] = df["Genre"].fillna("Unknown")
df["Actor 1"] = df["Actor 1"].fillna("Unknown")
df["Actor 2"] = df["Actor 2"].fillna("Unknown")
df["Actor 3"] = df["Actor 3"].fillna("Unknown")

# Ratings are the target, so they are never imputed: rows without a rating
# are removed by the dropna() below.
df["Rating"] = df["Rating"].astype(float)

# Feature Engineering
## One-Hot Encoding for Genre
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
genre_encoded = ohe.fit_transform(df[["Genre"]])
# Reuse df's index so the column-wise concat aligns row-for-row by construction.
genre_df = pd.DataFrame(
    genre_encoded,
    columns=ohe.get_feature_names_out(["Genre"]),
    index=df.index,
)
df = pd.concat([df, genre_df], axis=1)

# Selecting Features and Target Variables
features = ["Year", "Duration", "Votes", "Director_Success"] + list(genre_df.columns)
# "Director" is carried through the split so that Director_Success can be
# derived from the training rows only (see below).
base_columns = ["Year", "Duration", "Votes", "Director"] + list(genre_df.columns)
df = df[base_columns + ["Rating"]].dropna()

X = df[base_columns]
y = df["Rating"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

## Director Success Rate
# Computed AFTER the split and from training ratings only. Averaging ratings
# over the full dataset would fold each test row's own target into its
# feature (target leakage) and make the reported error look better than it is.
director_avg_rating = y_train.groupby(X_train["Director"]).mean()
global_avg_rating = y_train.mean()
X_train["Director_Success"] = X_train["Director"].map(director_avg_rating)
# Directors that never appear in the training set fall back to the overall
# training mean instead of producing NaN.
X_test["Director_Success"] = (
    X_test["Director"].map(director_avg_rating).fillna(global_avg_rating)
)

# Keep only the model inputs, in the order given by `features`.
X_train = X_train[features]
X_test = X_test[features]

# Train the Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the Model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# NOTE: re-run this cell to refresh the recorded output; any figure produced
# with the previous full-dataset director encoding is not comparable.
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 0.8145896803467627
