<a href="https://colab.research.google.com/github/rasiq-gulzar/Encryptix/blob/main/movie_rating_model_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
df=pd.read_csv('/content/IMDb Movies India.csv',encoding='latin1')

In [4]:
df.shape

(15509, 10)

In [5]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [8]:
# Assuming df is your dataframe with the movie data
# Let's handle missing values first
df = df.drop(0)  # Drop the first row which seems to be problematic
df = df.reset_index(drop=True)

# Convert Year to numeric by extracting the year from the parentheses
df['Year'] = df['Year'].str.extract(r'\((\d{4})\)').astype(float)

# Convert Duration to numeric by extracting the minutes
df['Duration'] = df['Duration'].str.extract(r'(\d+)').astype(float)

# Convert Rating to float
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Convert Votes to numeric
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Fill missing values for Rating with the mean
df['Rating'].fillna(df['Rating'].mean(), inplace=True)

# Define features and target
X = df.drop('Rating', axis=1)
y = df['Rating']

# Identify categorical and numerical columns
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numerical_cols = ['Year', 'Duration', 'Votes']

# Check for missing values in numerical columns and handle them
for col in numerical_cols:
    if X[col].isnull().sum() > 0:
        X[col].fillna(X[col].mean(), inplace=True)

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

# Feature importance
feature_names = (
    numerical_cols +
    list(model.named_steps['preprocessor']
         .named_transformers_['cat']
         .named_steps['onehot']
         .get_feature_names_out(categorical_cols))
)

importances = model.named_steps['regressor'].feature_importances_
indices = np.argsort(importances)[::-1]

print("\nFeature Ranking:")
for i in range(min(10, len(feature_names))):
    try:
        print(f"{i+1}. {feature_names[indices[i]]} ({importances[indices[i]]:.4f})")
    except IndexError:
        # If there's an index error, we've run out of features
        break

# Function to predict rating for a new movie
def predict_movie_rating(new_movie):
    """
    Predict the rating for a new movie.

    Parameters:
    new_movie (dict): Dictionary with movie details

    Returns:
    float: Predicted rating
    """
    # Convert the dictionary to a DataFrame
    new_movie_df = pd.DataFrame([new_movie])

    # Make prediction
    predicted_rating = model.predict(new_movie_df)[0]

    return predicted_rating



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df['Rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(X[col].mean(), inplace=True)


Mean Squared Error: 0.6564
Mean Absolute Error: 0.4493
R² Score: 0.3326

Feature Ranking:
1. Votes (0.1567)
2. Year (0.1271)
3. Duration (0.0918)
4. Genre_Drama (0.0181)
5. Actor 3_Pran (0.0172)
6. Genre_Documentary (0.0111)
7. Genre_Action, Crime, Drama (0.0038)
8. Actor 2_Hema Malini (0.0038)
9. Genre_Horror (0.0037)
10. Genre_Comedy (0.0032)


In [9]:
new_movie = {
    'Year': 2023,
    'Duration': 120,
    'Genre': 'Action',
    'Votes': 0,
    'Director': 'Christopher Nolan',
    'Actor 1': 'Tom Hardy',
    'Actor 2': 'Cillian Murphy',
    'Actor 3': 'Florence Pugh'
}
predicted_rating = predict_movie_rating(new_movie)
print(f"Predicted Rating: {predicted_rating:.2f}")

Predicted Rating: 6.81
