<a href="https://colab.research.google.com/github/sangamithraD/Movie_Rating_Prediction_AIML/blob/main/MOVIE_RATEING_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline



# Step 1: Load the dataset

df = pd.read_csv("drive/MyDrive/IMDb Movies India.csv", encoding='latin1')



# Step 2: Select relevant columns

df = df[['Year', 'Duration', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Rating']]



# Step 3: Clean the 'Year' column

df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})')

df['Year'] = pd.to_numeric(df['Year'], errors='coerce')



# Step 4: Clean the 'Duration' column (extract only numbers)

df['Duration'] = df['Duration'].astype(str).str.extract(r'(\d+)')

df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')



# Step 5: Drop rows with any missing values

df.dropna(inplace=True)



# Step 6: Define features and target

X = df[['Year', 'Duration', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']]

y = df['Rating']



# Step 7: Set categorical columns

categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']



# Step 8: Column transformer for encoding

preprocessor = ColumnTransformer(

    transformers=[

        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

    ],

    remainder='passthrough'  # Let numeric columns through

)



# Step 9: Create pipeline

pipeline = Pipeline(steps=[

    ('preprocessor', preprocessor),

    ('model', LinearRegression())

])



# Step 10: Train-test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Step 11: Train the model

pipeline.fit(X_train, y_train)



# Step 12: Evaluate the model

y_pred = pipeline.predict(X_test)

print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

print("R^2 Score:", r2_score(y_test, y_pred))



# Step 13: Predict rating for a new movie

print("\nEnter details of the movie to predict rating:")

year = int(input("Year of release (e.g., 2023): "))

duration = int(input("Duration in minutes (e.g., 120): "))

genre = input("Genre (e.g., Drama): ")

director = input("Director's name: ")

actor1 = input("Actor 1 name: ")

actor2 = input("Actor 2 name (or 'None'): ")

actor3 = input("Actor 3 name (or 'None'): ")



# Handle 'None' as missing values

actor2 = None if actor2.strip().lower() == 'none' else actor2

actor3 = None if actor3.strip().lower() == 'none' else actor3



# Build input DataFrame

new_movie = pd.DataFrame([{

    'Year': year,

    'Duration': duration,

    'Genre': genre,

    'Director': director,

    'Actor 1': actor1,

    'Actor 2': actor2,

    'Actor 3': actor3

}])



# Predict rating

predicted_rating = pipeline.predict(new_movie)[0]

print(f"\nPredicted Movie Rating: {predicted_rating:.2f}")

FileNotFoundError: [Errno 2] No such file or directory: 'drive/MyDrive/IMDb Movies India.csv'