In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset with a specified encoding
df = pd.read_csv('/content/IMDb Movies India.csv', encoding='ISO-8859-1')

# Display the first few rows of the dataframe
print(df.head())
print(df.columns)

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [10]:
# Handle missing values

# Impute missing ratings with the mean rating
imputer = SimpleImputer(strategy='mean')
y = imputer.fit_transform(df['Rating'].values.reshape(-1, 1))[:, 0]

# Fill missing values in categorical features with 'Unknown'
df['Genre'].fillna('Unknown', inplace=True)
df['Director'].fillna('Unknown', inplace=True)
df['Actor 1'].fillna('Unknown', inplace=True)
df['Actor 2'].fillna('Unknown', inplace=True)
df['Actor 3'].fillna('Unknown', inplace=True)

# Simplify Genre, Director, and Actors columns
df['Genre'] = df['Genre'].apply(lambda x: x.split(',')[0])
df['Director'] = df['Director'].apply(lambda x: x.split(',')[0])
df['Actors'] = df['Actor 1'] + ',' + df['Actor 2'] + ',' + df['Actor 3']

# Define features and target variable
X = df[['Genre', 'Director', 'Actors']]

# Preprocessing pipeline for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('genre', OneHotEncoder(handle_unknown='ignore'), ['Genre']),
        ('director', OneHotEncoder(handle_unknown='ignore'), ['Director']),
        ('actors', OneHotEncoder(handle_unknown='ignore'), ['Actors']),
    ])

In [11]:
# Model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

Mean Absolute Error (MAE): 0.6029329854044595
Mean Squared Error (MSE): 0.9419704356253604
Root Mean Squared Error (RMSE): 0.97055161409652
R-squared (R2): 0.02130641030007685
