In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

# Load the dataset
file_path = '/content/IMDb_Movies_India.csv'
movie_data = pd.read_csv(file_path, encoding='latin1')

In [5]:
movie_data.head()

Unnamed: 0,Year,Duration,Rating,Votes,Action,Adventure,Animation,Biography,Comedy,Crime,...,Sci-Fi,Sport,Thriller,Unknown,War,Western,Director_Freq,Actor 1_Freq,Actor 2_Freq,Actor 3_Freq
1,2019.0,109.0,7.0,8.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,1,1
3,2019.0,110.0,4.4,35.0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,5,1,2
5,1997.0,147.0,4.7,827.0,0,0,0,0,1,0,...,0,0,0,0,0,0,17,18,15,13
6,2005.0,142.0,7.4,1086.0,0,0,0,0,0,0,...,0,0,0,0,1,0,7,25,4,8
8,2012.0,82.0,5.6,326.0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,6,1


In [2]:
# Initial data cleaning
movie_data.columns = movie_data.columns.str.strip()
movie_data['Year'] = movie_data['Year'].str.extract(r'(\d{4})').astype(float)
movie_data['Duration'] = movie_data['Duration'].str.extract(r'(\d+)').astype(float)
movie_data['Votes'] = movie_data['Votes'].astype(str)
movie_data['Votes'] = movie_data['Votes'].str.replace(',', '').apply(
    lambda x: float(x) if x.replace('.', '', 1).isdigit() else np.nan
)
movie_data = movie_data.dropna(subset=['Rating', 'Votes'])
movie_data['Duration'] = movie_data['Duration'].fillna(movie_data['Duration'].median())

categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
movie_data[categorical_columns] = movie_data[categorical_columns].fillna('Unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data['Duration'] = movie_data['Duration'].fillna(movie_data['Duration'].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data[categorical_columns] = movie_data[categorical_columns].fillna('Unknown')


In [3]:
# Feature Encoding
genres_split = movie_data['Genre'].str.get_dummies(sep=', ')
movie_data = pd.concat([movie_data, genres_split], axis=1)

for col in ['Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    freq_encoding = movie_data[col].value_counts().to_dict()
    movie_data[col + '_Freq'] = movie_data[col].map(freq_encoding)

movie_data = movie_data.drop(['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Name'], axis=1)

In [6]:
movie_data.head()

Unnamed: 0,Year,Duration,Rating,Votes,Action,Adventure,Animation,Biography,Comedy,Crime,...,Sci-Fi,Sport,Thriller,Unknown,War,Western,Director_Freq,Actor 1_Freq,Actor 2_Freq,Actor 3_Freq
1,2019.0,109.0,7.0,8.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,1,1
3,2019.0,110.0,4.4,35.0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,5,1,2
5,1997.0,147.0,4.7,827.0,0,0,0,0,1,0,...,0,0,0,0,0,0,17,18,15,13
6,2005.0,142.0,7.4,1086.0,0,0,0,0,0,0,...,0,0,0,0,1,0,7,25,4,8
8,2012.0,82.0,5.6,326.0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,6,1


In [7]:
# Handle remaining missing values in frequency-encoded columns
freq_columns = ['Director_Freq', 'Actor 1_Freq', 'Actor 2_Freq', 'Actor 3_Freq']
movie_data[freq_columns] = movie_data[freq_columns].fillna(0)

In [8]:
# Split data into training and testing sets
X = movie_data.drop(['Rating'], axis=1)
y = movie_data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Train a Linear Regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

In [11]:
# Predict and evaluate the Linear Regression model
y_pred = linear_reg_model.predict(X_test)
# Calculate MSE first
mse = mean_squared_error(y_test, y_pred)
# Then calculate RMSE
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [13]:
# Feature Optimization using SelectKBest
selector = SelectKBest(score_func=f_regression, k=15)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
selected_features = X_train.columns[selector.get_support()]

optimized_linear_reg_model = LinearRegression()
optimized_linear_reg_model.fit(X_train_selected, y_train)
y_pred_optimized = optimized_linear_reg_model.predict(X_test_selected)

# Calculate MSE first, then RMSE
mse_optimized = mean_squared_error(y_test, y_pred_optimized)
rmse_optimized = np.sqrt(mse_optimized)  # Calculate RMSE from MSE
# rmse_optimized = mean_squared_error(y_test, y_pred_optimized, squared=False) # This line caused the error
r2_optimized = r2_score(y_test, y_pred_optimized)

In [17]:
import numpy as np
random_movie_index = np.random.choice(X_test.index)
random_movie_features = X_test.loc[random_movie_index].to_frame().T
predicted_rating = linear_reg_model.predict(random_movie_features)[0]

actual_rating = y_test.loc[random_movie_index]

print(f"Predicted Rating: {predicted_rating}")
print(f"Actual Rating: {actual_rating}")


Predicted Rating: 5.352550556021519
Actual Rating: 5.5


In [18]:
# Results
print("Linear Regression Model")
print("RMSE:", rmse)
print("R^2:", r2)
print("\nOptimized Linear Regression Model")
print("RMSE:", rmse_optimized)
print("R^2:", r2_optimized)
print("\nRandom Movie Prediction")
print("Predicted Rating:", predicted_rating)
print("Actual Rating:", actual_rating)

Linear Regression Model
RMSE: 1.227771761573215
R^2: 0.18918460664366554

Optimized Linear Regression Model
RMSE: 1.2361001940239462
R^2: 0.17814717298619986

Random Movie Prediction
Predicted Rating: 5.352550556021519
Actual Rating: 5.5
