In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [35]:
file_path = '/mnt/data/IMDb Movies India.csv'
data = pd.read_csv('/content/IMDb Movies India.csv', encoding='latin1')

In [36]:
data = data.dropna(subset=['Rating'])

In [37]:
data = data.dropna(subset=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'])

In [38]:
data['Year'] = data['Year'].str.extract('(\d{4})').astype(float)
data['Duration'] = data['Duration'].str.extract('(\d+)').astype(float)

In [39]:
data['Votes'] = data['Votes'].str.replace(',', '').astype(float)

In [40]:
genres = data['Genre'].str.get_dummies(sep=',')
data = data.join(genres).drop('Genre', axis=1)

In [41]:
for col in ['Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    mean_ratings = data.groupby(col)['Rating'].mean()
    data[col] = data[col].map(mean_ratings)

In [42]:
data = data.fillna(data.mean(numeric_only=True))

In [43]:
X = data.drop(['Name', 'Rating'], axis=1)
y = data['Rating']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [46]:
ridge = Ridge()
param_grid = {'alpha': [0.1, 1.0, 10.0]}
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [47]:
best_ridge = grid_search.best_estimator_

In [48]:
y_pred = best_ridge.predict(X_test)

In [49]:
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
rmse

0.6717782132138951