In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Read the IMDb India movies CSV, decoding its bytes as Latin-1.
data = pd.read_csv(
    filepath_or_buffer='/content/IMDb Movies India.csv',
    encoding='latin1',
)
# Show the loaded frame as the cell output.
data

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [None]:
# Give the columns used later on lowercase names.
# Only 'Actor 1' is renamed (to 'actors'); 'Actor 2' and 'Actor 3' are left untouched.
data = data.rename(
    columns={
        'Director': 'director',
        'Actor 1': 'actors',
        'Genre': 'genre',
        'Rating': 'rating',
    }
)

In [None]:
# Data preprocessing
# Discard, in place, every row of `data` whose 'rating' value is missing.
data.dropna(axis='index', how='any', subset=['rating'], inplace=True)

In [None]:
# Make sure the 'rating' column is stored as 64-bit floats, then display it.
data['rating'] = data['rating'].astype('float64')
data['rating']


1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15501    5.3
15503    5.8
15504    4.6
15505    4.5
15508    6.2
Name: rating, Length: 7919, dtype: float64

In [None]:
# Turn the 'director' values into integer class labels and show the result.
label_encoder = LabelEncoder()
director_codes = label_encoder.fit_transform(data['director'])
data['director'] = director_codes
data['director']

1         811
3        1749
5        2005
6        2643
8         174
         ... 
15501     500
15503    2339
15504    1418
15505    1338
15508    1171
Name: director, Length: 7919, dtype: int64

In [None]:
# Encode the 'actors' column (the renamed 'Actor 1' column) as integer labels.
# A dedicated encoder is fitted for this column: calling fit_transform again on
# an encoder that was already fitted for another column replaces its classes_,
# after which that other column's integer labels can no longer be mapped back
# to names with inverse_transform.
# NOTE(review): any missing values in this column are passed to the encoder
# as-is, exactly as before — confirm that is the intended handling.
actors_encoder = LabelEncoder()
data['actors'] = actors_encoder.fit_transform(data['actors'])
data['actors']

1        1782
3        1589
5         508
6         931
8        2520
         ... 
15501     631
15503     550
15504    1405
15505     145
15508     631
Name: actors, Length: 7919, dtype: int64

In [None]:
# Encode the 'genre' strings as integer labels.
# A dedicated encoder is fitted for this column: calling fit_transform again on
# an encoder that was already fitted for another column replaces its classes_,
# after which that other column's integer labels can no longer be mapped back
# to names with inverse_transform.
# NOTE(review): any missing values in this column are passed to the encoder
# as-is, exactly as before — confirm that is the intended handling.
genre_encoder = LabelEncoder()
data['genre'] = genre_encoder.fit_transform(data['genre'])
data['genre']

1        268
3        207
5        177
6        331
8        367
        ... 
15501     28
15503     28
15504      0
15505     38
15508     38
Name: genre, Length: 7919, dtype: int64

In [None]:
# Keep only rows whose rating lies inside the closed interval [0, 10];
# anything outside that range is treated as invalid and removed.
data = data[data['rating'].between(0, 10)]
data

Unnamed: 0,Name,Year,Duration,genre,rating,Votes,director,actors,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,268,7.0,8,811,1782,Vivek Ghamande,Arvind Jangid
3,#Yaaram,(2019),110 min,207,4.4,35,1749,1589,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,(1997),147 min,177,4.7,827,2005,508,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,331,7.4,1086,2643,931,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,(2012),82 min,367,5.6,326,174,2520,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...,...,...,...,...
15501,Zulm Ki Hukumat,(1992),,28,5.3,135,500,631,Moushumi Chatterjee,Govinda
15503,Zulm Ki Zanjeer,(1989),125 min,28,5.8,44,2339,550,Jayamalini,Rajinikanth
15504,Zulm Ko Jala Doonga,(1988),,0,4.6,11,1418,1405,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,38,4.5,655,1338,145,Twinkle Khanna,Aruna Irani


In [None]:

# Feature matrix (X): the three label-encoded columns used as model inputs.
X = data.loc[:, ['director', 'actors', 'genre']]
X

Unnamed: 0,director,actors,genre
1,811,1782,268
3,1749,1589,207
5,2005,508,177
6,2643,931,331
8,174,2520,367
...,...,...,...
15501,500,631,28
15503,2339,550,28
15504,1418,1405,0
15505,1338,145,38


In [None]:
# Regression target (y): the 'rating' column.
y = data.loc[:, 'rating']
y

1        7.0
3        4.4
5        4.7
6        7.4
8        5.6
        ... 
15501    5.3
15503    5.8
15504    4.6
15505    4.5
15508    6.2
Name: rating, Length: 7919, dtype: float64

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an XGBoost regressor, constructed with its default settings, on the training split.
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict ratings for the held-out rows and display the resulting array.
y_pred = model.predict(X=X_test)
y_pred

array([4.475114 , 6.6206975, 4.9649763, ..., 4.603895 , 5.8613634,
       6.2016597], dtype=float32)

In [None]:
# Score the predictions against the true test ratings with mean squared error (MSE).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.6838765008494592
