In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the raw movie listing; the CSV is decoded as ISO-8859-1.
data = pd.read_csv(
    filepath_or_buffer="IMDb Movies India.csv",
    encoding='iso-8859-1',
)

In [3]:
data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
data.tail()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11.0,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,
15508,Zulm-O-Sitam,(1998),130 min,"Action, Drama",6.2,20.0,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja


In [5]:
data.shape

(15509, 10)

In [6]:
data.dtypes

Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [7]:
data.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [8]:
# Keep only rows where the cast, year, genre, director and rating are all present.
required_columns = ["Actor 1", "Actor 2", "Actor 3", "Year", "Genre", "Director", "Rating"]
data = data.dropna(subset=required_columns)

In [9]:
data.isna().sum()

Name           0
Year           0
Duration    1899
Genre          0
Rating         0
Votes          0
Director       0
Actor 1        0
Actor 2        0
Actor 3        0
dtype: int64

In [10]:
# Pull the first run of digits out of "Duration" (e.g. "109 min" -> 109.0).
# The result is stored as float rather than int because rows with a missing
# duration come out as NaN, which an integer column cannot hold.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
data["dur"] = data["Duration"].str.extract(r'(\d+)', expand=False).astype(float)

In [11]:
data["dur"].describe()

count    5659.000000
mean      133.439124
std        25.319939
min        21.000000
25%       119.000000
50%       135.000000
75%       150.000000
max       321.000000
Name: dur, dtype: float64

In [12]:
# Discard films running longer than 180 minutes. Rows whose runtime is NaN do
# not satisfy the comparison and are therefore kept.
too_long = data["dur"] > 180
data.drop(index=data.index[too_long.to_numpy()], inplace=True)

In [13]:
# Discard films running shorter than 60 minutes. Rows whose runtime is NaN do
# not satisfy the comparison and are therefore kept.
too_short = data["dur"] < 60
data.drop(index=data.index[too_short.to_numpy()], inplace=True)

In [14]:
data.shape

(7418, 11)

In [15]:
# The raw "Duration" text column is removed; the numeric "dur" column stays.
data.drop(columns=["Duration"], inplace=True)

In [16]:
data.head()

Unnamed: 0,Name,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,dur
1,#Gadhvi (He thought he was Gandhi),(2019),Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,109.0
3,#Yaaram,(2019),"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,110.0
5,...Aur Pyaar Ho Gaya,(1997),"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0
6,...Yahaan,(2005),"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,142.0
8,?: A Question Mark,(2012),"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,82.0


In [17]:
# Impute missing runtimes with the mean runtime of films from the same "Year".
# groupby(...).transform("mean") broadcasts each year's mean back onto the
# original rows, which avoids a Python-level loop that rescans the whole frame
# once per distinct year. Years in which no runtime is known keep NaN here.
year_mean_dur = data.groupby("Year")["dur"].transform("mean")
data["dur"] = data["dur"].fillna(year_mean_dur)

In [18]:
# Fill any runtimes that are still missing with the overall mean runtime.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data["dur"]: the in-place call on a column selection
# is chained assignment, which emits a FutureWarning on pandas 2.x and does not
# update the frame at all under Copy-on-Write.
data["dur"] = data["dur"].fillna(data["dur"].mean())

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7418 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      7418 non-null   object 
 1   Year      7418 non-null   object 
 2   Genre     7418 non-null   object 
 3   Rating    7418 non-null   float64
 4   Votes     7418 non-null   object 
 5   Director  7418 non-null   object 
 6   Actor 1   7418 non-null   object 
 7   Actor 2   7418 non-null   object 
 8   Actor 3   7418 non-null   object 
 9   dur       7418 non-null   float64
dtypes: float64(2), object(8)
memory usage: 637.5+ KB


In [20]:
# "Votes" is stored as text with thousands separators; strip the commas first,
# then cast the cleaned strings to float64.
votes_without_commas = data["Votes"].str.replace(',', '')
data["Votes"] = votes_without_commas.astype('float64')


In [21]:
data.dtypes

Name         object
Year         object
Genre        object
Rating      float64
Votes       float64
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dur         float64
dtype: object

In [22]:
data.isna().sum()

Name        0
Year        0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dur         0
dtype: int64

In [23]:
# Number of rows in which each name appears in the "Actor 1" column.
lead_column = data["Actor 1"]
actor1 = lead_column.value_counts()

In [24]:
# Names with fewer than 10 appearances in "Actor 1".
actor1_others = actor1.loc[actor1.lt(10)]

In [25]:
actor1_others

Actor 1
Jaya Bachchan            9
Victor Banerjee          9
Shreyas Talpade          9
Nandamuri Balakrishna    9
Bipasha Basu             9
                        ..
Diana Penty              1
Milind Ukey              1
Dhiraj Verma             1
Maya Basu                1
Meghan Jadhav            1
Name: count, Length: 2266, dtype: int64

In [26]:
# Collapse every infrequent "Actor 1" name into a single "other" category.
# The names are the index labels of actor1_others, so membership is tested
# against that index.
rare_actor1_names = set(actor1_others.index)
data["Actor 1"] = data["Actor 1"].map(
    lambda name: name if name not in rare_actor1_names else "other"
)

In [27]:
data[data["Actor 1"] == "other"]

Unnamed: 0,Name,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,dur
1,#Gadhvi (He thought he was Gandhi),(2019),Drama,7.0,8.0,Gaurav Bakshi,other,Vivek Ghamande,Arvind Jangid,109.000000
3,#Yaaram,(2019),"Comedy, Romance",4.4,35.0,Ovais Khan,other,Ishita Raj,Siddhant Kapoor,110.000000
8,?: A Question Mark,(2012),"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,other,Muntazir Ahmad,Kiran Bhatia,82.000000
9,@Andheri,(2014),"Action, Crime, Thriller",4.0,11.0,Biju Bhaskar Nair,other,Fathima Babu,Byon,116.000000
11,1:13:7 Ek Tera Saath,(2016),Horror,5.9,59.0,Arshad Siddiqui,other,Anubhav Dhir,Hritu Dudani,120.000000
...,...,...,...,...,...,...,...,...,...,...
15489,Zor Lagaa Ke... Haiya!,(2009),"Comedy, Drama, Family",6.1,59.0,Girish Girija Joshi,other,Mithun Chakraborty,Riya Sen,126.026316
15491,Zordaar,(1996),Action,4.7,29.0,Ajay Kashyap,other,Vikas Anand,Bob Christo,142.357143
15493,Zubaan,(2015),Drama,6.1,408.0,Mozez Singh,other,Sarah Jane Dias,Raaghavv Chanana,115.000000
15494,Zubeidaa,(2001),"Biography, Drama, History",6.2,1496.0,Shyam Benegal,other,Rekha,Manoj Bajpayee,153.000000


In [28]:
data["Actor 1"].value_counts()

Actor 1
other                 3721
Jeetendra              138
Dharmendra             131
Mithun Chakraborty     130
Ashok Kumar            123
                      ... 
Sharman Joshi           10
Randeep Hooda           10
Aditya Pancholi         10
Smita Patil             10
Abhi Bhattacharya       10
Name: count, Length: 125, dtype: int64

In [29]:

# Appearance counts for the "Actor 2" and "Actor 3" columns.
actor2, actor3 = (data[column].value_counts() for column in ("Actor 2", "Actor 3"))

In [30]:

# Names with fewer than 10 appearances in "Actor 2" and "Actor 3" respectively.
actor2_others = actor2.loc[actor2.lt(10)]
actor3_others = actor3.loc[actor3.lt(10)]

In [31]:

# Collapse every infrequent "Actor 3" name into a single "other" category.
# The names are the index labels of actor3_others, so membership is tested
# against that index.
rare_actor3_names = set(actor3_others.index)
data["Actor 3"] = data["Actor 3"].map(
    lambda name: name if name not in rare_actor3_names else "other"
)


In [32]:
# Collapse every infrequent "Actor 2" name into a single "other" category.
# The names are the index labels of actor2_others, so membership is tested
# against that index.
rare_actor2_names = set(actor2_others.index)
data["Actor 2"] = data["Actor 2"].map(
    lambda name: name if name not in rare_actor2_names else "other"
)

In [33]:
data["Actor 2"].value_counts()

Actor 2
other                 4437
Rekha                   78
Hema Malini             68
Mithun Chakraborty      56
Dharmendra              52
                      ... 
Preity Zinta            10
Kamini Kaushal          10
Bindiya Goswami         10
Deepti Naval            10
Aditya Pancholi         10
Name: count, Length: 149, dtype: int64

In [34]:
data.head()

Unnamed: 0,Name,Year,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,dur
1,#Gadhvi (He thought he was Gandhi),(2019),Drama,7.0,8.0,Gaurav Bakshi,other,other,other,109.0
3,#Yaaram,(2019),"Comedy, Romance",4.4,35.0,Ovais Khan,other,other,other,110.0
5,...Aur Pyaar Ho Gaya,(1997),"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0
6,...Yahaan,(2005),"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,other,other,142.0
8,?: A Question Mark,(2012),"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,other,other,other,82.0


In [35]:
data["Genre"].unique()

array(['Drama', 'Comedy, Romance', 'Comedy, Drama, Musical',
       'Drama, Romance, War', 'Horror, Mystery, Thriller',
       'Action, Crime, Thriller', 'Horror', 'Horror, Romance, Thriller',
       'Comedy, Drama, Romance', 'Comedy, Drama', 'Crime, Drama, Mystery',
       'Horror, Thriller', 'Comedy, Horror', 'Drama, Horror, Mystery',
       'Action, Thriller', 'Action', 'Drama, History',
       'Horror, Mystery, Romance', 'Horror, Mystery',
       'Drama, Horror, Romance', 'Action, Drama, History',
       'Action, Drama, War', 'Thriller', 'Comedy', 'Comedy, Family',
       'Adventure, Horror, Mystery', 'Action, Sci-Fi',
       'Crime, Mystery, Thriller', 'Sport', 'Horror, Romance',
       'Crime, Drama', 'Drama, Romance', 'Adventure, Drama',
       'Comedy, Mystery, Thriller', 'Action, Crime, Drama',
       'Crime, Thriller', 'Horror, Sci-Fi, Thriller',
       'Drama, Mystery, Thriller', 'Drama, Sport',
       'Drama, Family, Musical', 'Action, Comedy', 'Comedy, Thriller',
       'A

In [36]:
from collections import Counter

# Each "Genre" cell is a ', '-separated list; break every cell into its labels.
genre_list = data["Genre"].map(lambda entry: entry.split(', ')).tolist()

# Flatten the per-film label lists into one long list and tally each label.
flat_list = []
for labels in genre_list:
    flat_list.extend(labels)
genre_counts = Counter(flat_list)

# Names of the 10 most frequent labels, most frequent first.
top_genres = [label for label, _ in genre_counts.most_common(10)]

In [37]:
top_genres

['Drama',
 'Action',
 'Romance',
 'Comedy',
 'Crime',
 'Thriller',
 'Family',
 'Musical',
 'Adventure',
 'Mystery']

In [38]:
def one_hot_encode_genres(row, top_genres):
    """Build 0/1 indicator flags for one ', '-separated genre string.

    Parameters
    ----------
    row : str
        Genre labels joined by ', ' (for example "Comedy, Romance").
    top_genres : list
        Labels to produce indicators for; also used as the result's index.

    Returns
    -------
    pd.Series
        One entry per label in ``top_genres``: 1 if that label occurs in
        ``row``, otherwise 0.
    """
    present = set(row.split(', '))
    flags = [int(label in present) for label in top_genres]
    return pd.Series(flags, index=top_genres)


In [39]:
# Encode every film's genre string; each returned Series becomes one row, so
# the result is a DataFrame with one indicator column per entry of top_genres.
one_hot_df = data['Genre'].apply(one_hot_encode_genres, args=(top_genres,))

In [40]:
one_hot_df.head()

Unnamed: 0,Drama,Action,Romance,Comedy,Crime,Thriller,Family,Musical,Adventure,Mystery
1,1,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0
5,1,0,0,1,0,0,0,1,0,0
6,1,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,0,1


In [41]:
# Place the genre indicator columns next to the existing columns, aligned on the index.
result_data = pd.concat(objs=[data, one_hot_df], axis="columns")

In [42]:
# The raw "Genre" text column is removed from the feature table.
result_data.drop(columns=["Genre"], inplace=True)

In [43]:
# The film title column is removed from the feature table.
result_data.drop(columns=["Name"], inplace=True)

In [44]:
result_data.head()

Unnamed: 0,Year,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,dur,Drama,Action,Romance,Comedy,Crime,Thriller,Family,Musical,Adventure,Mystery
1,(2019),7.0,8.0,Gaurav Bakshi,other,other,other,109.0,1,0,0,0,0,0,0,0,0,0
3,(2019),4.4,35.0,Ovais Khan,other,other,other,110.0,0,0,1,1,0,0,0,0,0,0
5,(1997),4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0,1,0,0,1,0,0,0,1,0,0
6,(2005),7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,other,other,142.0,1,0,1,0,0,0,0,0,0,0
8,(2012),5.6,326.0,Allyson Patel,other,other,other,82.0,0,0,0,0,0,1,0,0,0,1


In [45]:
# Renumber the rows 0..n-1, discarding the old (gapped) index labels.
result_data.reset_index(drop=True, inplace=True)

# Show the renumbered frame.
result_data


Unnamed: 0,Year,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,dur,Drama,Action,Romance,Comedy,Crime,Thriller,Family,Musical,Adventure,Mystery
0,(2019),7.0,8.0,Gaurav Bakshi,other,other,other,109.000000,1,0,0,0,0,0,0,0,0,0
1,(2019),4.4,35.0,Ovais Khan,other,other,other,110.000000,0,0,1,1,0,0,0,0,0,0
2,(1997),4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.000000,1,0,0,1,0,0,0,1,0,0
3,(2005),7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,other,other,142.000000,1,0,1,0,0,0,0,0,0,0
4,(2012),5.6,326.0,Allyson Patel,other,other,other,82.000000,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7413,(1992),5.3,135.0,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda,146.560976,1,1,0,0,1,0,0,0,0,0
7414,(1989),5.8,44.0,S.P. Muthuraman,Chiranjeevi,other,other,125.000000,1,1,0,0,1,0,0,0,0,0
7415,(1988),4.6,11.0,Mahendra Shah,Naseeruddin Shah,other,other,144.230769,0,1,0,0,0,0,0,0,0,0
7416,(1999),4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,129.000000,1,1,0,0,0,0,0,0,0,0


In [46]:
# The vote count is removed from the feature table.
result_data.drop(columns=["Votes"], inplace=True)

In [47]:
result_data.head()

Unnamed: 0,Year,Rating,Director,Actor 1,Actor 2,Actor 3,dur,Drama,Action,Romance,Comedy,Crime,Thriller,Family,Musical,Adventure,Mystery
0,(2019),7.0,Gaurav Bakshi,other,other,other,109.0,1,0,0,0,0,0,0,0,0,0
1,(2019),4.4,Ovais Khan,other,other,other,110.0,0,0,1,1,0,0,0,0,0,0
2,(1997),4.7,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0,1,0,0,1,0,0,0,1,0,0
3,(2005),7.4,Shoojit Sircar,Jimmy Sheirgill,other,other,142.0,1,0,1,0,0,0,0,0,0,0
4,(2012),5.6,Allyson Patel,other,other,other,82.0,0,0,0,0,0,1,0,0,0,1


In [48]:
# Number of rows in which each name appears in the "Director" column.
director_column = result_data["Director"]
directors_data = director_column.value_counts()

In [49]:
# Directors credited on fewer than 5 rows.
directors_with_less_than_5_movies = directors_data.loc[directors_data.lt(5)]

In [50]:
directors_with_less_than_5_movies

Director
Anil Mattoo             4
Kumar Shahani           4
H.M. Mirza              4
Neeraj Pandey           4
Milind Ukey             4
                       ..
Sanjeev Kumar Rajput    1
Gautam Bhatia           1
Vinil Mathew            1
Rajesh Bajaj            1
Mozez Singh             1
Name: count, Length: 2573, dtype: int64

In [51]:
directors_data

Director
Mahesh Bhatt            45
David Dhawan            43
Hrishikesh Mukherjee    42
Shakti Samanta          38
Kanti Shah              37
                        ..
Sanjeev Kumar Rajput     1
Gautam Bhatia            1
Vinil Mathew             1
Rajesh Bajaj             1
Mozez Singh              1
Name: count, Length: 2926, dtype: int64

In [52]:
# Collapse every infrequent director into a single "other" category. The names
# are the index labels of directors_with_less_than_5_movies, so membership is
# tested against that index.
rare_director_names = set(directors_with_less_than_5_movies.index)
result_data["Director"] = result_data["Director"].map(
    lambda name: name if name not in rare_director_names else "other"
)

In [53]:
result_data["Director"].value_counts()

Director
other                      3712
Mahesh Bhatt                 45
David Dhawan                 43
Hrishikesh Mukherjee         42
Shakti Samanta               38
                           ... 
V.M. Vyas                     5
Jambulingam                   5
Sriram Raghavan               5
Apoorva Lakhia                5
Singeetam Srinivasa Rao       5
Name: count, Length: 354, dtype: int64

In [54]:
result_data.head()

Unnamed: 0,Year,Rating,Director,Actor 1,Actor 2,Actor 3,dur,Drama,Action,Romance,Comedy,Crime,Thriller,Family,Musical,Adventure,Mystery
0,(2019),7.0,other,other,other,other,109.0,1,0,0,0,0,0,0,0,0,0
1,(2019),4.4,other,other,other,other,110.0,0,0,1,1,0,0,0,0,0,0
2,(1997),4.7,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,147.0,1,0,0,1,0,0,0,1,0,0
3,(2005),7.4,Shoojit Sircar,Jimmy Sheirgill,other,other,142.0,1,0,1,0,0,0,0,0,0,0
4,(2012),5.6,other,other,other,other,82.0,0,0,0,0,0,1,0,0,0,1


In [55]:
# Target is the "Rating" column; every other column is a feature.
y = result_data.loc[:, "Rating"]
X = result_data.drop(columns=["Rating"])

In [56]:
# Hold out 20% of the rows for evaluation; the seed fixes which rows those are.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [57]:
# Text columns are one-hot encoded (categories unseen during fit are ignored at
# transform time), "dur" is standardised, and all remaining columns pass
# through unchanged.
categorical_columns = ["Director", "Year", "Actor 1", "Actor 2", "Actor 3"]
numeric_columns = ["dur"]

column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    (StandardScaler(), numeric_columns),
    remainder="passthrough",
)

In [58]:
# Random-forest regressor with a fixed seed so repeated runs build the same forest.
# NOTE(review): the pipeline cell constructs its own RandomForestRegressor, so
# this standalone instance does not appear to be used in the visible cells.
model = RandomForestRegressor(random_state=42)

In [59]:
# Full modelling pipeline: encode/scale the features, reduce the resulting
# matrix to 200 components with truncated SVD, then fit a random forest.
# Both stochastic steps are seeded (same seed value as the train/test split) so
# that re-running the notebook reproduces the same fitted model and metrics.
pipe = Pipeline(
    [
        ("preprocessor", column_trans),
        ("dim_reduction", TruncatedSVD(n_components=200, random_state=42)),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

In [60]:
pipe

In [61]:
# Fit every pipeline step on the training split only.
pipe.fit(X=X_train, y=y_train)

In [62]:
# Predict ratings for the held-out rows.
y_pred = pipe.predict(X_test)

# Score the predictions: mean absolute error first, then mean squared error.
mae, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)


In [63]:
# Report both test-set metrics, one per line.
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    sep="\n",
)

Mean Absolute Error: 0.9573692692957687
Mean Squared Error: 1.5352030088843054


In [64]:
# Baseline: always predict the average rating seen in the training split.
mean_rating = np.mean(y_train)

# One constant prediction per test row, with the same dtype as y_test.
y_pred_baseline = np.full(
    shape=y_test.shape,
    fill_value=mean_rating,
    dtype=y_test.dtype,
)

# Score the constant predictor with the same metrics used for the model.
baseline_mae = mean_absolute_error(y_test, y_pred_baseline)
baseline_mse = mean_squared_error(y_test, y_pred_baseline)

print(f"Baseline Mean Squared Error: {baseline_mse}")
print(f"Baseline Mean Absolute Error: {baseline_mae}")


Baseline Mean Squared Error: 1.8456178297959491
Baseline Mean Absolute Error: 1.0972498698622857
