In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/indian_movie_data.csv', encoding='latin-1')

In [4]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,-1988.0,,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,-1999.0,129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,-2005.0,,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,-1988.0,,Action,,,,,,


#EDA

We will visualize our data with more interactive and hovering kind of chart using library *plotly express*

In [5]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
df.describe()

Unnamed: 0,Year,Rating
count,14981.0,7919.0
mean,-1987.012215,5.841621
std,25.416689,1.381777
min,-2022.0,1.1
25%,-2009.0,4.9
50%,-1991.0,6.0
75%,-1968.0,6.8
max,-1913.0,10.0


In [7]:
fig = px.histogram(df,
                   x='Rating',
                   marginal='box',
                   nbins=25,
                   title='Distribution of Ratings')
fig.update_layout(bargap=0.1)
fig.show()

The distribution seems like a normal curve, with 6-7 being the highest frequency for ratings

####There is not much to visualize right now, but lets clean our data and visualize it then.

# Data cleaning / pre processing

In [8]:
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

* We observe that there are about 8200 nan values for the 'Duration' column, this might significantly hamper the exploratory data analysis as well as training of our model even we use multiple imputation strategies.
* Similar is the case with 'Votes' column, wherein there are 7500 null values.
* We might have to come up with strategies to train our model on the basis of other non-numeric features present in the dataset.
--------------------------------------------------------------------------------
* **Most importantly**, our target variable itself has about 7500 nan values.
* In this situation, we are left with 2 options.
1.) Drop the entire rows containing nan values for the target variable.

OR

2.) Impute the class values to the mean, median or mode, etc.



### Approach 1: Dropping the nan rows

In [9]:
raw_df = df.dropna(subset=['Year','Genre','Rating','Votes','Duration','Director','Actor 1','Actor 2','Actor 3']).reset_index(drop=True)

In [10]:
raw_df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
1,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
2,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
3,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
4,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...,...,...,...,...
5654,Zubaan,-2015.0,115 min,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana
5655,Zubeidaa,-2001.0,153 min,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee
5656,Zulm Ki Zanjeer,-1989.0,125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
5657,Zulmi,-1999.0,129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [11]:
raw_df.dtypes

Name         object
Year        float64
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [12]:
raw_df['Votes'] = raw_df['Votes'].str.replace(',','')
raw_df['Votes'] = raw_df['Votes'].astype(int)

###Label encoding

In [13]:
raw_df['Duration'] = raw_df['Duration'].astype('category')
raw_df['Duration_cat'] = raw_df['Duration'].cat.codes
raw_df.head(10)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_cat
0,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,9
1,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,10
2,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,47
3,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,42
4,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,156
5,@Andheri,-2014.0,116 min,"Action, Crime, Thriller",4.0,11,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon,16
6,1:1.6 An Ode to Lost Love,-2004.0,96 min,Drama,6.2,17,Madhu Ambat,Rati Agnihotri,Gulshan Grover,Atul Kulkarni,170
7,1:13:7 Ek Tera Saath,-2016.0,120 min,Horror,5.9,59,Arshad Siddiqui,Pankaj Berry,Anubhav Dhir,Hritu Dudani,20
8,100 Days,-1991.0,161 min,"Horror, Romance, Thriller",6.5,983,Partho Ghosh,Jackie Shroff,Madhuri Dixit,Javed Jaffrey,61
9,100% Love,-2012.0,166 min,"Comedy, Drama, Romance",5.7,512,Rabi Kinagi,Jeet,Koyel Mallick,Sujoy Ghosh,66


In [14]:
raw_df['Genre'] = raw_df['Genre'].astype('category')
raw_df['Genre_cat'] = raw_df['Genre'].cat.codes
raw_df.head(10)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_cat,Genre_cat
0,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,9,229
1,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,10,184
2,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,47,157
3,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,42,289
4,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,156,320
5,@Andheri,-2014.0,116 min,"Action, Crime, Thriller",4.0,11,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon,16,37
6,1:1.6 An Ode to Lost Love,-2004.0,96 min,Drama,6.2,17,Madhu Ambat,Rati Agnihotri,Gulshan Grover,Atul Kulkarni,170,229
7,1:13:7 Ek Tera Saath,-2016.0,120 min,Horror,5.9,59,Arshad Siddiqui,Pankaj Berry,Anubhav Dhir,Hritu Dudani,20,316
8,100 Days,-1991.0,161 min,"Horror, Romance, Thriller",6.5,983,Partho Ghosh,Jackie Shroff,Madhuri Dixit,Javed Jaffrey,61,323
9,100% Love,-2012.0,166 min,"Comedy, Drama, Romance",5.7,512,Rabi Kinagi,Jeet,Koyel Mallick,Sujoy Ghosh,66,159


In [15]:
raw_df['Director'] = raw_df['Director'].astype('category')
raw_df['Director_cat'] = raw_df['Director'].cat.codes
raw_df.head(10)
raw_df['Actor 1'] = raw_df['Actor 1'].astype('category')
raw_df['Actor1_cat'] = raw_df['Actor 1'].cat.codes
raw_df.head(10)
raw_df['Actor 2'] = raw_df['Actor 2'].astype('category')
raw_df['Actor2_cat'] = raw_df['Actor 2'].cat.codes
raw_df.head(10)
raw_df['Actor 3'] = raw_df['Actor 3'].astype('category')
raw_df['Actor3_cat'] = raw_df['Actor 3'].cat.codes
raw_df.head(10)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_cat,Genre_cat,Director_cat,Actor1_cat,Actor2_cat,Actor3_cat
0,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,9,229,629,1352,2272,319
1,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,10,184,1335,1198,719,2148
2,...Aur Pyaar Ho Gaya,-1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,47,157,1530,378,75,2045
3,...Yahaan,-2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,42,289,2044,692,1112,2524
4,?: A Question Mark,-2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,156,320,135,1934,1175,1013
5,@Andheri,-2014.0,116 min,"Action, Crime, Thriller",4.0,11,Biju Bhaskar Nair,Augustine,Fathima Babu,Byon,16,37,401,302,567,482
6,1:1.6 An Ode to Lost Love,-2004.0,96 min,Drama,6.2,17,Madhu Ambat,Rati Agnihotri,Gulshan Grover,Atul Kulkarni,170,229,1083,1354,625,363
7,1:13:7 Ek Tera Saath,-2016.0,120 min,Horror,5.9,59,Arshad Siddiqui,Pankaj Berry,Anubhav Dhir,Hritu Dudani,20,316,266,1138,216,789
8,100 Days,-1991.0,161 min,"Horror, Romance, Thriller",6.5,983,Partho Ghosh,Jackie Shroff,Madhuri Dixit,Javed Jaffrey,61,323,1374,643,990,862
9,100% Love,-2012.0,166 min,"Comedy, Drama, Romance",5.7,512,Rabi Kinagi,Jeet,Koyel Mallick,Sujoy Ghosh,66,159,1506,681,902,2243


In [16]:
raw_df.describe()

Unnamed: 0,Year,Rating,Votes,Duration_cat,Genre_cat,Director_cat,Actor1_cat,Actor2_cat,Actor3_cat
count,5659.0,5659.0,5659.0,5659.0,5659.0,5659.0,5659.0,5659.0,5659.0
mean,-1996.24757,5.898533,2697.649585,49.706485,171.728221,1211.864994,966.963598,1182.761442,1276.103375
std,19.741839,1.381165,13651.503584,40.810725,108.20877,686.698545,564.890824,666.834013,720.046925
min,-2021.0,1.1,5.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-2013.0,5.0,30.0,25.0,47.0,637.0,468.0,617.0,652.5
50%,-2002.0,6.1,131.0,39.0,184.0,1198.0,971.0,1198.0,1306.0
75%,-1983.0,6.9,922.5,58.0,248.0,1793.0,1472.0,1767.5,1868.5
max,-1931.0,10.0,591417.0,173.0,375.0,2430.0,1959.0,2320.0,2555.0


In [17]:
raw_df.dtypes

Name              object
Year             float64
Duration        category
Genre           category
Rating           float64
Votes              int64
Director        category
Actor 1         category
Actor 2         category
Actor 3         category
Duration_cat       int16
Genre_cat          int16
Director_cat       int16
Actor1_cat         int16
Actor2_cat         int16
Actor3_cat         int16
dtype: object

In [18]:
raw_df['Year'] = raw_df['Year'].apply(lambda x: x*-1)

### EDA on reduced dataset

In [19]:
fig = px.histogram(raw_df,
                   x='Rating',
                   marginal='box',
                   nbins=25,
                   title='Distribution of Ratings')
fig.update_layout(bargap=0.1)
fig.show()

We see that the ratings is still a normal curve

In [20]:
import plotly.express as px
years=raw_df.groupby("Year")["Rating"].mean().reset_index()
px.scatter(years,x="Year", y="Rating").show()

This indicates that users were in liking of the movies which were released in the 1940-1960 era which declined uptil 2000. Later the liking has gone up from 2000 to 2020.

In [21]:
px.scatter(raw_df,x="Director", y="Rating").show()

In [22]:
px.scatter(raw_df,x="Actor 1", y="Rating").show()

In [23]:
px.scatter(raw_df,x="Actor 2", y="Rating").show()

In [24]:
px.scatter(raw_df,x="Actor 3", y="Rating").show()


In [25]:
px.scatter(raw_df,x="Genre", y="Rating").show()

In [26]:
fig = px.scatter_3d(raw_df, x='Rating', y='Year', z= 'Genre')
fig.update_traces(marker_size=3, marker_opacity=0.5)
fig.show()

This graph gives us an idea that which genre was most liked or most criticized during which era

In [27]:
raw_df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3,Duration_cat,Genre_cat,Director_cat,Actor1_cat,Actor2_cat,Actor3_cat
0,#Gadhvi (He thought he was Gandhi),2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid,9,229,629,1352,2272,319
1,#Yaaram,2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor,10,184,1335,1198,719,2148
2,...Aur Pyaar Ho Gaya,1997.0,147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor,47,157,1530,378,75,2045
3,...Yahaan,2005.0,142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma,42,289,2044,692,1112,2524
4,?: A Question Mark,2012.0,82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia,156,320,135,1934,1175,1013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,Zubaan,2015.0,115 min,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana,15,229,1223,1861,1801,1615
5655,Zubeidaa,2001.0,153 min,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee,53,133,2059,763,1619,1184
5656,Zulm Ki Zanjeer,1989.0,125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth,25,28,1793,406,754,1685
5657,Zulmi,1999.0,129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani,29,38,1025,112,2164,314


In [28]:
dataset = raw_df[['Year',	'Duration_cat',	'Genre_cat',	'Rating'	,'Votes',	'Director_cat',	'Actor1_cat',	'Actor2_cat',	'Actor3_cat']]

In [29]:
dataset

Unnamed: 0,Year,Duration_cat,Genre_cat,Rating,Votes,Director_cat,Actor1_cat,Actor2_cat,Actor3_cat
0,2019.0,9,229,7.0,8,629,1352,2272,319
1,2019.0,10,184,4.4,35,1335,1198,719,2148
2,1997.0,47,157,4.7,827,1530,378,75,2045
3,2005.0,42,289,7.4,1086,2044,692,1112,2524
4,2012.0,156,320,5.6,326,135,1934,1175,1013
...,...,...,...,...,...,...,...,...,...
5654,2015.0,15,229,6.1,408,1223,1861,1801,1615
5655,2001.0,53,133,6.2,1496,2059,763,1619,1184
5656,1989.0,25,28,5.8,44,1793,406,754,1685
5657,1999.0,29,38,4.5,655,1025,112,2164,314


We are now ready for data pre processing

#Training


In [30]:
pred = dataset[dataset.columns[dataset.columns!="Rating"]]
targ = dataset["Rating"]

In [31]:
pred

Unnamed: 0,Year,Duration_cat,Genre_cat,Votes,Director_cat,Actor1_cat,Actor2_cat,Actor3_cat
0,2019.0,9,229,8,629,1352,2272,319
1,2019.0,10,184,35,1335,1198,719,2148
2,1997.0,47,157,827,1530,378,75,2045
3,2005.0,42,289,1086,2044,692,1112,2524
4,2012.0,156,320,326,135,1934,1175,1013
...,...,...,...,...,...,...,...,...
5654,2015.0,15,229,408,1223,1861,1801,1615
5655,2001.0,53,133,1496,2059,763,1619,1184
5656,1989.0,25,28,44,1793,406,754,1685
5657,1999.0,29,38,655,1025,112,2164,314


In [32]:
targ

0       7.0
1       4.4
2       4.7
3       7.4
4       5.6
       ... 
5654    6.1
5655    6.2
5656    5.8
5657    4.5
5658    6.2
Name: Rating, Length: 5659, dtype: float64

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( pred, targ, test_size=0.33, random_state=42)

Linear regression

In [34]:
from sklearn.linear_model import LinearRegression

In [35]:
model = LinearRegression()

In [36]:
inputs = dataset[dataset.columns[dataset.columns!="Rating"]]
targets = dataset["Rating"]
print('inputs.shape :', inputs.shape)
print('targets.shape :', targets.shape)

inputs.shape : (5659, 8)
targets.shape : (5659,)


In [37]:
model.fit(inputs, targets)

In [38]:
def rmse(targets, predictions):
    return np.sqrt(np.mean(np.square(targets - predictions)))

In [39]:
# Create and train the model
model = LinearRegression().fit(inputs, targets)

# Generate predictions
predictions = model.predict(inputs)

# Compute loss to evalute the model
loss = rmse(targets, predictions)
print('Loss:', loss)

Loss: 1.3120091542540553


In [40]:
model.coef_

array([-1.58363000e-02,  3.13133033e-03,  1.51862956e-03,  1.83720404e-05,
        3.76497398e-06,  5.43166207e-05,  5.92327589e-05,  8.12125640e-05])

In [41]:
model.intercept_

36.81492971971791

SVM, RF and KNN regressors

In [42]:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
# SVM Regressor
svm_reg = SVR()
svm_reg.fit(X_train, y_train)
y_pred_svm = svm_reg.predict(X_test)
rmse_svm = np.sqrt(mean_squared_error(y_test, y_pred_svm))
print("SVM RMSE:", rmse_svm)

# KNN Regressor
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
y_pred_knn = knn_reg.predict(X_test)
rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))
print("KNN RMSE:", rmse_knn)

# Random Forest Regressor
rf_reg = RandomForestRegressor()
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Random Forest RMSE:", rmse_rf)


#XGBoost
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("XGB RMSE:", rmse)


SVM RMSE: 1.3125716276012176
KNN RMSE: 1.409744536214835
Random Forest RMSE: 1.109609348999719
XGB RMSE: 1.1321677822031628


##Deep neural nets

LSTM

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


data = dataset


features = ['Year', 'Duration_cat', 'Genre_cat', 'Votes', 'Director_cat', 'Actor1_cat', 'Actor2_cat', 'Actor3_cat']
target = 'Rating'


X = data[features].values
y = data[target].values


scaler = MinMaxScaler()
X = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the input data to match LSTM input shape
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model
model = Sequential()
model.add(LSTM(128, input_shape=(1, X_train.shape[2])))
model.add(Dense(1))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=16, verbose=1)

# Evaluate the model
mse = model.evaluate(X_test, y_test, verbose=0)
print('Mean Squared Error:', mse)

# Make predictions
predictions = model.predict(X_test)

# Flatten the predictions
predictions = predictions.flatten()

# Calculate the minimum and maximum values of the target variable
y_min = np.min(y)
y_max = np.max(y)

# Perform inverse normalization on the predicted values
predictions = predictions * (y_max - y_min) + y_min

# Compare actual and predicted values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison)



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

RNN

In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense

data = dataset

features = ['Year', 'Duration_cat', 'Genre_cat', 'Votes', 'Director_cat', 'Actor1_cat', 'Actor2_cat', 'Actor3_cat']
target = 'Rating'

X = data[features].values
y = data[target].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the input data to match RNN input shape
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# Build the RNN model
model = Sequential()
model.add(SimpleRNN(128, input_shape=(1, X_train.shape[2])))
model.add(Dense(1))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=16, verbose=1)

# Evaluate the model
mse = model.evaluate(X_test, y_test, verbose=0)
print('Mean Squared Error:', mse)

# Make predictions
predictions = model.predict(X_test)

# Flatten the predictions
predictions = predictions.flatten()

# Calculate the minimum and maximum values of the target variable
y_min = np.min(y)
y_max = np.max(y)

# Perform inverse normalization on the predicted values
predictions = predictions * (y_max - y_min) + y_min

# Compare actual and predicted values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

GRU

In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import GRU, Dense

data = dataset

features = ['Year', 'Duration_cat', 'Genre_cat', 'Votes', 'Director_cat', 'Actor1_cat', 'Actor2_cat', 'Actor3_cat']
target = 'Rating'

X = data[features].values
y = data[target].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the input data to match GRU input shape
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# Build the GRU model
model = Sequential()
model.add(GRU(128, input_shape=(1, X_train.shape[2])))
model.add(Dense(1))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=16, verbose=1)

# Evaluate the model
mse = model.evaluate(X_test, y_test, verbose=0)
print('Mean Squared Error:', mse)

# Make predictions
predictions = model.predict(X_test)

# Flatten the predictions
predictions = predictions.flatten()

# Calculate the minimum and maximum values of the target variable
y_min = np.min(y)
y_max = np.max(y)

# Perform inverse normalization on the predicted values
predictions = predictions * (y_max - y_min) + y_min

# Compare actual and predicted values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error: 1.6463048458099365
      Actual  Predicted
0        6.0  52.560566
1        2.4  54.690746
2        3.8  53.558342
3        3.8  53.365185
4        7.2  48.505917
...      ...        ...
1127     6.7  55.384151
1128     6.8  51.860092
1129     4.8  52.163387
1130     6.7  54.144920
1131     6.6  56.789967

[1132 rows x 2 columns]


CNN

In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

data = dataset

features = ['Year', 'Duration_cat', 'Genre_cat', 'Votes', 'Director_cat', 'Actor1_cat', 'Actor2_cat', 'Actor3_cat']
target = 'Rating'

X = data[features].values
y = data[target].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the input data to match CNN input shape
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

# Build the 1D CNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=16, verbose=1)

# Evaluate the model
mse = model.evaluate(X_test, y_test, verbose=0)
print('Mean Squared Error:', mse)

# Make predictions
predictions = model.predict(X_test)

# Flatten the predictions
predictions = predictions.flatten()

# Calculate the minimum and maximum values of the target variable
y_min = np.min(y)
y_max = np.max(y)

# Perform inverse normalization on the predicted values
predictions = predictions * (y_max - y_min) + y_min

# Compare actual and predicted values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

MLP

In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense

data = dataset

features = ['Year', 'Duration_cat', 'Genre_cat', 'Votes', 'Director_cat', 'Actor1_cat', 'Actor2_cat', 'Actor3_cat']
target = 'Rating'

X = data[features].values
y = data[target].values

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the MLP model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X_train, y_train, epochs=250, batch_size=16, verbose=1)

# Evaluate the model
mse = model.evaluate(X_test, y_test, verbose=0)
print('Mean Squared Error:', mse)

# Make predictions
predictions = model.predict(X_test)

# Flatten the predictions
predictions = predictions.flatten()

# Calculate the minimum and maximum values of the target variable
y_min = np.min(y)
y_max = np.max(y)

# Perform inverse normalization on the predicted values
predictions = predictions * (y_max - y_min) + y_min

# Compare actual and predicted values
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison)


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

We see that out of all the deep neural nets, MLP has performed the best