In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [None]:
# Load the raw movie table. The file is read as latin-1 rather than the default UTF-8.
DATA_PATH = '/content/IMDb Movies India.csv'
df_movie = pd.read_csv(DATA_PATH, encoding='latin-1')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df_movie.head(n=5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df_movie.shape

(15509, 10)

In [None]:
# Summary statistics for the numeric columns. Only Year and Rating are numeric at this
# point; Duration and Votes are still stored as text and are converted further down.
df_movie.describe()

Unnamed: 0,Year,Rating
count,14981.0,7919.0
mean,-1987.012215,5.841621
std,25.416689,1.381777
min,-2022.0,1.1
25%,-2009.0,4.9
50%,-1991.0,6.0
75%,-1968.0,6.8
max,-1913.0,10.0


In [None]:
# Count missing values per column before any cleaning.
df_movie.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [None]:
# Column dtypes and non-null counts.
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  float64
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(2), object(8)
memory usage: 1.2+ MB


In [None]:
# Number of rows that are exact duplicates (all columns equal) of an earlier row.
df_movie.duplicated().sum()

6

In [None]:
# Keep only rows in which every column is populated.
df_movie = df_movie.dropna()

In [None]:
# Shape after removing rows with missing values.
df_movie.shape

(5659, 10)

In [None]:
# Confirm that no missing values remain in any column.
df_movie.isna().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [None]:
# Remove rows that exactly repeat an earlier row.
df_movie = df_movie.drop_duplicates()

In [None]:
# Shape after de-duplication.
df_movie.shape

(5659, 10)

In [None]:
# Column labels available for the cleaning steps below.
df_movie.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [None]:
# Duration is stored as text such as "109 min"; remove the unit and parse the number.
duration_text = df_movie['Duration'].str.replace('min','')
df_movie['Duration'] = pd.to_numeric(duration_text)

In [None]:
# Turn the comma-separated Genre text into one row per (movie, genre).
# explode() keeps the original index label on every generated row, so rows that belong
# to the same movie can still be recognised by their shared index label.
df_movie['Genre'] = df_movie['Genre'].str.split(',')
df_movie = df_movie.explode('Genre')
# The source text separates genres with ", " - without stripping, "Romance" and
# " Romance" would be treated as two different genres in every later groupby.
df_movie['Genre'] = df_movie['Genre'].str.strip()
# Assign the result instead of calling fillna(inplace=True) on a column selection:
# the chained in-place form is deprecated and does not update the frame under
# pandas copy-on-write.
df_movie['Genre'] = df_movie['Genre'].fillna(df_movie['Genre'].mode()[0])

In [None]:
# Votes is stored as text with thousands separators; drop the commas and parse the number.
votes_text = df_movie['Votes'].str.replace(',','')
df_movie['Votes'] = pd.to_numeric(votes_text)



In [None]:
# Distribution of release years.
# The frame holds one row per (movie, genre) after explode('Genre'); rows of the same
# movie share an index label, so keep the first row per label to count each film once.
movies_unique = df_movie[~df_movie.index.duplicated()]
# Year is stored as a negative number in this dataset (e.g. -2019.0), so plot its
# magnitude; the frame itself is left untouched.
movies_unique = movies_unique.assign(Year=movies_unique['Year'].abs())
year = px.histogram(movies_unique, x='Year', histnorm='probability density', nbins=30)
year.update_layout(title='Distribution of release year', xaxis_title='Year', yaxis_title='Probability density')
year.show()



In [None]:
# Average rating per year for the ten most frequent genres.
# Year is stored as a negative number in this dataset (e.g. -2019.0); use its magnitude
# so the time axis reads left-to-right as real calendar years.
ratings_by_year = df_movie.assign(Year=df_movie['Year'].abs())
avg_rating = ratings_by_year.groupby(['Year','Genre'])['Rating'].mean().reset_index()
top_genres = df_movie['Genre'].value_counts().head(10).index
avg_rating = avg_rating[avg_rating['Genre'].isin(top_genres)]
fig = px.line(avg_rating, x='Year', y='Rating', color='Genre')
# x carries the year and y carries the mean rating, so label each axis accordingly.
fig.update_layout(title='Average rating by top genres by year', xaxis_title='Year', yaxis_title='Average rating')
fig.show()




In [None]:
# Distribution of movie ratings.
# After explode('Genre') a film appears once per genre (all rows sharing its index
# label); keep one row per label so multi-genre films are not over-weighted.
movies_unique = df_movie[~df_movie.index.duplicated()]
rating_fig = px.histogram(movies_unique, x='Rating', histnorm='probability density', nbins=30)
rating_fig.update_layout(
    title='distribution of rating',
    title_x=0.5,
    title_pad=dict(t=20),
    title_font=dict(size=20),
    xaxis_title='Rating',
    yaxis_title='Probability density',
)
rating_fig.show()


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score



In [None]:
# The title is not used as a model feature, so remove the column.
df_movie = df_movie.drop(columns='Name')


In [None]:
genre_mean_rating = df_movie.groupby('Genre')['Rating'].transform('mean')
df_movie['Genre_mean_rating'] = genre_mean_rating
director_mean_rating= df_movie.groupby('Director')['Rating'].transform('mean')
df_movie['Director_encoded'] = director_mean_rating
actor1_mean_rating= df_movie.groupby('Actor 1')['Rating'].transform('mean')
df_movie['Actor1_encoded'] = actor1_mean_rating
actor2_mean_rating= df_movie.groupby('Actor 2')['Rating'].transform('mean')
df_movie['Actor2_encoded'] = actor2_mean_rating
actor3_mean_rating= df_movie.groupby('Actor 3')['Rating'].transform('mean')
df_movie['Actor3_encoded'] = actor3_mean_rating


In [None]:
x = df_movie[['Year','Votes','Duration','Genre_mean_rating','Director_encoded','Actor1_encoded','Actor2_encoded','Actor3_encoded']]
y = df_movie['Rating']


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Fit an ordinary least-squares model on the training split, then score the held-out rows.
Model = LinearRegression().fit(x_train, y_train)
Model_pred = Model.predict(x_test)

In [None]:
# Report regression metrics on the held-out split.
# The estimator is LinearRegression, so the heading names it as linear regression.
print('The performance evaluation of linear regression is below:')
print('mean squared error:',mean_squared_error(y_test,Model_pred))
print('mean absolute error:',mean_absolute_error(y_test,Model_pred))
print('r2 score:',r2_score(y_test,Model_pred))


The performance evaluation of logistic regression is below:
mean squared error: 0.4463977880886115
mean absolute error: 0.4921055068501125
r2 score: 0.7641906900948995


In [None]:

# Preview the feature matrix. Repeated index labels are rows of the same movie, one per genre.
x.head()

Unnamed: 0,Year,Votes,Duration,Genre_mean_rating,Director_encoded,Actor1_encoded,Actor2_encoded,Actor3_encoded
1,-2019.0,8,109,6.248697,7.0,6.85,7.0,7.0
3,-2019.0,35,110,5.838423,4.4,5.25,4.4,4.46
3,-2019.0,35,110,5.838739,4.4,5.25,4.4,4.46
5,-1997.0,827,147,5.838423,5.335135,4.793617,5.73,5.93
5,-1997.0,827,147,5.875793,5.335135,4.793617,5.73,5.93


In [None]:
# Preview the target values that line up with the feature rows.
y.head()

1    7.0
3    4.4
3    4.4
5    4.7
5    4.7
Name: Rating, dtype: float64

In [None]:
#just took an data as an example to show how the model works
# A single hand-built example (the feature values of row 3 in the feature preview above)
# to show how the fitted model is used for a prediction.
# Year must be given in the same encoding the model was trained on: the Year column in
# this dataset holds negative numbers (e.g. -2019.0), so a positive 2019 would lie about
# 4000 units outside the training range and produce a meaningless rating.
data = {'Year': [-2019], 'Votes': [35], 'Duration':[110], 'Genre_mean_rating': [5.8], 'Director_encoded': [4.4], 'Actor1_encoded':[5.25],'Actor2_encoded':[4.40], 'Actor3_encoded':[4.46] }
trail = pd.DataFrame(data)

In [None]:
# Score the example row; predict() returns one value per input row, so show the first.
rating_predicted = Model.predict(trail)
first_prediction = rating_predicted[0]
print('predicted rating:', first_prediction)

predicted rating: -0.6759121179860181
