In [None]:
#Import libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *





> **1. Introduction**

**1.1 Background**

This dataset contains information about movies. Whether a movie is a commercial success depends on various factors such as the director, the actors, critic reviews and viewers' reactions. The IMDB score is one of the important measures of a movie's success.

**1.2 Description of dataset attributes**

Details of the dataset attributes:

1. Color :- Movie is black or coloured
2. Director_name:- Name of the movie director
3. num_critic_for_reviews :- No of critics for the movie
4. duration:- movie duration in minutes
5. director_facebook_likes:-Number of likes for the Director on his Facebook Page
6. actor_3_facebook_likes:- No of likes for the actor 3 on his/her facebook Page
7. actor_2_name:- Name of the actor 2
8. actor_1_facebook_likes:- No of likes for the actor 1 on his/her facebook Page
9. gross:- Gross earnings of the movie in Dollars
10. genres:- Film categorization like ‘Animation’, ‘Comedy’, ‘Romance’, ‘Horror’, ‘Sci-Fi’, ‘Action’, ‘Family’
11. actor_1_name:- Name of the actor 1
12. movie_title:-Title of the movie
13. num_voted_users:-No of people who voted for the movie
14. cast_total_facebook_likes:- Total Facebook likes for the cast of the movie
15. actor_3_name:- Name of the actor 3
16. facenumber_in_poster:- No of actors who featured in the movie poster
17. plot_keywords:-Keywords describing the movie plots
18. movie_imdb_link:- Link to the movie's IMDB page
19. num_user_for_reviews:- Number of users who gave a review
20. language:- Language of the movie 
21. country:- Country where movie is produced
22. content_rating:- Content rating of the movie
23. budget:- Budget of the movie in Dollars
24. title_year:- The year in which the movie is released
25. actor_2_facebook_likes:- facebook likes for the actor 2
26. imdb_score:- IMDB score of the movie
27. aspect_ratio :- Aspect ratio the movie was made in
28. movie_facebook_likes:- Total no of facebook likes for the movie
    

**1.3 Case Study**


The dataset gives extensive information about the movies and their respective IMDB scores. We are going to analyze each factor that can influence the IMDB rating so that we can predict it better. A movie with a higher IMDB score is considered more successful than a movie with a lower IMDB score.

**2. Data Preprocessing**

In [None]:
# Load the IMDB 5000 movie metadata CSV from the Kaggle input directory.
MOVIE_CSV_PATH = "/kaggle/input/imdb-5000-movie-dataset/movie_metadata.csv"

movie_df = pd.read_csv(MOVIE_CSV_PATH)

In [None]:
# Preview the first 10 rows to sanity-check that the CSV loaded as expected.

movie_df.head(10)

In [None]:
# Shape of the dataset as a (number of rows, number of columns) tuple.

movie_df.shape

In [None]:
# Data type of every column; used below to separate categorical from numerical attributes.

movie_df.dtypes

**The dataset's columns can be divided into categorical and numerical columns:**

**Categorical Columns**

Color,Director name, actor name,genres,movie_title,language,country,content_rating.

**Numerical Columns**

num_critic_for_reviews,duration,director_facebook_likes ,actor_3_facebook_likes,actor_1_facebook_likes ,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews ,budget,title_year, actor_2_facebook_likes ,imdb_score,aspect_ratio,movie_facebook_likes


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns,
# transposed so that each attribute is shown as one row.

movie_df.describe().T

In [None]:
# The IMDB link column is removed from the dataset before analysis.

movie_df.drop(columns=['movie_imdb_link'], inplace=True)

In [None]:
# Show how the movies are split between colour and black-and-white, then drop the
# column: most movies are coloured, so it carries little information.
# The counts are printed explicitly because only the last expression of a cell is
# displayed automatically, and here the last statement is the drop.

print(movie_df["color"].value_counts())

movie_df.drop('color',axis=1,inplace=True)

In [None]:
# List the columns that remain in the dataset after the drops above.
movie_df.columns

In [None]:
# Per-column flag: True if the column contains at least one missing value.

movie_df.isna().any()



In [None]:
# Number of missing values in each column.

movie_df.isna().sum()

In [None]:
# Rows are dropped when any of the columns listed below is missing. These are the
# columns with only a few nulls, so removing the affected rows loses little data.
sparse_null_columns = [
    'director_name',
    'num_critic_for_reviews',
    'duration',
    'director_facebook_likes',
    'actor_3_facebook_likes',
    'actor_2_name',
    'actor_1_facebook_likes',
    'actor_1_name',
    'actor_3_name',
    'facenumber_in_poster',
    'num_user_for_reviews',
    'language',
    'country',
    'actor_2_facebook_likes',
    'plot_keywords',
]

movie_df.dropna(axis=0, subset=sparse_null_columns, inplace=True)



In [None]:
# Shape of the dataset at this point, to see how many rows remain.
movie_df.shape

**We lost only about 6% of the data, which is acceptable.**

In [None]:
# Fill missing content ratings with "R", the most frequent rating.
# The result is assigned back to the column instead of calling
# `movie_df["content_rating"].fillna(..., inplace=True)`: in-place methods on a
# selected column are chained assignment, which is deprecated in pandas 2.x and
# leaves `movie_df` unchanged under Copy-on-Write.

movie_df["content_rating"] = movie_df["content_rating"].fillna("R")

In [None]:
# Fill missing aspect ratios with the column median (the distribution is right
# skewed, so the median is preferred over the mean).
# Assigned back to the column rather than using chained `fillna(inplace=True)`,
# which does not reliably update `movie_df` on current pandas.

movie_df["aspect_ratio"] = movie_df["aspect_ratio"].fillna(movie_df["aspect_ratio"].median())

In [None]:
# Fill missing budgets with the median budget.
# Assigned back to the column rather than using chained `fillna(inplace=True)`,
# which does not reliably update `movie_df` on current pandas.

movie_df["budget"] = movie_df["budget"].fillna(movie_df["budget"].median())


In [None]:
# Fill missing gross earnings with the median gross.
# Assigned back to the column rather than using chained `fillna(inplace=True)`,
# which does not reliably update `movie_df` on current pandas.

movie_df['gross'] = movie_df['gross'].fillna(movie_df['gross'].median())

In [None]:
# Re-count missing values per column to confirm that none remain.

movie_df.isna().sum()



**We don't have any null values in the dataset anymore.**

In [None]:
# Discard fully duplicated rows (the first occurrence of each is kept) and show the
# resulting shape.

movie_df.drop_duplicates(keep='first', inplace=True)
movie_df.shape

In [None]:
# Number of movies per language.

movie_df["language"].value_counts()

**94% of the movies are in English.**

In [None]:
# Bar chart of the number of movies per language.
# The series is passed as `x=` explicitly: seaborn's categorical plots accept only
# `data` positionally, so `sns.countplot(series)` is not read as the x variable.
plt.figure(figsize=(40,10))
sns.countplot(x=movie_df["language"])
plt.show()

In [None]:
# Almost every movie is in English, so the language column is dropped.

movie_df.drop(columns=['language'], inplace=True)

In [None]:
# Add a "Profit" column holding the net profit of each movie: gross - budget.
# (The operands were previously reversed, giving budget - gross, i.e. the loss,
# so profitable movies got negative values.)

movie_df["Profit"] = movie_df['gross'].sub(movie_df['budget'], axis=0)

movie_df.head(5)

In [None]:
# Add a "Profit_Percentage" column: Profit expressed as a percentage of gross.
# NOTE(review): the denominator is gross (a margin on revenue), not budget
# (a return on investment) — confirm this is the intended definition.

movie_df['Profit_Percentage'] = movie_df["Profit"].div(movie_df["gross"]).mul(100)
movie_df

**So we have added two new columns  profit and profit percentage made by the movies**

In [None]:
# Number of movies per country. The result is kept in `value_counts` because the
# next cell uses it to pick the most frequent countries.

value_counts=movie_df["country"].value_counts()
print(value_counts)

**We can see that most of the movies are from the USA and the UK, so the remaining countries are grouped into a single "other" category below.**

In [None]:
# Keep the two most frequent countries (value_counts is ordered by descending count)
# and relabel every other country as 'other'.
vals = value_counts.index[:2]
print (vals)
is_top_country = movie_df['country'].isin(vals)
movie_df['country'] = movie_df['country'].where(is_top_country, 'other')


In [None]:
# Confirm that the country column now holds only three categories.
movie_df["country"].value_counts()

In [None]:
# Preview the first 10 rows of the dataset at this point.
movie_df.head(10)

**3. Data Visualization**

In [None]:
# Bar chart of the number of movies released per year.

(ggplot(movie_df)         # data source for the plot
 + aes(x='title_year')    # release year on the x axis; geom_bar counts the rows per year
 + geom_bar(size=20) # bar geometry, one bar per year
)

**We can see that most of the movies were released after 1980.**

In [None]:
# Relationship between the IMDB score and the profit made by the movie.
# The data frame is passed to ggplot() and the mapping is added with `+ aes(...)`,
# the same form used by the other plotnine cells in this notebook. The previous
# `ggplot(aes(...), data=movie_df)` call fails on plotnine versions whose first
# positional parameter is `data`, because `data` is then supplied twice.

(ggplot(movie_df)
 + aes(x='imdb_score', y='Profit')
 + geom_line()
 + stat_smooth(colour='blue', span=1)
)


**We can see that there is a strong correlation between the imdb_score and the profit: movies with a high IMDB rating have made more profit.**

In [None]:
# Relationship between the IMDB score and the profit percentage.
# y is mapped to 'Profit_Percentage'; it was previously 'Profit', which made this
# plot an exact copy of the score-vs-profit plot.
# The data frame is passed to ggplot() and the mapping added with `+ aes(...)`
# because `ggplot(aes(...), data=movie_df)` fails on plotnine versions whose first
# positional parameter is `data`.

(ggplot(movie_df)
 + aes(x='imdb_score', y='Profit_Percentage')
 + geom_line()
 + stat_smooth(colour='blue', span=1)
)

**Movies with a high IMDB score have made a higher profit percentage.**

In [None]:
# Distribution of IMDB scores for each country category (USA, UK, other).
# `country` is categorical, so a box plot per category is used: a line plus a
# smoother need a continuous x axis and do not describe three discrete groups.
# The data frame is passed to ggplot() and the mapping added with `+ aes(...)`
# because `ggplot(aes(...), data=movie_df)` fails on plotnine versions whose first
# positional parameter is `data`.

(ggplot(movie_df)
 + aes(x='country', y='imdb_score')
 + geom_boxplot()
)

**Most of the movies above rating 8.75 are from USA**

In [None]:
# Relationship between the IMDB score (x) and the movie's number of Facebook likes (y).
# NOTE(review): geom_line connects the observations in x order; a scatter plot may
# show the relationship more clearly — confirm the intended chart type.

(ggplot(movie_df)
 + aes(x='imdb_score', y='movie_facebook_likes')
 + geom_line()
 + labs(title='IMDB_Score vs. Facebook like for Movies', x='IMDB scores', y='Facebook Likes for movies')
)

**Movies with a high IMDB rating have the most Facebook likes.**

In [None]:
# Top 20 movies by profit: profit on the x axis, budget on the y axis, one hue per title.
# x and y are passed by keyword because seaborn's categorical plots accept only
# `data` positionally; positional x/y raise a TypeError on current seaborn.
# Note: this cell re-sorts `movie_df` itself (descending by Profit).

plt.figure(figsize=(10,8))
movie_df= movie_df.sort_values(by ='Profit' , ascending=False)
movie_df_new=movie_df.head(20)
ax=sns.pointplot(x=movie_df_new['Profit'], y=movie_df_new['budget'], hue=movie_df_new['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Top 20 movies by profit percentage: percentage on the x axis, budget on the y axis,
# one hue per title.
# x and y are passed by keyword because seaborn's categorical plots accept only
# `data` positionally; positional x/y raise a TypeError on current seaborn.
# Note: this cell re-sorts `movie_df` itself (descending by Profit_Percentage).
plt.figure(figsize=(10,8))
movie_df= movie_df.sort_values(by ='Profit_Percentage' , ascending=False)
movie_df_new=movie_df.head(20)
ax=sns.pointplot(x=movie_df_new['Profit_Percentage'], y=movie_df_new['budget'], hue=movie_df_new['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Directors of the 20 highest-rated movies: director on the x axis, IMDB score on the
# y axis, one hue per title.
# x and y are passed by keyword because seaborn's categorical plots accept only
# `data` positionally; positional x/y raise a TypeError on current seaborn.
# Note: this cell re-sorts `movie_df` itself (descending by imdb_score).
plt.figure(figsize=(10,8))

movie_df= movie_df.sort_values(by ='imdb_score' , ascending=False)
movie_df_new=movie_df.head(20)
ax=sns.pointplot(x=movie_df_new['director_name'], y=movie_df_new['imdb_score'], hue=movie_df_new['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Commercial success vs critical acclaim for the 20 movies with the highest profit
# percentage, coloured by content rating.
# `gross` is stored in dollars, so it is divided by 1e6 in the mapping: the axis is
# labelled "in million dollars" and the reference line (600) and annotation (700)
# are on that scale. Plotting raw dollars put them at the very bottom of the chart.
# Note: this cell re-sorts `movie_df` itself (descending by Profit_Percentage).
movie_df= movie_df.sort_values(by ='Profit_Percentage' , ascending=False)
movie_df_new=movie_df.head(20)
(ggplot(movie_df_new)
 + aes(x='imdb_score', y='gross / 1e6',color = "content_rating")
 + geom_point()
 +  geom_hline(aes(yintercept = 600)) + 
  geom_vline(aes(xintercept = 10)) + 
  xlab("Imdb score") + 
  ylab("Gross money earned in million dollars") + 
  ggtitle("Commercial success Vs Critical acclaim") +
  annotate("text", x = 8.5, y = 700, label = "High ratings \n & High gross"))

**Movies with High content rating were not commercial success**

In [None]:
# Lead actors of the 20 movies with the highest profit percentage: actor on the x
# axis, profit percentage on the y axis, one hue per title.
# x and y are passed by keyword because seaborn's categorical plots accept only
# `data` positionally; positional x/y raise a TypeError on current seaborn.
# Note: this cell re-sorts `movie_df` itself (descending by Profit_Percentage).

plt.figure(figsize=(10,8))

movie_df= movie_df.sort_values(by ='Profit_Percentage' , ascending=False)
movie_df_new=movie_df.head(20)
ax=sns.pointplot(x=movie_df_new['actor_1_name'], y=movie_df_new['Profit_Percentage'], hue=movie_df_new['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Lead actors of the 20 highest-rated movies: actor on the x axis, IMDB score on the
# y axis, one hue per title.
# x and y are passed by keyword because seaborn's categorical plots accept only
# `data` positionally; positional x/y raise a TypeError on current seaborn.
# Note: this cell re-sorts `movie_df` itself (descending by imdb_score).

plt.figure(figsize=(10,8))

movie_df= movie_df.sort_values(by ='imdb_score' , ascending=False)
movie_df_new=movie_df.head(20)
ax=sns.pointplot(x=movie_df_new['actor_1_name'], y=movie_df_new['imdb_score'], hue=movie_df_new['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Country category of the 20 highest-rated movies: country on the x axis, IMDB score
# on the y axis, one hue per title.
# x and y are passed by keyword because seaborn's categorical plots accept only
# `data` positionally; positional x/y raise a TypeError on current seaborn.
# Note: this cell re-sorts `movie_df` itself (descending by imdb_score).

plt.figure(figsize=(10,8))

movie_df= movie_df.sort_values(by ='imdb_score' , ascending=False)
movie_df_new=movie_df.head(20)
ax=sns.pointplot(x=movie_df_new['country'], y=movie_df_new['imdb_score'], hue=movie_df_new['movie_title'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

**4.Data Preparation for the models**

**4.1 Removing the Columns with names** 

In [None]:
# Director names are free-text identifiers, so the column is removed before modelling.

movie_df.drop(columns=['director_name'], inplace=True)


In [None]:
# Remove the actor name columns, starting with actor 1 (actors 2 and 3 follow below).

movie_df.drop(columns=['actor_1_name'], inplace=True)

In [None]:
# Remove the actor 2 name column.
movie_df.drop(columns=['actor_2_name'], inplace=True)

In [None]:
# Remove the actor 3 name column.
movie_df.drop(columns=['actor_3_name'], inplace=True)

In [None]:
# Remove the movie title column.

movie_df.drop(columns=['movie_title'], inplace=True)

In [None]:
# Remove the plot keywords column.
movie_df.drop(columns=['plot_keywords'], inplace=True)

In [None]:
# Number of movies for each distinct value of the genres column.

movie_df['genres'].value_counts()

In [None]:
# The genre values are spread fairly evenly across many distinct values, so the
# column is removed.

movie_df.drop(columns=['genres'], inplace=True)

**4.2 Remove the linearly dependent variables**

In [None]:
# Drop the Profit column: it is derived from gross and budget, which stay in the dataset.
movie_df.drop(columns=['Profit'], inplace=True)

In [None]:
# Drop the Profit_Percentage column: it is derived from Profit and gross.

movie_df.drop(columns=['Profit_Percentage'], inplace=True)

**4.3 Remove the correlated variables**

In [None]:
# Correlation heat map of the numerical columns.
# `movie_df` still contains the string columns `country` and `content_rating`, and
# DataFrame.corr() raises on non-numeric data in current pandas, so the frame is
# restricted to numeric columns first.
# matplotlib and seaborn are already imported at the top of the notebook.
corr = movie_df.select_dtypes(include="number").corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# create a mask so we only see the correlation values once
# (cells strictly above the diagonal are hidden)
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)

We can see that cast_total_facebook_likes and actor_1_facebook_likes are highly correlated with each other. The actor 2 and actor 3 likes are also somewhat correlated with the total. So we want to reduce them to two variables: actor_1_facebook_likes and other_actors_facebook_likes.

There are high correlations among num_voted_users, num_user_for_reviews and num_critic_for_reviews. We want to keep num_voted_users and take the ratio of num_user_for_reviews and num_critic_for_reviews.

In [None]:
# Combine the Facebook likes of actor 2 and actor 3 into a single column.
# NOTE(review): the column name is spelled 'facebbok'; later cells refer to it by
# that exact name, so it must be renamed everywhere or not at all.
movie_df['Other_actor_facebbok_likes'] = movie_df["actor_2_facebook_likes"].add(movie_df['actor_3_facebook_likes'])



In [None]:
# The actor 2 and actor 3 like counts have been summed into one column, so the
# individual columns are dropped (actor 2 here, actor 3 in the next cell).

movie_df.drop(columns=['actor_2_facebook_likes'], inplace=True)


In [None]:

# Drop the actor 3 Facebook likes column.
movie_df.drop(columns=['actor_3_facebook_likes'], inplace=True)

In [None]:
# Drop the cast-total Facebook likes column.
movie_df.drop(columns=['cast_total_facebook_likes'], inplace=True)

In [None]:
# critic_review_ratio = number of critic reviews divided by number of user reviews.

movie_df['critic_review_ratio'] = movie_df['num_critic_for_reviews'].div(movie_df['num_user_for_reviews'])

In [None]:
# Both review-count columns are now represented by critic_review_ratio, so they are dropped.

movie_df.drop(columns=['num_critic_for_reviews', 'num_user_for_reviews'], inplace=True)

In [None]:
# Correlation heat map of the numerical columns after the feature changes above.
# `movie_df` still contains the string columns `country` and `content_rating`, and
# DataFrame.corr() raises on non-numeric data in current pandas, so the frame is
# restricted to numeric columns first.
# matplotlib and seaborn are already imported at the top of the notebook.

corr = movie_df.select_dtypes(include="number").corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# create a mask so we only see the correlation values once
# (cells strictly above the diagonal are hidden)
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr,mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)

Now we can see that none of the attributes are strongly correlated with each other: all correlations are below 0.7.

In [None]:
# Bucket the IMDB score into four classes: (0, 4] -> 1 (bad), (4, 6] -> 2 (average),
# (6, 8] -> 3 (good), (8, 10] -> 4 (excellent). pd.cut with labels=False returns the
# zero-based bin index, so 1 is added to obtain classes 1..4.
score_bin_edges = [0, 4, 6, 8, 10]

movie_df["imdb_binned_score"] = pd.cut(movie_df['imdb_score'], bins=score_bin_edges, right=True, labels=False).add(1)




In [None]:
# The raw score is replaced by imdb_binned_score, so the original column is dropped.
movie_df.drop(columns=['imdb_score'], inplace=True)

In [None]:
# Preview the first 5 rows of the dataset at this point.
movie_df.head(5)

**5. Handling the categorical data**

In [None]:
# One-hot encode the two remaining categorical columns. drop_first=True omits the
# first level of each column, so k categories become k-1 indicator columns.
categorical_columns = ['country', 'content_rating']
movie_df = pd.get_dummies(
    data=movie_df,
    columns=categorical_columns,
    prefix=categorical_columns,
    drop_first=True,
)



In [None]:
# List the columns after one-hot encoding.
movie_df.columns

**6. Splitting the data into training and test data**

In [None]:

# Build the feature matrix X and the target y, then hold out 30% of the rows for testing.
# X is every column except the target. The previous hard-coded list of column names
# (including each one-hot dummy) would silently create an all-NaN column whenever a
# listed category was absent from the data, which breaks scaling and model fitting.
# y stays a single-column DataFrame because the model cells flatten it with np.ravel.
from sklearn.model_selection import train_test_split

TARGET_COLUMN = 'imdb_binned_score'
X = movie_df.drop(columns=[TARGET_COLUMN])
y = movie_df[[TARGET_COLUMN]]
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3,random_state=100)

**7.Feature scaling**

In [None]:
# Standardise the features. The scaler is fitted on the training split only and then
# applied to both splits, so no information from the test split is used for fitting.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

**8. Classification Model Selection**

**8.1 Logistic Regression**

In [None]:
# Logistic regression with default settings; predictions are made on the test split.
# y_train is flattened to a 1-D array, the form scikit-learn expects for a target.

from sklearn.linear_model import LogisticRegression

logit = LogisticRegression()
train_labels = np.ravel(y_train)
logit.fit(X_train, train_labels)
y_pred = logit.predict(X_test)

In [None]:
# Confusion matrix and accuracy of the logistic regression predictions on the test split.
# `metrics` is imported here and reused by the model cells that follow.

from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

**8.2 KNN**

In [None]:
# k-nearest-neighbours classifier with k = 22, evaluated on the test split.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=22)
train_labels = np.ravel(y_train)
knn.fit(X_train, train_labels)
knnpred = knn.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, knnpred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, knnpred))

**8.3 SVC**

In [None]:
# Support vector classifier with a sigmoid kernel, evaluated on the test split.
from sklearn.svm import SVC

svc = SVC(kernel='sigmoid')
train_labels = np.ravel(y_train)
svc.fit(X_train, train_labels)
svcpred = svc.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, svcpred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, svcpred))

**8.4 Naive Bayes**

In [None]:
# Gaussian naive Bayes classifier, evaluated on the test split.

from sklearn.naive_bayes import GaussianNB

gaussiannb = GaussianNB()
train_labels = np.ravel(y_train)
gaussiannb.fit(X_train, train_labels)
gaussiannbpred = gaussiannb.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, gaussiannbpred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, gaussiannbpred))

**8.5 Decision Tree**



In [None]:
# Decision tree classifier (Gini impurity), evaluated on the test split.
# random_state is fixed (same seed as the train/test split) so that the tree, and
# therefore the reported accuracy, is reproducible across runs.

from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='gini', random_state=100) #criterion = entropy, gini
dtree.fit(X_train, np.ravel(y_train,order='C'))
dtreepred = dtree.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, dtreepred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, dtreepred))

**8.6 Ada Boosting**

In [None]:
# AdaBoost with the decision tree from the previous cell as the base learner,
# evaluated on the test split.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn and the old name was later
# removed, while the first positional argument works with both.
# random_state is fixed so the reported accuracy is reproducible.
# NOTE(review): `dtree` is an unrestricted tree; boosting usually uses a shallow
# tree as the weak learner — confirm this choice is intended.
from sklearn.ensemble import AdaBoostClassifier
abcl = AdaBoostClassifier(dtree, n_estimators=60, random_state=100)
abcl=abcl.fit(X_train,np.ravel(y_train,order='C'))
abcl_pred=abcl.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, abcl_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, abcl_pred))

**8.7 Random Forest**

In [None]:
# Random forest with 200 trees, evaluated on the test split.
# random_state is fixed (same seed as the train/test split) so that the accuracy
# quoted in the conclusion can be reproduced.

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 200, random_state=100)#criterion = entropy,gini
rfc.fit(X_train, np.ravel(y_train,order='C'))
rfcpred = rfc.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, rfcpred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, rfcpred))

**8.8 Bagging Classifier**

In [None]:
# Move the target column out of `movie_df` into `new_movie_df`, so that `movie_df`
# holds only features for the bagging classifier below.
# pop() mutates `movie_df`, so the membership check keeps this cell safe to re-run:
# on a second run the column is already gone and `new_movie_df` is left as it is.
if "imdb_binned_score" in movie_df.columns:
    new_movie_df=movie_df.pop("imdb_binned_score")


In [None]:
# Bagging classifier (60 estimators, each trained on 70% of the rows), scored with
# the out-of-bag estimate instead of the train/test split.
# It is fitted on the full, unscaled `movie_df` with `new_movie_df` as the target.
# random_state is fixed so the out-of-bag score is reproducible.

from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(n_estimators=60, max_samples=.7 , oob_score=True, random_state=100)

bgcl = bgcl.fit(movie_df, new_movie_df)
print(bgcl.oob_score_)

**8.9 Gradient Boosting**



In [None]:
# Gradient boosting (50 trees of depth 5, learning rate 0.09), evaluated on the test split.
# random_state is fixed (same seed as the train/test split) so that the accuracy
# quoted in the conclusion can be reproduced.

from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.09, max_depth=5, random_state=100)
gbcl = gbcl.fit(X_train,np.ravel(y_train,order='C'))
test_pred = gbcl.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, test_pred)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, test_pred))

**8.10 XGBoost**



In [None]:
# XGBoost classifier with default settings, evaluated on the test split.
# Recent XGBoost releases require class labels 0..k-1, while the binned target
# uses 1..4. The labels are therefore encoded before fitting and the predictions
# decoded back to the original labels, so they stay comparable with y_test.
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(np.ravel(y_train,order='C'))

xgb = XGBClassifier()
xgb.fit(X_train, y_train_encoded)
xgbprd = label_encoder.inverse_transform(xgb.predict(X_test))
cnf_matrix = metrics.confusion_matrix(y_test, xgbprd)
print(cnf_matrix)
print("Accuracy:",metrics.accuracy_score(y_test, xgbprd))

**9.Model Comparison**



In [None]:
# Print a classification report for every model evaluated on the test split.
# The bagging classifier was scored with its out-of-bag estimate instead, so it is
# printed separately and labelled as such.
# Every header ends with a newline so the report table starts on its own line
# (the Gradient Boosting header was missing it).
from sklearn.metrics import classification_report

test_set_predictions = [
    ('Logistic  Reports\n', y_pred),
    ('KNN Reports\n', knnpred),
    ('SVC Reports\n', svcpred),
    ('Naive Bayes Reports\n', gaussiannbpred),
    ('Decision Tree Reports\n', dtreepred),
    ('Ada Boosting\n', abcl_pred),
    ('Random Forests Reports\n', rfcpred),
    ('Gradient Boosting\n', test_pred),
    ('XGBoosting\n', xgbprd),
]

for report_title, predictions in test_set_predictions:
    print(report_title, classification_report(y_test, predictions))

print('Bagging Classifier (out-of-bag accuracy)', bgcl.oob_score_)

**10.Conclusion**

The conclusion is that the Random Forest and Gradient Boosting algorithms achieve the highest accuracy, 74.5% and 75.5% respectively.