<a href="https://colab.research.google.com/github/ritik6633/IMDB-rating-prediction/blob/main/IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
from collections import defaultdict,Counter
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

ModuleNotFoundError: No module named 'squarify'

In [None]:
# Load the movie metadata CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv("movie_metadata.csv")

In [None]:
# Preview five randomly chosen rows.
df.sample(n=5)

In [None]:
# List the column labels of the DataFrame.
df.columns

In [None]:
# Show the DataFrame's (rows, columns) dimensions.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Impute the two monetary columns with their medians, then drop every row that
# still has a missing value in any remaining column.
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on df['col']: the in-place form operates on an
# intermediate Series, which is deprecated and leaves df unchanged when pandas
# Copy-on-Write is active.
df['gross'] = df['gross'].fillna(df['gross'].median())
df['budget'] = df['budget'].fillna(df['budget'].median())
df.dropna(inplace=True)

In [None]:
# Dimensions after the imputation and row-dropping step above.
df.shape

## About the Data (EDA)

### Creating a new column to show main genre of movie. This will help in sorting out movies according to their genre types

In [None]:
# The first '|'-separated entry of `genres` is used as the movie's main genre.
# str.split returns the whole string as its only element when no '|' is present,
# so single-genre movies need no separate branch.
df['main_genre'] = df['genres'].apply(lambda genres: genres.split('|', 1)[0])

In [None]:
# Spot-check two random rows, including the new main_genre column.
df.sample(n=2)

In [None]:
# Box plot of the IMDb score distribution, one box per main genre.
plt.figure(figsize=(12, 10))
sns.boxplot(data=df, x='imdb_score', y='main_genre')
plt.title('Movie Genres with their imdb scores', fontsize=18)
plt.show()

In [None]:
# Remove outlier rows: a row is kept only if every numeric column lies strictly
# within `threshold` standard deviations of that column's mean.
numeric_cols = df.select_dtypes(include=np.number).columns
z_scores = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std())
threshold = 3
# .copy() makes the filtered frame independent of the original one, so the
# columns assigned to `df` in later cells do not trigger SettingWithCopyWarning.
df = df[(z_scores < threshold).all(axis=1)].copy()

In [None]:
# Dimensions after the outlier filtering above.
df.shape

In [None]:
# Horizontal bar chart of the number of movies per release year, ordered by year.
movies_per_year = df['title_year'].value_counts(dropna=True).sort_index()
movies_per_year.plot(kind='barh', figsize=(15, 20))
plt.title("Number of Movies released every year", fontsize=18)
plt.show()

In [None]:
# Horizontal bar chart of the number of movies per main genre, smallest count first.
movies_per_genre = df['main_genre'].value_counts(dropna=True).sort_values()
movies_per_genre.plot(kind='barh', figsize=(15, 20))
# The title previously repeated the per-year chart's title; this chart is grouped by genre.
plt.title("Number of Movies per main genre", fontsize=18)
plt.show()

In [None]:
# Treemap of main genres, sized by how many movies fall in each genre.
# The counts are tallied once and reused for both the tile sizes and the labels.
genre_counts = Counter(df['main_genre'])
plt.figure(figsize=(15, 10))
squarify.plot(
    genre_counts.values(),
    label=genre_counts.keys(),
    text_kwargs={'fontsize': 12},
    bar_kwargs={'alpha': .7},
    pad=True,
)
plt.title("Genres", fontsize=18)
plt.axis("off")
plt.show()

In [None]:
# Movies with the lowest IMDb rating.
# The minimum is read from the data instead of being hard-coded, so the cell
# stays correct if the dataset or the outlier filtering changes.
df[df['imdb_score'] == df['imdb_score'].min()]

In [None]:
# Movies with the highest IMDb rating.
# The maximum is read from the data instead of being hard-coded, so the cell
# stays correct if the dataset or the outlier filtering changes.
df[df['imdb_score'] == df['imdb_score'].max()]

## Histogram of all columns in df

In [None]:
# One 30-bin green histogram per numeric column of df.
df.hist(figsize=(15, 15), bins=30, color='g')
plt.show()

## Adding New Column that shows number of genres in movie

In [None]:
# Number of genres per movie: a '|'-separated list has one more entry than it has separators.
df['num_genres'] = df['genres'].str.count(r'\|') + 1

In [None]:
# Spot-check two random rows, including the new num_genres column.
df.sample(n=2)

In [None]:
# Largest number of genres attached to any single movie.
df['num_genres'].max()

In [None]:
# Movies that carry the largest number of genres.
# The maximum is computed rather than hard-coded as 8, so the selection remains
# valid if the data changes.
df[df['num_genres'] == df['num_genres'].max()]

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# Only numeric columns are passed to corr(): since pandas 2.0, DataFrame.corr()
# raises on text columns instead of silently ignoring them.
numeric_corr = df.select_dtypes(include=np.number).corr()
plt.figure(figsize=(12,10))
sns.heatmap(numeric_corr,annot=True,linewidths=.5,
            cmap='coolwarm',square=True,cbar_kws={'label': 'Correlation Coefficient'})
plt.title("Correlation plot",fontsize=18)
plt.show()

### Selected Cols for model
num_critic_for_reviews<br>
duration<br>
num_voted_users<br>
num_user_for_reviews<br>
movie_facebook_likes<br>
director_facebook_likes<br>

In [None]:
# Predictor columns for the regression models and the target they estimate (IMDb score).
regression_features = [
    'num_critic_for_reviews',
    'duration',
    'num_voted_users',
    'num_user_for_reviews',
    'movie_facebook_likes',
    'director_facebook_likes',
]
X = df[regression_features]
y = df['imdb_score']

In [None]:
# Display the shapes of the feature matrix and the target.
X.shape,y.shape

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = tts(X,y,test_size=0.2,random_state=32)

In [None]:
# Fit a linear regression on the training split and report its error on the held-out split.
lm = LinearRegression().fit(X_train, y_train)
pred_lm = lm.predict(X_test)
print("Mean squared error using linear regression", mean_squared_error(y_test, pred_lm))
print("Mean absolute error using linear regression", mean_absolute_error(y_test, pred_lm))

In [None]:
# Fit a decision tree regressor and report its error on the held-out split.
# random_state is fixed because tree construction involves randomness (features
# are permuted at each split); without it the printed metrics vary between runs.
dtc = DecisionTreeRegressor(random_state=42)
dtc.fit(X_train, y_train)
pred_dt = dtc.predict(X_test)
print("Mean squared error using Decision Tree Regression",mean_squared_error(y_test,pred_dt))
print("Mean absolute error using Decision Tree regression",mean_absolute_error(y_test,pred_dt))

In [None]:
# Support vector regression with an RBF kernel.
# SVMs are not scale invariant, so the features are standardised first. The
# scaler is fitted on the training split only, so no test-set statistics leak
# into the model.
# NOTE: the metrics printed here will differ from those of an unscaled fit.
svr_scaler = StandardScaler().fit(X_train)
svr = SVR(kernel='rbf')
svr.fit(svr_scaler.transform(X_train), y_train)
pred_svr = svr.predict(svr_scaler.transform(X_test))
print("Mean squared error using Support Vector regression",mean_squared_error(y_test,pred_svr))
print("Mean absolute error using Support Vector regression",mean_absolute_error(y_test,pred_svr))

In [None]:
# K-nearest-neighbours regression using the 12 closest training samples.
# Neighbours are found by distance in feature space, so the features are
# standardised first; otherwise the column with the largest numeric range
# dominates the distance. The scaler is fitted on the training split only.
# NOTE: the metrics printed here will differ from those of an unscaled fit.
knn_scaler = StandardScaler().fit(X_train)
knn = KNeighborsRegressor(n_neighbors=12)
knn.fit(knn_scaler.transform(X_train), y_train)
pred_knn = knn.predict(knn_scaler.transform(X_test))
print("Mean squared error using KNN regression",mean_squared_error(y_test,pred_knn))
print("Mean absolute error using KNN regression",mean_absolute_error(y_test,pred_knn))

### Conclusion:
#### Linear Regression model seems to perform best among all with MSE = 0.67 and MAE = 0.65

## To work with classification models, add another column "movie_status" that records whether a movie is a hit or not (flop or average)

In [None]:
def getStatus(row, hit_multiple=3):
    """Label one movie as a hit (1) or not a hit (0).

    A movie counts as a hit when its gross is at least ``hit_multiple`` times
    its budget.

    Parameters
    ----------
    row : mapping-like
        Object supporting ``row['budget']`` and ``row['gross']``, e.g. a
        DataFrame row or a dict.
    hit_multiple : number, optional
        Minimum gross-to-budget multiple required for a hit. Defaults to 3,
        the threshold that was previously hard-coded.

    Returns
    -------
    int
        1 if ``budget * hit_multiple <= gross``, otherwise 0. A NaN in either
        field makes the comparison false and therefore yields 0.
    """
    is_hit = row['budget'] * hit_multiple <= row['gross']
    return 1 if is_hit else 0


In [None]:
# Label every movie from its budget and gross: 1 means HIT, 0 means FLOP.
budget_and_gross = df[['budget', 'gross']]
df['movie_status'] = budget_and_gross.apply(getStatus, axis=1)

In [None]:
# Display the hit/flop labels stored in the movie_status column.
df['movie_status']

In [None]:
# Classification setup: the target is the hit/flop label, predicted from six numeric columns.
classification_features = [
    'num_critic_for_reviews',
    'duration',
    'num_voted_users',
    'num_user_for_reviews',
    'movie_facebook_likes',
    'director_facebook_likes',
]
y = df['movie_status']
X = df[classification_features]

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = tts(X,y,test_size=0.2,random_state=32)

In [None]:
# Support vector classifier with an RBF kernel.
# SVMs are not scale invariant, so the features are standardised first. The
# scaler is fitted on the training split only, so no test-set statistics leak
# into the model.
# NOTE: the metrics printed here will differ from those of an unscaled fit.
svc_scaler = StandardScaler().fit(X_train)
svc = SVC(kernel='rbf')
svc.fit(svc_scaler.transform(X_train), y_train)
pred_svc = svc.predict(svc_scaler.transform(X_test))
# The printed label previously read "Suppprt"; the typo is corrected here.
print("Classification Report for Support Vector Classifier:\n", classification_report(y_test, pred_svc))
accuracy = accuracy_score(y_test, pred_svc)
print("Accuracy score for Support Vector Classifier: ", accuracy)

In [None]:
# K-nearest-neighbours classifier voting among the 20 closest training samples.
# Neighbours are found by distance in feature space, so the features are
# standardised first; otherwise the column with the largest numeric range
# dominates the distance. The scaler is fitted on the training split only.
# NOTE: the metrics printed here will differ from those of an unscaled fit.
knn_cls_scaler = StandardScaler().fit(X_train)
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(knn_cls_scaler.transform(X_train), y_train)
pred_knn_cls = knn.predict(knn_cls_scaler.transform(X_test))
print("Classification Report for KNN Classifier:\n", classification_report(y_test, pred_knn_cls))
accuracy = accuracy_score(y_test, pred_knn_cls)
print("Accuracy score for KNN Classifier: ", accuracy)

In [None]:
# Fit a decision tree classifier and report its metrics on the held-out split.
# random_state is fixed because tree construction involves randomness (features
# are permuted at each split); without it the printed metrics vary between runs.
dt_cls = DecisionTreeClassifier(random_state=42)
dt_cls.fit(X_train, y_train)
pred_dt_cls = dt_cls.predict(X_test)
print("Classification Report for Decision Tree Classifier:\n", classification_report(y_test, pred_dt_cls))
accuracy = accuracy_score(y_test, pred_dt_cls)
print("Accuracy score for Decision Tree Classifier: ", accuracy)

In [None]:
# Random forest of 100 trees with a fixed seed: train, predict, then report metrics.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
pred_rf_cls = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, pred_rf_cls)
print("Classification Report for Random Forest Classifier:\n", classification_report(y_test, pred_rf_cls))
print("Accuracy score for Random Forest Classifier: ", accuracy)

In [None]:
# Logistic regression classifier.
# The features are standardised first: the default solver may stop before
# converging when the columns are on very different scales. The scaler is
# fitted on the training split only.
# NOTE: the metrics printed here will differ from those of an unscaled fit.
lr_scaler = StandardScaler().fit(X_train)
lr_classifier = LogisticRegression()
lr_classifier.fit(lr_scaler.transform(X_train), y_train)
pred_lr_cls = lr_classifier.predict(lr_scaler.transform(X_test))
print("Classification Report for Logistic Regression Classifier:\n", classification_report(y_test, pred_lr_cls))
accuracy = accuracy_score(y_test, pred_lr_cls)
print("Accuracy score for Logistic Regression Classifier: ", accuracy)

### Conclusion:
#### Random Forest Classifier seems to perform best among all, as it has 83% accuracy