In [1]:
#Importing necessary Libraries
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# from google.colab import files
# uploaded = files.upload()

In [3]:
# Load the movie metadata. The path is relative, so the CSV must be present in
# the notebook's current working directory before this cell is run.
movies = pd.read_csv('movie_metadataFinal.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'movie_metadataFinal.csv'

In [None]:
# Preview the first rows to see what the dataset looks like
movies.head()

In [None]:
# Dimensions of the frame and the labels of its columns
movies.shape, movies.columns

#  1. Description of dataset features

1. **color**: Whether the movie is in black and white or in colour
2. **director_name**: Name of the movie director
3. **num_critic_for_reviews** : No of critics for the movie
4. **duration**: Movie duration in minutes
5. **director_facebook_likes**: Number of likes for the Director on his Facebook Page
6. **actor_3_facebook_likes**: No of likes for the actor 3 on his/her facebook Page
7. **actor_2_name**: Name of the actor 2
8. **actor_1_facebook_likes**: No of likes for the actor 1 on his/her facebook Page
9. **gross**: Gross earnings of the movie in Dollars
10. **genres**: Film categorization like ‘Animation’, ‘Comedy’, ‘Romance’, ‘Horror’, ‘Sci-Fi’, ‘Action’, ‘Family’
11. **actor_1_name**: Name of the actor 1
12. **movie_title**: Title of the movie
13. **num_voted_users**: No of people who voted for the movie
14. **cast_total_facebook_likes**: Total number of Facebook likes for the cast of the movie
15. **actor_3_name**: Name of the actor 3
16. **facenumber_in_poster**: No of actors who featured in the movie poster
17. **plot_keywords**: Keywords describing the movie plots
18. **movie_imdb_link**: IMDb link of the movie
19. **num_user_for_reviews**: Number of users who gave a review
20. **language**: Language of the movie
21. **country**: Country where movie is produced
22. **content_rating**: Content rating of the movie
23. **budget**: Budget of the movie in Dollars
24. **title_year**: The year in which the movie is released
25. **actor_2_facebook_likes**: Facebook likes for the actor 2
26. **imdb_score**: IMDB score of the movie
27. **aspect_ratio** : Aspect ratio the movie was made in
28. **movie_facebook_likes**: Total no of facebook likes for the movie




## 2. Data cleaning

In [None]:
# Number of missing values in each column of the dataset
movies.isna().sum()

In [None]:
# Imputing the dataframe
def replace_missing_values(data):
    """Fill every missing value of ``data`` in place.

    Text columns (object or string dtype) are back-filled and then
    forward-filled, so a gap at the end of the column is covered too.
    Every other column is filled with the mean of that same column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    None
    """
    for col_name in data.columns:
        column = data[col_name]
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )

        if is_text:
            # bfill()/ffill() replace the deprecated fillna(method=...) form
            data[col_name] = column.bfill().ffill()
        else:
            # Use the mean of the frame being imputed, not of a global frame
            data[col_name] = column.fillna(column.mean())
    
    


In [None]:
# Impute all the missing cells; the function assigns the filled columns back
# onto `movies` itself and returns nothing.
replace_missing_values(movies)

In [None]:
# Number of missing cells in each row
missing_per_row = movies.isnull().sum(axis=1)

# Rows that still contain at least one missing cell
rows_with_missing = missing_per_row > 0
total_missing_rows = int(rows_with_missing.sum())

print(total_missing_rows)

In [None]:
# Number of duplicated rows in the dataset
movies.duplicated().sum()

In [None]:
# Removing the duplicate rows in the dataset
movies.drop_duplicates(inplace=True)
# Show the remaining duplicate count (expected 0) together with the new shape.
# Previously the count was computed mid-cell and its result was discarded,
# because only the last expression of a cell is displayed.
movies.duplicated().sum(), movies.shape

In [None]:
# Summary of the frame after imputation and duplicate removal
movies.info()

In [None]:
# Changing 'title_year' to int with a single vectorised cast
# (instead of calling np.int64 on every element through apply)
movies['title_year'] = movies['title_year'].astype(np.int64)


# 3. Feature engineering

In [None]:
# Summary of the categorical (object-dtype) columns
movies.describe(include='object')

In [None]:
# Drop the title and link columns, the director/actor name columns, the plot
# keywords, language and country. The notebook treats these as (almost) unique
# per movie, so they are not expected to contribute much to the target variable.
columns_to_drop = [
    'movie_title',
    'movie_imdb_link',
    'director_name',
    'actor_1_name',
    'actor_3_name',
    'actor_2_name',
    'plot_keywords',
    'language',
    'country',
]
movies = movies.drop(columns=columns_to_drop)

In [None]:
# Label encoding the categorical columns
from sklearn.preprocessing import LabelEncoder

categorical_features = ['color',
                        'genres',
                        'content_rating',
                        'title_year',
                        'aspect_ratio']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# overwrite its classes on every column, leaving only the last column's
# mapping available for inverse_transform or for encoding new data.
feature_encoders = {}
for feature in categorical_features:
    feature_encoding = LabelEncoder()
    movies[feature] = feature_encoding.fit_transform(movies[feature])
    feature_encoders[feature] = feature_encoding

In [None]:
# A sample of the data after label encoding
movies.head()

# Correlation

In [None]:
# Look for strongly related pairs of variables (multicollinearity)
# by plotting the pairwise correlation matrix.

corr_matrix = movies.corr()

# Boolean mask covering the upper triangle, diagonal included, so each pair
# is drawn only once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap='RdYlGn',
    xticklabels=corr_matrix.columns,
    yticklabels=corr_matrix.columns,
    ax=ax,
)

In [None]:
# Remove a few columns because of multicollinearity
collinear_columns = ['cast_total_facebook_likes', 'num_critic_for_reviews']
movies = movies.drop(columns=collinear_columns)

# Categorizing IMDB_score(our target variable)

In [None]:
# Categorising the target variable into three bands:
#   [1, 4.5]  -> 'Flop Movie'
#   (4.5, 8]  -> 'Average Movie'
#   (8, 10]   -> 'Hit Movie'
bins = [1, 4.5, 8, 10]
labels = ['Flop Movie', 'Average Movie', 'Hit Movie']
# include_lowest=True closes the first interval on the left; without it a score
# of exactly 1 falls outside every bin and becomes NaN in the target.
movies['movie_performance'] = pd.cut(
    movies['imdb_score'], bins=bins, labels=labels, include_lowest=True
)

In [None]:
# "imdb_score" is now represented by "movie_performance", so remove it
movies = movies.drop(columns='imdb_score')

# Model Building

In [None]:
# NOTE(review): by this point `movies` has been imputed, label encoded and has
# had columns (including `imdb_score`) dropped, so both copies hold the
# processed frame rather than the original data.
datasetR = movies.copy() # copy reserved for a regression model; not used in the cells visible here
datasetC = movies.copy() # copy used for the classification model below

In [None]:
# Dimensions of the classification dataset (target column still included)
datasetC.shape

In [None]:
# Dependent/target variable: pop() returns the column and removes it from datasetC
y = datasetC.pop('movie_performance')

# Independent variables: X refers to the same frame as datasetC (not a copy),
# which no longer contains the target column
X = datasetC

y


#  Train Test Split

In [None]:
# Splitting the data into train (80%) and test (20%) sets
from sklearn.model_selection import train_test_split

# stratify=y keeps the Flop / Average / Hit proportions the same in both
# splits. A plain random split can leave the test set with too few samples
# of the smaller classes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape)
print(y_train.shape)


# Scaling

In [None]:
# Scaling the independent variables (features).
# The scaler is fitted on the training split only and then reused, unchanged,
# to transform the test split.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_scaled = scaler.fit_transform(X_train.values)
X_train = pd.DataFrame(train_scaled, columns=X_train.columns, index=X_train.index)

test_scaled = scaler.transform(X_test.values)
X_test = pd.DataFrame(test_scaled, columns=X_train.columns, index=X_test.index)

In [None]:
# Inspect the scaled training features
X_train

# Feature selection

In [None]:
# Recursive Feature Elimination with Cross-Validation (RFECV).
# A random forest is the underlying estimator, one feature is removed per step,
# and candidates are scored with negative log loss over 5 folds.
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

forest_model = RandomForestClassifier(random_state=0)
rfecv = RFECV(
    estimator=forest_model,
    step=1,
    cv=5,
    scoring='neg_log_loss',
).fit(X_train, y_train)

In [None]:
# Report how many features RFECV kept and which ones they are
print(f'Optimal number of features : {rfecv.n_features_}')
print(f'Best features : {X_train.columns[rfecv.support_]}')

In [None]:
# (column name, selected by RFECV?, RFECV ranking) for every feature
list(zip(X_train.columns,rfecv.support_,rfecv.ranking_))

In [None]:
# Names of the columns that RFECV marked as selected
col_rfecv = X_train.columns[rfecv.support_]

In [None]:
# Restrict the train and test frames to the RFECV-selected columns
X_train_rfecv = X_train.loc[:, col_rfecv]
X_test_rfecv = X_test.loc[:, col_rfecv]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

# Baseline random forest trained on the RFECV-selected features
forest_model = RandomForestClassifier(random_state = 0)
forest_model.fit(X_train_rfecv, y_train)
predicted_rating = forest_model.predict(X_test_rfecv)
# Preview the first 20 predictions (the previous [1:20] slice skipped the first one)
print(predicted_rating[:20])

# Check accuracy score

from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, predicted_rating)))

In [None]:
from sklearn import metrics

# Compute every score first, then print them in a fixed order.
# Precision, recall and F1 are macro-averaged over the classes.
count_misclassified = (y_test != predicted_rating).sum()
accuracy = metrics.accuracy_score(y_test, predicted_rating)
precision = metrics.precision_score(y_test, predicted_rating, average='macro')
recall = metrics.recall_score(y_test, predicted_rating, average='macro')
f1_score = metrics.f1_score(y_test, predicted_rating, average='macro')

print(f'Misclassified samples: {count_misclassified}')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 score: {f1_score:.2f}')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# Classification report comparing the test labels with the baseline model's predictions
print(classification_report(y_test, predicted_rating))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

# Random forest with explicit hyper-parameters (100 trees, depth capped at 20,
# entropy split criterion). Note that this rebinds `forest_model`, replacing
# the previously fitted model under that name.
forest_model = RandomForestClassifier(n_estimators = 100, max_depth = 20,  criterion = 'entropy', random_state = 0)
forest_model.fit(X_train_rfecv, y_train)
improved_movie_prediction = forest_model.predict(X_test_rfecv)
# Preview the first 20 predictions (the previous [1:20] slice skipped the first one)
print(improved_movie_prediction[:20])

In [None]:
import joblib
# Persist the fitted random forest to 'rf_model.sav'.
# NOTE(review): the model was fitted on MinMax-scaled, RFECV-selected columns,
# but `scaler` and `col_rfecv` are not saved here. Loading this file alone is
# presumably not enough to score raw data; confirm how the artifact is consumed.
joblib.dump(forest_model, 'rf_model.sav')

In [None]:
from sklearn.metrics import classification_report
# Classification report comparing the test labels with the tuned model's predictions
improved_report = classification_report(y_test,improved_movie_prediction)
print(improved_report)