In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
%pylab inline
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the IMDB top-1000 dataset from the Kaggle input directory
DATA_PATH = '/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first three rows of the freshly loaded data
df.head(3)

## Data Description

In [None]:
# Dataset info: per-column summary from DataFrame.info(), then the (rows, columns) shape
df.info()
print("Shape : ", df.shape)

## Exploratory Data Analysis 

In [None]:
# Check, column by column, whether at least one value is missing
df.isna().any()

In [None]:
# Heat map visualising which features have missing values (one row per record, one column per feature)
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

#### There are a large number of missing values in gross revenue, followed by meta score.

In [None]:
# Poster link, overview and series title are dropped because they are not expected
# to influence the gross revenue (the target variable)
unused_columns = ['Poster_Link', 'Overview', 'Series_Title']
df = df.drop(columns=unused_columns)

In [None]:
# Count the missing values in each remaining column
df.isna().sum()

In [None]:
# Fill missing gross values with the string '0'. The column still holds text at this
# point (the next cell strips the commas and converts it to int), hence '0' and not 0.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# pulled out of the frame is chained assignment, which newer pandas versions warn about
# and which leaves the frame untouched when Copy-on-Write is enabled.
df['Gross'] = df['Gross'].fillna('0')

In [None]:
# Remove the ',' thousands separators from gross and convert the text to integers
gross_text = df['Gross'].str.replace(',', '', regex=False)
df['Gross'] = gross_text.astype(int)

In [None]:
# Mean gross over the movies that actually report one. Missing values were encoded
# as 0 in the cells above, so those rows are excluded from the average. (Dividing
# Series.mean() by a row count, as before, averaged twice and produced a value far
# too small to be a meaningful replacement.)
Gross_mean = df.loc[df['Gross'] != 0, 'Gross'].mean()
Gross_mean

In [None]:
# Replace the 0 placeholders with the gross mean. The result is assigned back to the
# column instead of using replace(inplace=True) on df.Gross, because an in-place call
# on a column pulled out of the frame is chained assignment and does not update the
# frame when pandas Copy-on-Write is enabled.
df['Gross'] = df['Gross'].replace(0, Gross_mean)

In [None]:
# Collect every distinct genre name that appears in the ", "-separated Genre column
Genre_list = {
    genre
    for genres in df.Genre
    for genre in genres.strip().split(", ")
}

Genre_list

In [None]:
# Genres categorised as child-restricted; a missing certificate is inferred from these
Genre_child_prohibited = [
    'Action',
    'Crime',
    'Horror',
    'Mystery',
    'Romance',
    'Thriller',
    'War',
    'Western',
]

In [None]:
# Fill missing Certificate values from the Genre column
def fill(row):
    """Give a row certificate 'A' when it has no certificate and a child-restricted genre.

    A row whose 'Certificate' is already present is returned untouched. Otherwise
    the 'Genre' text is searched for each entry of Genre_child_prohibited, and on
    the first hit 'Certificate' is set to 'A'. The (possibly modified) row is returned.
    """
    if not pd.isna(row['Certificate']):
        return row
    if any(restricted in row['Genre'] for restricted in Genre_child_prohibited):
        row['Certificate'] = 'A'
    return row

In [None]:
# Run fill() on every row (axis=1) to infer missing certificates from the genre
df=df.apply(fill, axis=1)

In [None]:
# About 70 missing values in the Certificate column were filled by the function above
df.isna().sum()

In [None]:
# Fill the certificates that are still missing with 'PG-13'. The filled column is
# assigned back to the frame: fillna(inplace=True) on df.Certificate is chained
# assignment and does not update the frame when pandas Copy-on-Write is enabled.
df['Certificate'] = df['Certificate'].fillna('PG-13')

#### When a movie belongs to two or more genres, the number of viewers is likely to increase, which in turn can impact the gross revenue.
#### Hence we create a new column for each genre.

In [None]:
# Add one empty (NaN) column per genre to a copy of the original dataframe.
# The genre names are sorted first because Genre_list is a set: its iteration order
# is arbitrary, so using it directly makes the column order differ between runs,
# and newer pandas versions refuse a set as column labels.
# reindex() keeps every existing column and appends the new ones filled with NaN.
genre_columns = sorted(Genre_list)
df_new = df.reindex(columns=[*df.columns, *genre_columns])

In [None]:
# Preview the frame with the newly added, still empty, genre columns
df_new.head()

In [None]:
# Function to mark individual genres for the movies
def fill_values(row):
    """Set a 1 in the column of every genre named in the row's 'Genre' text.

    'Genre' is split on ", " and each resulting name is used as the label of the
    entry to set. The row is modified and returned.
    """
    for genre_name in row['Genre'].split(", "):
        row[genre_name] = 1
    return row

In [None]:
# Run fill_values() on every row (axis=1) so each movie gets a 1 in each of its genre columns
df=df_new.apply(fill_values, axis=1)

In [None]:
# The star columns have too many unique values to be worth turning into dummies,
# so they are dropped. Genre is dropped as well now that it has its own columns.
high_cardinality_columns = ['Star1', 'Star2', 'Star3', 'Star4', 'Genre']
df = df.drop(columns=high_cardinality_columns)

In [None]:
# Fill missing meta scores with the column mean. The filled column is assigned back
# to the frame: fillna(inplace=True) on df.Meta_score is chained assignment and does
# not update the frame when pandas Copy-on-Write is enabled.
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].mean())

In [None]:
# Strip the 'min' unit from runtime so the column can later be converted to a numeric type
df['Runtime'] = df['Runtime'].str.replace('min', '', regex=False)

In [None]:
# 'PG' in Released_Year is not a year (bad data in the year column): show the
# affected row(s), then remove them
df[df['Released_Year'] == 'PG']
# Released_Year becomes the index so the bad entry can be dropped by label;
# the following cell turns the index back into a column
df = df.set_index('Released_Year').drop(index=['PG'])

In [None]:
# Bring Released_Year back as a column, then convert runtime and released year to int
numeric_types = {"Runtime": int, 'Released_Year': int}
df = df.reset_index().astype(numeric_types)

In [None]:
# Every NaN that is left (e.g. genre columns a movie does not belong to) becomes 0
df = df.fillna(0)

In [None]:
# One-hot encode the categorical Certificate and Director columns; the first level
# of each is dropped (drop_first=True)
categorical_columns = ['Certificate', 'Director']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# Preview the frame after cleaning and one-hot encoding
df.head()

## Data Visualization


In [None]:
# Number of movies at each IMDB rating (row count per rating group).
# The labels are set on the Axes returned by .plot(), so the cell does not depend
# on bare xlabel/ylabel names being injected into the namespace by %pylab.
ax = df.groupby('IMDB_Rating')['Gross'].count().plot()
ax.set_xlabel('IMDB rating')
ax.set_ylabel('Number of movies')

## This shows that most of the movies have a rating of around 7-8.

In [None]:
# Mean gross revenue for each runtime value.
# The labels are set on the Axes returned by .plot(), so the cell does not depend
# on bare xlabel/ylabel names being injected into the namespace by %pylab.
ax = df.groupby('Runtime')['Gross'].mean().plot()
ax.set_xlabel('Runtime (in minutes)')
ax.set_ylabel('Gross Revenue (in 1 bn)')

## This shows that people are more interested in movies with a runtime between 150 and 200 minutes.

In [None]:
# Total gross revenue of the movies released in each year.
# The labels are set on the Axes returned by .plot(), so the cell does not depend
# on bare xlabel/ylabel names being injected into the namespace by %pylab.
ax = df.groupby('Released_Year')['Gross'].sum().plot()
ax.set_xlabel('Released Year')
ax.set_ylabel('Gross Revenue (in 100 mn)')

## Over the years, gross revenue increased steadily until 2020, when the pandemic caused a heavy decline.

In [None]:
# Number of movies released in each year (row count per year group).
# The labels are set on the Axes returned by .plot(), so the cell does not depend
# on bare xlabel/ylabel names being injected into the namespace by %pylab.
ax = df.groupby('Released_Year')['Gross'].count().plot()
ax.set_xlabel('Released Year')
ax.set_ylabel('Number of movies')

## The number of movies released per year has increased at a higher rate since 1990 than before. This is in turn reflected in the increase in gross revenue.

In [None]:
# Split 80:20 into train and test sets; the target variable is Gross revenue and
# every other column is used as a feature
from sklearn.model_selection import train_test_split

target = df['Gross']
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Gross'),
    target,
    test_size=0.20,
    random_state=0,
)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Boosted-tree regressor: 200 estimators, each tree limited to depth 5
model = XGBRegressor(max_depth=5, n_estimators=200)
model.fit(X_train, y_train)

# Predictions for the held-out rows, kept in a one-column frame
Y_pred = model.predict(X_test)
output = pd.DataFrame({'Predicted': Y_pred})

# model.score() on the rows used for fitting, then on the held-out rows
score = model.score(X_train, y_train)
print('Training Score:', score)
score = model.score(X_test, y_test)
print('Testing Score:', score)

In [None]:
# Score the fitted model on the whole dataset. This includes the rows the model was
# trained on, so it is not an out-of-sample estimate.
model.score(df.drop('Gross', axis=1),target)

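## The R2 score for the entire dataset is 0.89. Note that this figure includes the rows the model was trained on, so it is optimistic; the testing score above is the better estimate of how well the model generalises.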