In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install pycaret 

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import pycaret
from pycaret.regression import *

In [None]:
import seaborn as sns

In [None]:
# Read the three competition files (training set, test set, sample submission).
train, test, submission = map(
    pd.read_csv,
    (
        '/kaggle/input/tmdb-box-office-prediction/train.csv',
        '/kaggle/input/tmdb-box-office-prediction/test.csv',
        '/kaggle/input/tmdb-box-office-prediction/sample_submission.csv',
    ),
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Same summary for the test frame.
test.info()

In [None]:
# Test rows without a release date receive the placeholder text '5/1/00'.
missing_release = test['release_date'].isna()
test.loc[missing_release, 'release_date'] = '5/1/00'

# Show every test row that carries the placeholder date.
test[test['release_date'] == '5/1/00']

In [None]:
def _parse_release_dates(dates, latest_year=2019):
    """Parse 'm/d/yy' release dates and repair two-digit years that land in the future.

    With ``%y`` a two-digit year can be read as 20xx even though the film was
    released in 19xx. Every parsed date whose year is later than
    ``latest_year`` is moved back by exactly 100 years, so that all features
    derived from the date afterwards (year, month and day of week) are based
    on the corrected date rather than on a date one century off.

    Parameters
    ----------
    dates : pandas.Series
        Release dates as text in month/day/two-digit-year form.
    latest_year : int, default 2019
        Latest year a release date may legitimately have.

    Returns
    -------
    pandas.Series
        The parsed (and, where needed, century-corrected) datetimes.
    """
    parsed = pd.to_datetime(dates, format='%m/%d/%y')
    plausible = ~(parsed.dt.year > latest_year)  # missing dates stay untouched
    return parsed.where(plausible, parsed - pd.DateOffset(years=100))


train['release_date'] = _parse_release_dates(train['release_date'])
test['release_date'] = _parse_release_dates(test['release_date'])


In [None]:
# Derive year, day-of-week and month columns from the release date of both frames.
for frame in (train, test):
    release = pd.to_datetime(frame["release_date"])
    frame["release_year"] = release.dt.year.astype(int)
    frame["release_day"] = release.dt.dayofweek.astype(int)
    frame["release_month"] = release.dt.month.astype(int)

In [None]:
# Sanity check: the latest release year present after parsing the two-digit years.
train['release_year'].max()

In [None]:
def _previous_century_if_future(year):
    """Return ``year - 100`` when ``year`` is later than 2019, otherwise ``year`` unchanged."""
    if year > 2019:
        return year - 100
    return year


# Apply the century correction to the release year of both frames.
for frame in (train, test):
    frame['release_year'] = frame['release_year'].apply(_previous_century_if_future)

In [None]:
# Columns whose missing values are replaced by the literal string "none".
columns_to_fill = [
    'genres',
    'original_language',
    'spoken_languages',
    'status',
    'production_countries',
    'production_companies',
]

for frame in (train, test):
    for column in columns_to_fill:
        frame[column] = frame[column].fillna("none")

In [None]:
# Preview the training data after the missing-value replacement.
train.head()

In [None]:
def get_dictionary(s):
    """Safely parse the text form of a Python literal (e.g. a list of dicts).

    Parameters
    ----------
    s : str
        Text such as ``"[{'id': 18, 'name': 'Drama'}]"``.

    Returns
    -------
    object
        The parsed literal, or an empty dict ``{}`` when ``s`` is not a valid
        literal (for example the "none" placeholder, an empty string or a
        non-string missing value).
    """
    import ast  # local import keeps this cell self-contained

    try:
        # literal_eval accepts only literals, so expressions embedded in the
        # data are rejected instead of being executed (as eval would do).
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}

In [None]:
from sklearn import preprocessing

In [None]:
def _join_sorted_values(raw, key):
    """Parse ``raw`` with ``get_dictionary``, collect ``entry[key]`` from each entry, sort, and join with commas."""
    collected = sorted([entry[key] for entry in get_dictionary(raw)])
    return ','.join(map(str, collected))


# For each list-valued column of the training frame: the key to extract from every parsed entry.
for column, key in [
    ('genres', 'id'),
    ('spoken_languages', 'iso_639_1'),
    ('production_companies', 'id'),
    ('production_countries', 'iso_3166_1'),
]:
    train[column] = train[column].map(lambda raw, key=key: _join_sorted_values(raw, key))

In [None]:
# Key to extract from every parsed entry, per list-valued column of the test frame.
entry_key_by_column = {
    'genres': 'id',
    'spoken_languages': 'iso_639_1',
    'production_companies': 'id',
    'production_countries': 'iso_3166_1',
}
for column, entry_key in entry_key_by_column.items():
    parsed_entries = test[column].map(get_dictionary)
    test[column] = parsed_entries.map(
        lambda entries, k=entry_key: ','.join(str(v) for v in sorted(e[k] for e in entries))
    )

# Label-encode the flattened text; each encoder is fitted on the train and
# test values together and then applied to both frames.
for column in ['genres', 'production_companies', 'production_countries', 'spoken_languages']:
    encoder = preprocessing.LabelEncoder()
    train_text = train[column].fillna('')
    test_text = test[column].fillna('')
    encoder.fit(list(train_text.astype(str)) + list(test_text.astype(str)))
    train[column] = encoder.transform(train_text)
    test[column] = encoder.transform(test_text)
    print(column, len(encoder.classes_))

In [None]:
# Simple encoding
# The value -> code table is built from train and test together so that a
# given value receives the same integer in both frames. Encoding each frame
# on its own numbers the values independently, so the same code could stand
# for different values in train and in test.
for column in ['status', 'original_language']:
    observed = pd.concat([train[column], test[column]]).dropna().unique()
    code_of = {value: code for code, value in enumerate(sorted(observed))}
    # Missing values are the only ones absent from the table; they get -1.
    train[column] = train[column].map(code_of).fillna(-1).astype(int)
    test[column] = test[column].map(code_of).fillna(-1).astype(int)

# Replacing 0 Budget values with the median value
# (each frame uses the median of its own budget column, zeros included)
train['budget'] = train['budget'].replace(0, train['budget'].median())
test['budget'] = test['budget'].replace(0, test['budget'].median())

In [None]:
# Mean revenue per release year, release day and release month, one panel each.
fig, ax = plt.subplots(3, 1, figsize=(10, 10), tight_layout=True)

panels = [
    ('release_year', 'Revenue Based on Release Year'),
    ('release_day', 'Revenue Based on Release Day'),
    ('release_month', 'Revenue Based on Release Month'),
]
for axis, (column, title) in zip(ax, panels):
    train.groupby(column)['revenue'].mean().plot(ax=axis, linewidth=3)
    axis.set_title(title, fontweight="bold")
    # grid() without arguments only toggles the current state; pass True so
    # every panel is guaranteed to show its grid.
    axis.grid(True)

fig.tight_layout(pad=2.0)
plt.show()

In [None]:
# 2x2 grid of point plots relating revenue, budget, popularity and release year.
fig, ax = plt.subplots(2, 2, figsize=(10, 10), tight_layout=True)

# (target axes, panel title, column/label/colour arguments) in drawing order.
scatter_specs = [
    (ax[0][0], 'Revenue & Budget',
     dict(x='budget', y='revenue', ylabel='revenue', color='green')),
    (ax[1][0], 'Revenue & Popularity',
     dict(x='popularity', y='revenue', ylabel='revenue')),
    (ax[0][1], 'Popularity & Budget',
     dict(x='budget', y='popularity', ylabel='popularity', color='slateblue')),
    (ax[1][1], 'Popularity & Release Year',
     dict(x='release_year', y='popularity', ylabel='popularity', color='salmon')),
]
for axis, title, plot_kwargs in scatter_specs:
    train.plot(ax=axis, style='o', **plot_kwargs).set_title(title, fontweight="bold")
    axis.grid()

In [None]:
# Side-by-side histograms of revenue and release year.
fig, ax = plt.subplots(1, 2, figsize=(12, 5), tight_layout=True)

for axis, column, bar_color in zip(ax, ('revenue', 'release_year'), ('green', 'blue')):
    train.hist(
        ax=axis,
        column=column,
        bins=25,
        grid=False,
        figsize=(8,5),
        color=bar_color,
        zorder=2,
        rwidth=0.9,
    )
    axis.grid()

In [None]:
# Boxplots: revenue and budget on the left panel, popularity on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(12, 5), tight_layout=True)
money_axis, popularity_axis = ax

money_boxes = train.boxplot(ax=money_axis, column=['revenue', 'budget'])
money_boxes.set_title('Boxplots of Revenue and Budget', fontweight="bold")

popularity_boxes = train.boxplot(ax=popularity_axis, column=['popularity'])
popularity_boxes.set_title('Boxplot of Popularity', fontweight="bold")

In [None]:
# Display the prepared training frame.
train

In [None]:
# First rows of the prepared training frame.
train.head()

In [None]:
# Inspect the production_countries column after encoding.
train.production_countries                  

In [None]:
# Summary of the training frame before it is handed to the modelling step.
train.info()

In [None]:
# Configure the PyCaret regression experiment on the prepared training frame.
#   target             - column to predict
#   numeric_imputation - missing numeric values are filled with the column mean
#   ignore_features    - raw text / identifier columns kept out of the model
#   normalize          - scale the numeric features
#   silent             - do not stop to ask for confirmation of inferred dtypes
#   session_id         - fixed random seed so that data splits, cross-validation
#                        folds and model training are reproducible across runs
# NOTE(review): every column not listed in ignore_features is used as a
# feature - confirm no identifier-like column remains among them.
reg = setup(data = train,
             target = 'revenue',
             numeric_imputation = 'mean',
             ignore_features = ['belongs_to_collection', 'homepage', 'original_title', 'overview', 'poster_path',
                               'release_date', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'imdb_id'],
             normalize = True,
             silent = True,
             session_id = 42)

In [None]:
# Evaluate the candidate models; the comparison is restricted to 'rf'.
compare_models(include = ['rf'])

In [None]:
# Create the 'rf' model that is tuned in the next step.
rf = create_model('rf')

In [None]:
# Tune the model created above.
tuned_model = tune_model(rf)

In [None]:
# Apply the tuned model to the prepared test frame.
predictions = predict_model(tuned_model, data = test)

In [None]:
# Display the frame returned by predict_model.
predictions

In [None]:
# Keep only the Label column of the prediction output, as a one-column frame.
pred = pd.DataFrame(predictions.Label)

In [None]:
# Display the extracted predictions.
pred

In [None]:
# Place the submission ids and the predictions side by side (joined on the
# row index), name the two columns, and write the submission file.
datasets = pd.concat(objs=[submission['id'], pred], axis='columns')
datasets.columns = ['id', 'revenue']
datasets.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Display the frame that was written to submission.csv.
datasets