In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [None]:
# Read the train and test CSVs, then show five random training rows.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tmdb-box-office-prediction/train.csv",
        "/kaggle/input/tmdb-box-office-prediction/test.csv",
    )
)
train_df.sample(5)

In [None]:
# Number of missing values in each training column.
train_df.isnull().sum()

In [None]:
# (rows, columns) of the training frame.
train_df.shape

# 1. Budget

In [None]:
# First ten raw budget values.
train_df["budget"].head(10)

In [None]:
# A budget of 0 is treated as a placeholder for "unknown" and replaced by the
# median budget. The median is taken over the non-zero budgets only; otherwise
# the placeholder zeros themselves would pull the imputed value down.
known_budget_median = train_df.loc[train_df["budget"] > 0, "budget"].median()
train_df["budget"] = train_df["budget"].replace(0, known_budget_median)
train_df["budget"].head(10)

In [None]:
# Correlation between budget and revenue (Series.corr default method).
train_df["budget"].corr(train_df["revenue"])

In [None]:
# Distribution of the budget column.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot when the environment is upgraded.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(24, 6))
sns.distplot(train_df["budget"], ax=ax)

# 2. Genres

In [None]:
# First ten raw genres values (still stored as strings at this point).
train_df["genres"].head(10)

In [None]:
def get_dictionary(s):
    """Parse a Python-literal string from the dataset into an object.

    The value is parsed with ``ast.literal_eval`` rather than ``eval`` so
    that text coming from the CSV can never execute code.

    Parameters
    ----------
    s : object
        Usually a string holding a Python literal, e.g.
        ``"[{'id': 35, 'name': 'Comedy'}]"``. Non-string values such as NaN
        are accepted and treated as unparseable.

    Returns
    -------
    object
        The parsed literal, or an empty dict ``{}`` when ``s`` cannot be
        parsed (same fallback value as before, so callers that iterate over
        the result simply see nothing).
    """
    import ast

    try:
        d = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Missing values (NaN) and malformed strings fall back to "no entries".
        d = {}
    return d

# Collapse each parsed genres entry into a sorted, comma-separated string of
# genre names, then expand that string into one 0/1 indicator column per genre.
def _genre_names(raw):
    return ','.join(sorted(entry['name'] for entry in get_dictionary(raw)))

train_df['genres_name'] = train_df['genres'].apply(_genre_names)

genres = train_df['genres_name'].str.get_dummies(sep=',')
genres.head()


In [None]:
# Append the genre indicator columns to the training frame. Any genre columns
# left over from a previous run of this cell are dropped first, so re-running
# the cell does not create duplicate column names.
train_df = pd.concat(
    [train_df.drop(columns=genres.columns, errors="ignore"), genres],
    axis=1,
)

# 3. Original_title

In [None]:
# First ten original titles.
train_df["original_title"].head(10)

In [None]:
# Feature: number of characters in the original title.
train_df['original_title_len'] = train_df['original_title'].map(len)
train_df.head()

In [None]:
# Distribution of the original-title length feature.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(24, 6))
sns.distplot(train_df["original_title_len"], ax=ax)

# 4. Release Date

In [None]:
# First ten raw release dates.
train_df["release_date"].head(10)

In [None]:
# The third '/'-separated field of release_date is taken as the year.
train_df["Year"] = train_df["release_date"].map(lambda date: int(date.split('/')[2]))

# Expand two-digit years: values up to 19 become 20xx, the rest become 19xx.
# Both masks are computed before any update so each row is shifted exactly once.
is_two_digit = train_df["Year"] < 100
is_2000s = is_two_digit & (train_df["Year"] <= 19)
is_1900s = is_two_digit & (train_df["Year"] > 19)
train_df.loc[is_2000s, "Year"] += 2000
train_df.loc[is_1900s, "Year"] += 1900

train_df.sample(5)

In [None]:
# Distribution of the release year feature.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(24, 6))
sns.distplot(train_df["Year"], ax=ax)

# 5. Runtime

In [None]:
# First ten runtime values.
train_df["runtime"].head(10)

In [None]:
# Fill missing runtimes with the mean of the observed runtimes.
mean_runtime = train_df["runtime"].mean()
train_df["runtime"] = train_df["runtime"].fillna(mean_runtime)

In [None]:

# Distribution of runtime after the missing values were filled.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(24, 6))
sns.distplot(train_df["runtime"], ax=ax)

# 6. Status

In [None]:
# Distinct values found in the status column.
set(train_df["status"])

In [None]:
# Bar chart of how many films carry each status value.
# Labels and heights are both taken from one value_counts() result, so each
# bar always sits under its own label (a set has no guaranteed order, and the
# counts are no longer hard-coded to two specific status strings).
status_counts = train_df["status"].value_counts()
objects = list(status_counts.index)
y_pos = np.arange(len(objects))
performance = status_counts.to_numpy()

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Count')
plt.title('Status')

plt.show()

# 7. Tagline

In [None]:
# First ten taglines (missing ones show up as NaN).
train_df["tagline"].head(10)

In [None]:
# Feature flag: True where the tagline is MISSING, False where one is present.
train_df['tagline_boolean'] = train_df['tagline'].isnull()

In [None]:
# Distinct values of the tagline flag.
set(train_df["tagline_boolean"])

In [None]:
# Bar chart of the tagline flag. tagline_boolean is True when the tagline is
# missing, so the title asks exactly that question.
# Counts are reindexed to a fixed [False, True] order and the tick labels are
# derived from the same Series, so labels and bars cannot get out of sync.
tagline_counts = (
    train_df['tagline_boolean']
    .value_counts()
    .reindex([False, True], fill_value=0)
)
objects = [str(flag) for flag in tagline_counts.index]
y_pos = np.arange(len(objects))
performance = tagline_counts.to_numpy()

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Count')
plt.title('Is the tagline missing?')

plt.show()

# 8. Title

In [None]:
# First ten titles.
train_df["title"].head(10)

In [None]:
# Heuristic sequel flag: True when the title contains at least one digit.
# NOTE(review): this also flags titles whose digits are not a sequel number.
train_df["sequel"] = train_df["title"].map(
    lambda title: re.search(r'\d', title) is not None
)

In [None]:
# Distinct values of the sequel flag.
set(train_df["sequel"])

In [None]:
# Bar chart of the sequel flag.
# Counts are reindexed to a fixed [False, True] order and the tick labels are
# derived from the same Series, so labels and bars cannot get out of sync
# (a set has no guaranteed iteration order).
sequel_counts = (
    train_df['sequel']
    .value_counts()
    .reindex([False, True], fill_value=0)
)
objects = [str(flag) for flag in sequel_counts.index]
y_pos = np.arange(len(objects))
performance = sequel_counts.to_numpy()

plt.bar(y_pos, performance, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Count')
plt.title('Is it a sequel?')

plt.show()

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Preview the frame with all engineered columns attached.
train_df.head()

In [None]:
# List every column currently in the training frame.
train_df.columns

In [None]:
# Columns passed to the model as features: raw fields (budget,
# original_language, runtime), the features engineered in earlier cells
# (original_title_len, tagline_boolean, sequel, Year) and the per-genre
# indicator columns.
# NOTE(review): the genre names are hard-coded; presumably they match the
# columns produced by get_dummies on this dataset — confirm before reusing.
Selcol = ['budget', 'original_language', 'runtime', 'original_title_len', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History',
       'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie',
       'Thriller', 'War', 'Western', 'tagline_boolean', 'sequel', 'Year']

In [None]:
# label encoder
# Create the encoder and collect the selected feature columns whose dtype is
# object; those are the ones that must be turned into integers.
le = LabelEncoder()
object_features = train_df[Selcol].select_dtypes(include=['object'])
obj_columns = list(object_features.columns)

In [None]:
# applying label encoder
# Integer-encode every object-typed feature column in place.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the
# exact same category -> integer mapping can be applied to other data later
# (a single shared encoder would only remember the last column it was fit on).
label_encoders = {}
for col in obj_columns:
    label_encoders[col] = LabelEncoder()
    train_df[col] = label_encoders[col].fit_transform(train_df[col])


# Modelling

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an ordinary least-squares model of revenue on the selected features.
features = train_df[Selcol]
target = train_df["revenue"]
model = LinearRegression().fit(features, target)

In [None]:
# Store the in-sample predictions next to the true revenue.
# The prediction array is assigned directly (row by row, positionally); wrapping
# it in a DataFrame would align on index labels and produce NaN for any row
# whose label is not in 0..n-1.
train_df['predicted_revenue'] = model.predict(train_df[Selcol])

In [None]:
# Squared residual for every training row.
residual = train_df["revenue"] - train_df["predicted_revenue"]
train_df["error_square"] = residual ** 2

In [None]:
# Root-mean-squared error of the predictions on the training rows.
train_df["error_square"].mean() ** 0.5

In [None]:
from sklearn import metrics
# R^2 of the predictions against the true revenue, computed on the same
# training rows the model was fit on (an in-sample score, not a hold-out one).
metrics.r2_score(train_df["revenue"], train_df["predicted_revenue"])