In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 100

## 1. Data Overview

In [None]:
# Load the dataset from TVdata.txt into a DataFrame; read_csv is called with
# its defaults, so the file is parsed as comma-separated text.
data = pd.read_csv(filepath_or_buffer='TVdata.txt')

In [None]:
# Preview the first five rows to sanity-check how the file was parsed.
data.head(n=5)

In [None]:
# Report the table dimensions; DataFrame.shape unpacks to (rows, columns).
print("The dataset contains {0} rows and {1} columns".format(*data.shape))

### 1.1 Check if there is duplicated data

In [None]:
# True when at least one video_id value appears on more than one row.
any(data.duplicated(subset='video_id'))

In [None]:
# Display the column labels of the loaded table.
data.columns

In [None]:
# Summary statistics for the table with video_id left out, reporting the
# 10th/25th/50th/75th/90th/100th percentiles instead of the default quartiles.
data.drop(columns='video_id').describe(
    percentiles=[0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
)

In [None]:
# Columns treated as numeric in the plots and correlations that follow.
# NOTE(review): 'weighted_horizontal_poition' is kept exactly as written;
# presumably it matches the header in the data file -- verify before "fixing".
num_cols = [
    'cvt_per_day',
    'weighted_categorical_position',
    'weighted_horizontal_poition',
    'release_year',
    'imdb_votes',
    'budget',
    'boxoffice',
    'imdb_rating',
    'duration_in_mins',
    'metacritic_score',
    'star_category',
]

# Columns holding string / categorical values.
str_cols = [
    'import_id',
    'genres',
    'awards',
    'mpaa',
]

### 1.2 Distribution of numerical columns

In [None]:
# Histogram of every column in num_cols, laid out three plots per row.
# The row count is derived from len(num_cols) so the grid always has room
# (a fixed 4x3 grid raises IndexError once there are more than 12 columns),
# and any panels left over are hidden rather than drawn as empty frames.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# histplot; it is kept here so the notebook still runs on older seaborn.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(num_cols) // n_plot_cols))  # ceiling division
fig, ax = plt.subplots(
    nrows=n_plot_rows,
    ncols=n_plot_cols,
    figsize=(16, 16),
    squeeze=False,  # always return a 2-D array of axes, even for a single row
)
panels = ax.ravel()
for panel, col in zip(panels, num_cols):
    sns.distplot(data[col], kde=False, ax=panel)
# Switch off the axes that did not receive a histogram.
for panel in panels[len(num_cols):]:
    panel.set_visible(False)

### 1.2.1 Finer plot of 'cvt_per_day'

In [None]:
# Zoomed-in histogram of the target: bin edges every 50 units taken from
# range(0, 15000, 50), drawn in red without a KDE curve.
sns.distplot(
    data['cvt_per_day'],
    bins=range(0, 15000, 50),
    color='red',
    kde=False,
)

### 1.3 Correlations

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heat map.
corr_matrix = data[num_cols].corr()
sns.heatmap(corr_matrix)

### 1.4 Zero counts

In [None]:
# For each column, count how many entries are exactly equal to zero.
data.eq(0).sum()

In [None]:
# df = data[(data != 0).all(1)]
# sns.heatmap(df[num_cols].corr())

In [None]:
# fig, ax = plt.subplots(nrows=4,ncols=3, figsize=(12,14))
# for i in range(len(num_cols)):
#     col = num_cols[i]
#     sns.distplot(df[col], kde= False, ax = ax[i//3][i%3])  

### 1.5 Categorical columns

In [None]:
# Frequency of each import_id value, followed by a jittered strip plot of
# cvt_per_day grouped by import_id.
import_id_counts = data['import_id'].value_counts()
print(import_id_counts)
sns.stripplot(
    data=data,
    x="import_id",
    y="cvt_per_day",
    jitter=True,
)

In [None]:
# Frequency of each awards value, followed by a jittered strip plot of
# cvt_per_day grouped by awards.
awards_counts = data['awards'].value_counts()
print(awards_counts)
sns.stripplot(
    data=data,
    x="awards",
    y="cvt_per_day",
    jitter=True,
)

In [None]:
# Frequency of each mpaa value, followed by a jittered strip plot of
# cvt_per_day grouped by mpaa.
mpaa_counts = data['mpaa'].value_counts()
print(mpaa_counts)
sns.stripplot(
    data=data,
    x="mpaa",
    y="cvt_per_day",
    jitter=True,
)

In [None]:
# Distinct genre labels found in the comma-separated 'genres' column.
# - Missing entries are dropped first: splitting leaves NaN in place, and
#   iterating over a NaN float would raise TypeError.
# - The result is sorted so the list is identical on every kernel run; the
#   iteration order of a set of strings changes with hash randomisation.
genres = sorted(
    {
        label
        for labels in data['genres'].dropna().str.split(',')
        for label in labels
    }
)
genres

In [None]:
# One-hot encode the three categorical columns; each is replaced by one
# indicator column per distinct value. `data` is rebound to the encoded frame,
# so the original import_id / awards / mpaa columns are gone after this cell
# (re-running it without reloading the data will fail on the missing columns).
data = pd.get_dummies(
    data=data,
    columns=[
        'import_id',
        'awards',
        'mpaa',
    ],
)

In [None]:
# Preview the first five rows of the one-hot encoded table.
data.head(n=5)

In [None]:
# Modelling frame: everything in `data` except the raw 'genres' string column.
df = data.drop(columns='genres')

In [None]:
# Preview the first five rows of the modelling frame.
df.head(n=5)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random-forest regressor: 100 trees, each limited to depth 5, with a fixed
# random_state so repeated fits give the same model.
regr = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=0,
)

In [None]:
# Target: cvt_per_day.
# Features: every other column of df except video_id. video_id is the key used
# for the duplicate check and is already excluded from the summary statistics;
# leaving it in X lets the forest split on an identifier, which adds noise and
# distorts the reported feature importances.
y = df['cvt_per_day']
X = df.drop(columns=['cvt_per_day', 'video_id'])

In [None]:
# Fit the forest on the full feature matrix and target (no hold-out split here).
regr.fit(X=X, y=y)

In [None]:
# Print the fitted forest's importance score for each feature; the values
# follow the column order of X.
feature_scores = regr.feature_importances_
print(feature_scores)