## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import scipy.stats

## Read data

In [2]:
# Raw inputs, read from CSV files in the notebook's working directory.
train_df, test_df = (
    pd.read_csv('data.csv'),  # original train data
    pd.read_csv('test.csv'),  # original test data
)

## Functions


In [3]:
def drop_columns(df, column_names):
    """
    Return a copy of the dataframe without the given columns.

    df: input dataframe (it is deep-copied first and never modified)
    column_names: list of column names to remove
    return: new dataframe lacking those columns

    """
    trimmed = df.copy(deep=True)
    return trimmed.drop(labels=column_names, axis="columns")


# Preprocessing

### Convert Date feature to Month and Day

In [4]:
# Parse the raw 'Date' column and derive 'Month' / 'Day' features.
# Each frame derives the features from its OWN dates: assigning a Series taken
# from the other frame would be aligned on the row index and give test rows
# the train set's dates (or NaN where the indexes do not overlap).
for frame in (train_df, test_df):
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame['Month'] = frame['Date'].dt.month
    frame['Day'] = frame['Date'].dt.day

In [5]:
# Columns passed to drop_columns() for both the train and the test frame.
column_names = [
    'Unnamed: 0',
    'Date',
    'Start_time',
    'End_time',
    'Name of show',
    'Name of episode',
]

### The "Temperature in Montreal during episode" feature contains NaN values; we fill them using linear interpolation

In [6]:
# Fill missing temperatures by linear interpolation (the default method).
# The result is assigned back to the column instead of calling
# `.interpolate(inplace=True)` on the selected column: that chained in-place
# form operates on a temporary object and does not update the frame when
# pandas Copy-on-Write is active.
# NOTE(review): with the default settings NaNs that precede the first valid
# value are left unfilled - confirm the data has no leading gaps.
temperature_column = 'Temperature in Montreal during episode'
train_df[temperature_column] = train_df[temperature_column].interpolate()
test_df[temperature_column] = test_df[temperature_column].interpolate()

## Label Encoding with simple label encoder

In [7]:
# Remove the columns listed in `column_names` from both frames.
temp_train_df = drop_columns(train_df, column_names)
temp_test_df = drop_columns(test_df, column_names)

# Separate the regression target from the training features.
train_target_df = temp_train_df['Market Share_total']
train_df = temp_train_df.drop(columns=['Market Share_total'])
test_df = temp_test_df.copy(deep=True)

# Stack train and test under a keyed outer index level so every column is
# encoded with one shared mapping and the two parts can be split again below.
all_data = pd.concat([train_df, test_df], keys=['train', 'test'])

le = preprocessing.LabelEncoder()

# Columns that are left as they are; every other column is label-encoded.
untouched_columns = ['Temperature in Montreal during episode', 'Year', 'Length', 'Month', 'Day']
columns_to_encode = [name for name in train_df.columns if name not in untouched_columns]

for item in columns_to_encode:
    # The encoder is re-fitted per column on train + test values; codes start at 1.
    all_data[item] = le.fit_transform(all_data[item]) + 1

# Split the encoded data back into its two parts.
test_df = all_data.xs('test')
train_df = all_data.xs('train')


### Standardize our data (zero mean, unit variance)

In [8]:
# Standardize the features with a scaler fitted on the training features only;
# the test features reuse that same fit.
feature_scaler = StandardScaler()
Normalized_train_arr = feature_scaler.fit_transform(train_df)
Normalized_test_arr = feature_scaler.transform(test_df)

# The target gets its own scaler so the fitted feature scaler is not overwritten.
# `scaler` is kept as the name of the target scaler because the results cell
# calls `scaler.inverse_transform` on the predictions.
scaler = StandardScaler()
Normalized_train_target_arr = scaler.fit_transform(train_target_df.values.reshape(-1,1))



## Train and Test

In [9]:
# Standardized features / target produced by the scaling cell.
train_X = Normalized_train_arr
train_Y = Normalized_train_target_arr

test_X = Normalized_test_arr

# Fixed random_state keeps the forest reproducible between runs.
regr = RandomForestRegressor(n_estimators=12, random_state=0, n_jobs=-1)
# train_Y is a column vector of shape (n_samples, 1); pass it flattened to
# (n_samples,) so scikit-learn does not raise a DataConversionWarning.
regr.fit(train_X, train_Y.ravel())
pred_y = regr.predict(test_X)


  import sys


In [10]:
# `scaler` was last fitted on the target column, so inverse_transform maps the
# standardized predictions back to the original target scale.
results = scaler.inverse_transform(pred_y.reshape(-1,1))
# comments="" stops numpy from prefixing the header line with "# ", so the
# file starts with a plain "Market Share_total" CSV header.
np.savetxt("results.csv", results, delimiter=",", header="Market Share_total", comments="")