## Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import scipy.stats

## Read data

In [2]:
# Original training data, read from the notebook's working directory.
DATA_CSV = 'data.csv'
train_df = pd.read_csv(DATA_CSV)

## Functions


In [3]:
def drop_columns(df, column_names):
    """
    Return a new dataframe without the requested columns.

    df: input dataframe (it is not modified)
    column_names: list of column names to remove
    return: dataframe with dropped columns

    """
    # DataFrame.drop without inplace=True hands back a new frame, so the
    # caller's dataframe keeps all of its columns.
    return df.drop(columns=column_names)



# Preprocessing

### 1- First, take a general look at the data

In [4]:
# Show every row except the last five (the notebook truncates the display).
train_df.iloc[:-5]

Unnamed: 0.1,Unnamed: 0,Episode,Station,Channel Type,Season,Year,Date,Day of week,Start_time,End_time,Length,Name of show,Name of episode,Genre,First time or rerun,# of episode in the season,Movie?,Game of the Canadiens during episode?,Market Share_total,Temperature in Montreal during episode
0,1,Vidéoclips V,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 06:00:00,2016-08-29 08:00:00,8,Vidéoclips V,,Music Video Clips,No,Yes,No,No,0.9,20.400
1,2,Apollo dans l'frigo,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 08:00:00,2016-08-29 08:30:00,2,Apollo dans l'frigo,,Informal Education and Recreation and Leisure,No,Yes,No,No,0.5,19.125
2,3,Infopublicité,V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 08:30:00,2016-08-29 09:00:00,2,Infopublicité,,"Infomercials, Promotional and Corporate Videos",No,Yes,No,No,0.3,19.125
3,4,"Infos, Les",V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 09:00:00,2016-08-29 10:00:00,4,"Infos, Les",,News,No,Yes,No,No,1.7,18.125
4,5,"Souper presque parfait, Un",V Total,General Channel,Fall,2016,2016-08-29,Monday,2016-08-29 10:00:00,2016-08-29 10:30:00,2,"Souper presque parfait, Un",,Reality Programs,No,Yes,No,No,2.2,18.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616646,616647,Structures abandonnées,Canal D+,Specialty Channel,Winter,2019,2019-01-22,Tuesday,2019-01-22 01:00:00,2019-01-22 02:00:00,4,Structures abandonnées,Ambitions dépassées,Long-form Documentary,No,Yes,No,No,5.8,-18.575
616647,616648,VRAK la vie (M),VRAK+,Specialty Channel,Winter,2019,2019-01-22,Tuesday,2019-01-22 06:00:00,2019-01-22 06:30:00,2,VRAK la vie (M),La fin du monde,Ongoing Comedy Series (Sitcoms),No,Yes,No,No,0.2,-21.300
616648,616649,VRAK la vie (M),VRAK+,Specialty Channel,Winter,2019,2019-01-22,Tuesday,2019-01-22 06:30:00,2019-01-22 07:00:00,2,VRAK la vie (M),Le parcours de bonbons maudit,Ongoing Comedy Series (Sitcoms),No,Yes,No,No,0.5,-21.300
616649,616650,VRAK la vie (M),VRAK+,Specialty Channel,Winter,2019,2019-01-22,Tuesday,2019-01-22 07:00:00,2019-01-22 07:30:00,2,VRAK la vie (M),La fanatique,Ongoing Comedy Series (Sitcoms),No,Yes,No,No,0.7,-21.775


#### The raw frame has 20 columns (including the target `Market Share_total`) and 616,656 rows

In [5]:
# (number of rows, number of columns)
(len(train_df.index), len(train_df.columns))

(616656, 20)

### Check number of unique values of features

In [6]:
# Print the number of distinct values per column; Series.unique() keeps NaN
# as one of the distinct values, so columns with missing data count it too.
for col in train_df.columns:
    distinct_count = len(train_df[col].unique())
    print(f"column:{col!s} ---------> {distinct_count}")

column:Unnamed: 0 ---------> 616656
column:Episode ---------> 6687
column:Station ---------> 24
column:Channel Type ---------> 2
column:Season ---------> 4
column:Year ---------> 4
column:Date ---------> 877
column:Day of week ---------> 7
column:Start_time ---------> 138322
column:End_time ---------> 138334
column:Length ---------> 39
column:Name of show ---------> 6687
column:Name of episode ---------> 86557
column:Genre ---------> 27
column:First time or rerun ---------> 2
column:# of episode in the season ---------> 2
column:Movie? ---------> 2
column:Game of the Canadiens during episode? ---------> 2
column:Market Share_total ---------> 545
column:Temperature in Montreal during episode ---------> 11829


## Features containing NaN values

In [7]:
# Names of the columns that contain at least one NaN, in frame order.
[name for name, has_nan in train_df.isna().any().items() if has_nan]

['Start_time',
 'End_time',
 'Name of episode',
 'Temperature in Montreal during episode']

## 2- Primary feature selection

### `Episode` and `Name of show` are identical columns, so we keep only one of them

In [8]:
# Verify that the two columns agree on every row (element-wise comparison,
# reduced with a vectorized .all()).
same_name_mask = train_df['Episode'] == train_df['Name of show']
assert same_name_mask.all(), "Columns are not same"

### The difference between end time and start time is highly correlated with `Length` (Pearson correlation coefficient of about 0.995). `Start_time` and `End_time` also contain NaN values, so we can remove both features from the data

In [9]:
# Work on a filtered copy so that train_df itself is left untouched.
timed_rows = train_df[train_df['Start_time'].notna()].copy()
start_ts = pd.to_datetime(timed_rows['Start_time'])
end_ts = pd.to_datetime(timed_rows['End_time'])

# Duration in hours. NOTE(review): .dt.seconds is only the seconds component
# of each timedelta (0 <= s < 1 day); it equals the full duration only if
# every show ends after it starts and lasts less than a day - confirm.
timed_rows['Time_diff'] = (end_ts - start_ts).dt.seconds / 3600

print(scipy.stats.pearsonr(timed_rows["Time_diff"], timed_rows["Length"]))
del timed_rows

(0.9949798563110538, 0.0)


### Convert Date feature to Month and Day

In [10]:
# Parse the date strings once, then derive the calendar month and day of
# month as separate numeric features.
parsed_dates = pd.to_datetime(train_df['Date'])
train_df['Date'] = parsed_dates
train_df['Month'] = parsed_dates.dt.month
train_df['Day'] = parsed_dates.dt.day

### About 36% of the "Name of episode" values are NaN, and we cannot fill them by interpolation or a similar approach, so we can drop this feature

In [11]:
# Percentage of rows whose episode name is missing.
missing_name_count = int(train_df['Name of episode'].isnull().sum())
100 * missing_name_count / len(train_df)

36.09451622946992

### Conclusion: These features should be dropped

In [12]:
# Columns removed before modelling.
column_names = [
    'Unnamed: 0',
    'Date',             # replaced by the derived Month / Day columns
    'Start_time',
    'End_time',
    'Name of show',     # same content as Episode
    'Name of episode',
]

### The "Temperature in Montreal during episode" feature has NaN values; I chose linear interpolation to fill them

In [13]:
# Fill missing temperatures by linear interpolation (the default method of
# Series.interpolate) between neighbouring rows.
# The result is assigned back rather than calling interpolate(inplace=True)
# on the selected column: with pandas Copy-on-Write, an in-place call on
# train_df[...] only changes a temporary object and leaves train_df with its
# NaN values.
temperature_col = 'Temperature in Montreal during episode'
train_df[temperature_col] = train_df[temperature_col].interpolate()

## Label Encoding with simple label encoder

In [14]:
# Remove the columns ruled out above, then separate the regression target
# from the feature columns.
# NOTE: an earlier draft ran the same drop/split steps on a separate test_df;
# no test_df is created in the visible cells, so only the training frame is
# processed here.
temp_train_df = drop_columns(train_df, column_names)
train_target_df = temp_train_df['Market Share_total']
train_df = temp_train_df.drop(columns=['Market Share_total'])

# These columns are left as they are; every other column is replaced by
# integer codes (shifted by one so the codes start at 1).
untouched_columns = ['Temperature in Montreal during episode', 'Year', 'Length', 'Month', 'Day']
columns_to_encode = [name for name in train_df.columns if name not in untouched_columns]

le = preprocessing.LabelEncoder()
for column in columns_to_encode:
    # The encoder is refit for each column, so after the loop `le` only holds
    # the mapping of the last encoded column.
    train_df[column] = le.fit_transform(train_df[column]) + 1

### Standardize our data (zero mean and unit variance with `StandardScaler`)

In [15]:
# Standardize features and target with two separate scalers. Refitting a
# single StandardScaler on the target would overwrite the statistics learned
# from the features, making it impossible to transform new feature rows later.
# Both scalers are fitted on all rows, i.e. before any train/validation split.
feature_scaler = StandardScaler()
Normalized_train_arr = feature_scaler.fit_transform(train_df)

# `scaler` stays bound to the *target* scaler: code that calls
# scaler.inverse_transform(...) on target values keeps working unchanged.
scaler = StandardScaler()
Normalized_train_target_arr = scaler.fit_transform(train_target_df.values.reshape(-1,1))

## Use shuffle-split cross-validation: in each of the 5 splits, 70% of the rows are used for training and 30% for validation
## Fit a RandomForest regressor with 12 estimators on each split
## Metrics are R², MAE, and the Pearson and Spearman correlation coefficients

In [16]:
# Five random 70/30 train/validation splits with a fixed seed.
ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
preds = []  # not filled by this loop
reals = []  # not filled by this loop
r2_scores_list = []
mae_list = []
pcc = []
spc = []

for train_index, val_index in ss.split(Normalized_train_arr):

    train_X = Normalized_train_arr[train_index]
    train_Y = Normalized_train_target_arr[train_index]

    validation_X = Normalized_train_arr[val_index]
    validation_y = Normalized_train_target_arr[val_index]
    # The scaled target is an (n, 1) column; the metrics below take 1-D input.
    validation_y_flat = validation_y.ravel()

    regr = RandomForestRegressor(n_estimators=12, random_state=0, n_jobs=-1)
    # Pass a 1-D target: a column vector makes fit() emit a
    # DataConversionWarning on every split.
    regr.fit(train_X, train_Y.ravel())
    pred_y = regr.predict(validation_X)

    # Model Metrics Calculation
    # R^2 on the predictions already computed above; regr.score() would run
    # the whole validation set through the forest a second time.
    r2_scores_list.append(r2_score(validation_y_flat, pred_y))
    # MAE is reported in the original target units, hence the inverse scaling.
    mae_list.append(
        mean_absolute_error(
            scaler.inverse_transform(validation_y),
            scaler.inverse_transform(pred_y.reshape(-1, 1)),
        )
    )
    # Pearson Correlation Coefficient and Spearman Correlation Coefficient Calculations
    pcc.append(scipy.stats.pearsonr(pred_y, validation_y_flat)[0])
    spc.append(scipy.stats.spearmanr(pred_y, validation_y_flat)[0])
    



### Calculate Mean value of model metrics and statistical metrics

In [17]:
# Report each metric averaged over the cross-validation splits.
metric_summary = (
    ("R Square mean value: ", r2_scores_list),
    ("MAE mean value: ", mae_list),
    ("Pearson Correlation Coefficient: ", pcc),
    ("Spearman Correlation Coefficient: ", spc),
)
for metric_label, split_values in metric_summary:
    print(metric_label, str(np.mean(split_values)))

R Square mean value:  0.8566461292615359
MAE mean value:  1.10838557054771
Pearson Correlation Coefficient:  0.9258654084141197
Spearman Correlation Coefficient:  0.8056042127167841
