In [1]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from matplotlib import pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import math
import time
import random
import warnings
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, Lars
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge
warnings.filterwarnings('ignore')

Using TensorFlow backend.


## Load data

In [2]:
data = pd.read_csv("s3://ai-diennea/data/export_wonkit_20210630102441.csv.gz")
# convert datetime format
data['EVENT.DATE'] = pd.to_datetime(data['EVENT.DATE'], format='%Y/%m/%d %H:%M')
# add the day of the week column
day_of_week = []
for i in range(len(data)):
    day_of_week.append(data["EVENT.DATE"][i].day_name())
data['day_of_week'] = pd.DataFrame(day_of_week)

df = pd.read_csv("s3://ai-diennea/data/df.csv")
df = df.drop(['Unnamed: 0'], axis=1)
X = pd.read_csv("s3://ai-diennea/data/X.csv")
X = X.drop(['Unnamed: 0'], axis=1)
y = pd.read_csv("s3://ai-diennea/data/y.csv")
y = y.drop(['Unnamed: 0'], axis=1)

In [20]:
sent_open_hour_range = 36
open_click_hour_range = 24

def exp_decay_fit(x, sent_open_hour_range):
    '''Return a value from 0 to 1 following an exponential decreasing function.'''
    if x > sent_open_hour_range*60:
        return .0
    if x < 0:
        return 1
    return math.exp(15*((-math.log(2)/(sent_open_hour_range*60))*x) + math.log(2))/2

def get_all_indexes(hash_mex, hash_contact, data):
    '''Given the hash message, hash contact and raw data, return the indexes
    for the hash message and hash contact in the raw data.ù'''
    return data.index[(data['HashMessaggio'] == hash_mex) & (data['HashContatto'] == hash_contact)]


def from_min_to_hour_and_min(mins):
    '''Given the minutes, return a string that represents the hour and minutes.'''
    hours = int(round(mins)) // 60
    minutes = int(round(mins)) % 60
    return "{}:{}".format(hours, minutes)

def evaluate(df, X, data, sent_open_hour_range, preds, never_opened=True):
    '''Given the prediction of the model "preds" returns:
    - how many fitSA are better than the ground truth fitSA
    - how many fitSA are equal than the ground truth fitSA
    - how many fitSA are worse than the ground truth fitSA.
    If never_opened == True, then consider also the sent mails that have never been opened. Not otherwise.
    '''
    # from [0, 1] to mins
    for i in range(len(preds)):
        preds[i] *= 24*60 # this 24*60 is requierd to convert mins from [0, 1] into real mins
    # get fitSA using the predicted sent
    fitSA_preds = []
    fitSC_preds = []
    for i in range(len(df)):
        fitSA_preds.append(compute_fitSA_evaluation(df['HashMessaggio'][i], df['HashContatto'][i], sent_open_hour_range, data, preds[i], never_opened=never_opened))
        fitSC_preds.append(compute_fitSC_evaluation(df['HashMessaggio'][i], df['HashContatto'][i], sent_open_hour_range, data, preds[i], never_opened=never_opened))
    
    total_mex = len(fitSA_preds)
    predicted_sent_better_usual_sent = 0
    predicted_sent_equal_usual_sent = 0
    predicted_sent_worst_usual_sent = 0
    if never_opened == True:
        for i in range(total_mex):
            if fitSA_preds[i] > X.loc[i, 'fitSA']:
                predicted_sent_better_usual_sent += 1
            elif fitSA_preds[i] == X.loc[i, 'fitSA']:
                predicted_sent_equal_usual_sent += 1
            elif fitSC_preds[i] > X.loc[i, 'fitAC']:
                predicted_sent_better_usual_sent += 1
            elif fitSC_preds[i] == X.loc[i, 'fitAC']:
                predicted_sent_equal_usual_sent += 1
            else:
                predicted_sent_worst_usual_sent += 1 # case fitSC predetta < fitAC reale
        return predicted_sent_better_usual_sent/total_mex, predicted_sent_equal_usual_sent/total_mex, predicted_sent_worst_usual_sent/total_mex
    else:
        total_mex_different_from_mins_one = len([i for i in fitSA_preds if i > -1])
        for i in range(total_mex):
            if fitSA_preds[i] != -1 and fitSA_preds[i] > X.loc[i, 'fitSA']:
                predicted_sent_better_usual_sent += 1
            elif fitSA_preds[i] != -1 and fitSA_preds[i] == X.loc[i, 'fitSA']:
                predicted_sent_equal_usual_sent += 1
            elif fitSC_preds[i] != -1 and fitSC_preds[i] > X.loc[i, 'fitAC']:
                predicted_sent_better_usual_sent += 1
            elif fitSC_preds[i] != -1 and fitSC_preds[i] == X.loc[i, 'fitAC']:
                predicted_sent_equal_usual_sent += 1
            elif fitSA_preds[i] != -1 or fitSC_preds[i] != -1:
                predicted_sent_worst_usual_sent += 1 # case fitSC predetta < fitAC reale    
        return predicted_sent_better_usual_sent/total_mex_different_from_mins_one, predicted_sent_equal_usual_sent/total_mex_different_from_mins_one, predicted_sent_worst_usual_sent/total_mex_different_from_mins_one
    
def compute_fitSA_evaluation(hash_mex, hash_contact, sent_open_hour_range, data, sent_pred, never_opened=True):
    '''Given the hash message, hash contact, raw data and the predicted sent hour,
    return the fitSA for that contact and message.'''
    sent_pred = from_min_to_hour_and_min(sent_pred)
    df2 = data[(data['HashMessaggio'] == hash_mex)]
    df3 = df2[(df2['HashContatto'] == hash_contact)]
    df4 = df3[(df3['EVENT.TYPE'] == 'Open')]
    opens = list(df4['EVENT.DATE'])
    df5 = df3[(df3['EVENT.TYPE'] == 'Click')]
    clicks = list(df5['EVENT.DATE'])
    
    oldest = None
    if opens != []:
        for i in opens:
            if oldest is None:
                oldest = i
            elif i < oldest:
                oldest = i
    elif clicks != []:
        for i in clicks:
            if oldest is None:
                oldest = i
            elif i < oldest:
                oldest = i
    else: # this means that the mail has never been opened/clicked
        if never_opened == True:
            return 0
        else:
            return -1
    oldest = str(oldest.hour) +":"+ str(oldest.minute)
    oldest = pd.to_datetime(oldest, format='%H:%M')
    sent_pred = pd.to_datetime(sent_pred, format='%H:%M')
    # compute minutes of the distance between sent-open/sent-click
    mins = ((oldest - sent_pred).days*24*60) + ((oldest - sent_pred).seconds//3600)*60 + ((oldest - sent_pred).seconds//60)%60
    return exp_decay_fit(mins, sent_open_hour_range)

def compute_fitSC_evaluation(hash_mex, hash_contact, open_click_hour_range, data, sent_pred, never_opened=True):
    '''Given the message, the hash contact and the data (raw data), return the fitAC
    for that hash message and hash contact.'''
    sent_pred = from_min_to_hour_and_min(sent_pred)
    df2 = data[(data['HashMessaggio'] == hash_mex)]
    df3 = df2[(df2['HashContatto'] == hash_contact)]
    df4 = df3[(df3['EVENT.TYPE'] == 'Click')]
    clicks = list(df4['EVENT.DATE'])
    
    oldest_click = None

    if clicks == []: # covers cases when a message is sent and is never open and never clicked
        if never_opened == True:
            return 0
        else:
            return -1
    else:
        for i in clicks: # get oldest click
            if oldest_click is None:
                oldest_click = i
            elif i < oldest_click:
                oldest_click = i
    # compute minutes of the distance between sent pred-click
    oldest_click = str(oldest_click.hour) +":"+ str(oldest_click.minute)
    #print(oldest_click)
    oldest_click = pd.to_datetime(oldest_click, format='%H:%M')
    sent_pred = pd.to_datetime(sent_pred, format='%H:%M')
    # compute minutes of the distance between sent-open/sent-click
    mins = ((oldest_click - sent_pred).days*24*60) + ((oldest_click - sent_pred).seconds//3600)*60 + ((oldest_click - sent_pred).seconds//60)%60
    return exp_decay_fit(mins, sent_open_hour_range)

def split_train_test_by_lifetime(X, df, data, test_size, random_seed):
    d7 = {}
    for i in data["HashContatto"].unique():
        if i not in d7:
            d7[i] = []
    for i in range(len(data)):
        if "nan" not in str(data["EVENT.DATE"][i]) and (str(data["EVENT.TYPE"][i]) == 'Open' or str(data["EVENT.TYPE"][i]) == 'Click'):
            d7[data["HashContatto"][i]].append(data["EVENT.DATE"][i])
    # Here I merge in the same category (assign 0 days) who never opened with the users that opened just once
    for i in data["HashContatto"].unique():
        if len(d7[i]) == 0 or len(d7[i]) == 1:
            d7[i] = 0 # 0 days as lifetime
        else:
        # retain newest and oldest date
            newest_date = d7[i][0] # get the first date
            oldest_date = d7[i][0] # get the first date
            for j in d7[i]:
                if j > newest_date:
                    newest_date = j
                if j < oldest_date:
                    oldest_date = j
            # assign the lifetitme for the contact i
            d7[i] = (newest_date - oldest_date).days
    df['Lifetime'] = 0
    for i in range(len(df)):
        df.loc[i, 'Lifetime'] = d7[df['HashContatto'][i]]
    lt = df['Lifetime'].to_numpy()
    zero = []
    today = []
    between = []
    for i in range(len(lt)):
        if lt[i] == 0:
            zero.append(i)
        elif lt[i] >= 320: # TODO: here I assume that a user still open today whether his lifetime is greater than or equal than 320 (it means that the last time he opened is 1 month ago)
            today.append(i)
        else:
            between.append(i)
            
    random.Random(random_seed).shuffle(zero) # 39%
    random.Random(random_seed).shuffle(today) # 17%
    random.Random(random_seed).shuffle(between) # 43%
    
    percentage_zero_train = round(len(zero) - (test_size * len(zero)))
    percentage_today_train = round(len(today) - (test_size * len(today)))
    percentage_between_train = round(len(between) - (test_size * len(between)))
    
    train_indexes_zero = zero[:percentage_zero_train]
    test_indexes_zero = zero[percentage_zero_train:]

    train_indexes_today = today[:percentage_today_train]
    test_indexes_today = today[percentage_today_train:]

    train_indexes_between = between[:percentage_between_train]
    test_indexes_between = between[percentage_between_train:]
    
    X_train = X.iloc[train_indexes_zero + train_indexes_today + train_indexes_between, :]
    y_train = df.iloc[train_indexes_zero + train_indexes_today + train_indexes_between, :]['Label']
    X_test = X.iloc[test_indexes_zero + test_indexes_today + test_indexes_between, :]
    y_test = df.iloc[test_indexes_zero + test_indexes_today + test_indexes_between, :]['Label']
    
    df_train = df.iloc[train_indexes_zero + train_indexes_today + train_indexes_between, :]
    df_test = df.iloc[test_indexes_zero + test_indexes_today + test_indexes_between, :]
    return X_train, X_test, y_train, y_test, df_train, df_test    


## Algoritmi di learning

### Random Forest regression

Suddivisione **80/20**

In [4]:
# train set
df_train, df_test = train_test_split(df, test_size=0.34, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [None]:
rf = RandomForestRegressor(random_state=42, n_estimators=200, max_features='auto', bootstrap=True, min_samples_split=2, min_samples_leaf=2, oob_score=True, n_jobs=-1)
rf.fit(X_train, y_train)

X_test_fit = X_test.copy()
X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
preds = rf.predict(X_test_fit)

start_time = time.time()
out = evaluate(df_test, X_test, data, sent_open_hour_range, preds)
print("--- %s seconds ---" % (time.time() - start_time))
print(out)

--- 296.559374332428 seconds ---
(0.07084616778401884, 0.9222685269070484, 0.006885305308932777)


### k-fold

In [120]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    rf = RandomForestRegressor(n_estimators=200, max_features='auto', bootstrap=True, min_samples_split=2, min_samples_leaf=2, oob_score=True, n_jobs=-1)
    rf.fit(M_train, g_train)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = rf.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, preds))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

(0.07186416389534987, 0.9212821022681279, 0.006853733836522267)

### Lifetime

In [4]:
for i in [0.2, 0.3, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)

    rf = RandomForestRegressor(random_state=42, n_estimators=200, max_features='auto', bootstrap=True, min_samples_split=2, min_samples_leaf=2, oob_score=True,n_jobs=-1)
    rf.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = rf.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.07169259202217773, 0.9225319574926845, 0.005775450485137841)
(0.07289901945685097, 0.9212485240515427, 0.005852456491606346)
(0.07293318233295583, 0.9208607021517554, 0.006206115515288788)


## Linear Regression

In [103]:
# train set
df_train, df_test = train_test_split(df, test_size=0.30, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [104]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [105]:
X_test_fit = X_test.copy()
X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
preds = model.predict(X_test_fit)

p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p)

(0.07505518763796909, 0.920889162688023, 0.004055649674007906)

### k-fold

In [122]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = LinearRegression()
    model.fit(M_train, g_train)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, p))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

(0.09230224356421632, 0.9024457914186309, 0.005251965017152739)

### Lifetime

In [None]:
for i in [0.2, 0.3, 0.34]:    
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)

    model = LinearRegression()
    model.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)
    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.0950254119821346, 0.9022793777914677, 0.002695210226397659)
(0.08326916166127625, 0.9124185019764876, 0.004312336362236255)
(0.07941109852774632, 0.9147904869762175, 0.00579841449603624)


## Bayesian regression

In [12]:
# train set
df_train, df_test = train_test_split(df, test_size=0.34, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

model = BayesianRidge(tol=0.001, lambda_1=1e-9, lambda_2=0.001, alpha_1=0.001, alpha_2=1e-9)
model.fit(X_train, y_train)

X_test_fit = X_test.copy()
X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
preds = model.predict(X_test_fit)

print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.13988041311831853, 0.8574922993295887, 0.0026272875520927702)


### k-fold

In [123]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = BayesianRidge(tol=0.001, lambda_1=1e-9, lambda_2=0.001, alpha_1=0.001, alpha_2=1e-9)
    model.fit(M_train, g_train)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, preds))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

(0.09636883558319168, 0.898671863736908, 0.004959300679900472)

### Lifetime

In [13]:
for i in [0.2, 0.3, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)

    model = BayesianRidge(tol=0.001, lambda_1=1e-9, lambda_2=0.001, alpha_1=0.001, alpha_2=1e-9)
    model.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.09641152009856768, 0.9008932696750347, 0.002695210226397659)
(0.08403922172596129, 0.9116484419118025, 0.004312336362236255)
(0.07959229898074745, 0.9147451868629671, 0.0056625141562853904)


## Least Angle Regression

In [13]:
# train set
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [14]:
model = Lars(n_nonzero_coefs=1)
model.fit(X_train, y_train)

Lars(n_nonzero_coefs=1)

In [15]:
X_test_fit = X_test.copy()
X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
preds = model.predict(X_test_fit)

evaluate(df_test, X_test, data, sent_open_hour_range, preds)

(0.1558601570922532, 0.8409055906360696, 0.0032342522716771907)

**never_opened=False**

In [21]:
X_test_fit = X_test.copy()
X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
preds = model.predict(X_test_fit)

evaluate(df_test, X_test, data, sent_open_hour_range, preds, never_opened=False)

(0.6644780039395929, 0.07550886408404466, 0.26001313197636244)

### k-fold

In [125]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = Lars(n_nonzero_coefs=1)
    model.fit(M_train, g_train)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, preds))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

(0.16020828923847436, 0.8366959578103156, 0.003095752951209979)

### Lifetime

In [59]:
for i in [0.20, 0.30, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    
    model = model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)
    
    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.15847836131218235, 0.8385954104420145, 0.002926228245803173)
(0.1609425535191745, 0.836182555572668, 0.002874890908157503)
(0.16117780294450737, 0.835832389580974, 0.002989807474518686)


## Neural Networks

In [14]:
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               12672     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 177,537
Trainable params: 177,537
Non-trainable params: 0
_________________________________________________________________


In [18]:
for i in [0.2, 0.3, 0.34]:
    # train set
    df_train, df_test = train_test_split(df, test_size=i, random_state=42)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    # test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    NN_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split = 0.2)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = NN_model.predict(X_test_fit)

    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    print(evaluate(df_test, X_test, data, sent_open_hour_range, p))

Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.1591714153703989, 0.8376713383643924, 0.0031572462652086864)
Train on 36359 samples, validate on 9090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.10544689152420555, 0.8884953026336054, 0.006057805842189024)
Train on 34281 samples, validate on 8571 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.09399347707918101, 0.9000724768979887, 0.005934046022830223)


### k-fold

In [127]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = NN_model
    model.fit(M_train, g_train, epochs=10, batch_size=32, validation_split = 0.2)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, p))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41554 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41554 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


(0.0854323541818472, 0.9075751873066583, 0.006992458511494354)

### Lifetime

In [21]:
for i in [0.2, 0.3, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)

    model = NN_model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split = 0.2)


    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = NN_model.predict(X_test_fit)

    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    print(evaluate(df_test, X_test, data, sent_open_hour_range, p))

Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.09741259818265825, 0.8979670414292314, 0.004620360388110273)
Train on 36359 samples, validate on 9090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.15817033728630833, 0.8397761692078649, 0.002053493505826788)
Train on 34282 samples, validate on 8571 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.07103057757644395, 0.9213590033975085, 0.007610419026047565)


## Regression with CNNs

In [4]:
from PIL import Image
import os, glob, sys, numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from keras.layers import Activation, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers
from keras import losses
from keras import backend as K 
import matplotlib.pyplot as plt
import math
from keras.optimizers import SGD, Adam
from keras import metrics
from keras import models, layers, optimizers 

In [5]:
# train set
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [6]:
rf = RandomForestRegressor(random_state=42, n_estimators=200, max_features='auto', bootstrap=True, min_samples_split=2, min_samples_leaf=2, oob_score=True, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestRegressor(min_samples_leaf=2, n_estimators=200, n_jobs=-1,
                      oob_score=True, random_state=42)

In [7]:
Z = X.copy()

In [8]:
np.sort(rf.feature_importances_)

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.10222816e-10, 2.08272816e-08, 1.30319865e-07, 2.72612664e-07,
       9.29156849e-07, 1.24821780e-06, 2.52331203e-06, 2.86380571e-06,
       4.80629471e-06, 5.86005945e-06, 6.24028196e-06, 6.93961063e-06,
       7.10098302e-06, 7.26015824e-06, 8.15152144e-06, 9.14754292e-06,
       1.36861135e-05, 1.46800102e-05, 1.97310321e-05, 2.09428692e-05,
       2.10419481e-05, 2.12223362e-05, 2.48352894e-05, 2.54027020e-05,
       2.60013243e-05, 2.63220531e-05, 2.63809668e-05, 2.68271019e-05,
       2.73138541e-05, 2.75358324e-05, 2.89053121e-05, 2.97376040e-05,
       3.21101786e-05, 3.41045631e-05, 3.52949784e-05, 3.63827260e-05,
       3.66880625e-05, 3.69092634e-05, 3.95015576e-05, 4.75049159e-05,
       4.87973581e-05, 4.91780192e-05, 5.25719750e-05, 5.41827669e-05,
       5.42325175e-05, 5.43680386e-05, 5.85326750e-05, 6.37560049e-05,
       6.38048684e-05, 6.40836389e-05, 6.53483900e-05, 8.60669319e-05,
      

In [9]:
len(Z.columns[np.where(rf.feature_importances_ > 7.10098302e-06)])

81

In [10]:
Z.drop(Z.columns.difference(Z.columns[np.where(rf.feature_importances_ > 7.10098302e-06)]), axis=1, inplace=True)

In [28]:
droprate=0.25

model = Sequential()
model.add(Conv2D(32, (3,3), padding="same", input_shape=(9,9,1), activation="relu")) 
model.add(BatchNormalization())
model.add(Dropout(droprate))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(32, (3,3), padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(droprate))

model.add(Conv2D(64, (3,3), padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(droprate))

model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(1))

def rmsle(y_test, y_pred):
    return K.sqrt(K.mean(K.square(K.log(y_pred) - K.log(y_test))))

model.compile(loss= 'mean_squared_error', optimizer='adam', metrics=[rmsle])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 9, 9, 32)          320       
_________________________________________________________________
batch_normalization_1 (Batch (None, 9, 9, 32)          128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 9, 9, 32)          0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 4, 4, 32)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 4, 4, 32)          9248      
_________________________________________________________________
batch_normalization_2 (Batch (None, 4, 4, 32)          128       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 2, 2, 32)         

In [39]:
# train set
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
Z_train, Z_test, y_train, y_test = train_test_split(Z, y, test_size=0.3, random_state=42)
Z_train.reset_index(drop=True, inplace=True)
Z_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)


In [40]:
arr = np.array([])
for i in range(len(Z_train)):
    row = Z_train.iloc[i, :].to_numpy().reshape(-1, 1).reshape(9, 9, 1)
    arr = np.append(arr, row)

In [41]:
arr2 = np.array([])
for i in range(len(Z_test)):
    row = Z_test.iloc[i, :].to_numpy().reshape(-1, 1).reshape(9, 9, 1)
    arr2 = np.append(arr2, row)

In [42]:
Z_train_img = arr.reshape(Z_train.shape[0], 9, 9, 1)
Z_test_img = arr2.reshape(Z_test.shape[0], 9, 9, 1)

In [43]:
model.fit(Z_train_img, y_train, batch_size=32, epochs=10, validation_split=0.2)

Train on 36359 samples, validate on 9090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f568a395c88>

In [44]:
Z_test_img_fit = Z_test_img.copy()
Z_test_img_fit[:, 8, 7, 0] = 1 # fitSA
Z_test_img_fit[:, 8, 8, 0] = 1 # fitAC
preds = model.predict(Z_test_img_fit)

In [45]:
preds

array([[0.6296601],
       [0.6286154],
       [0.4848866],
       ...,
       [0.6679155],
       [0.4859561],
       [0.6300905]], dtype=float32)

In [46]:
p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p)

(0.1873299450690487, 0.8115406335027465, 0.0011294214282047332)

### k-fold

In [129]:
M = Z.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    arr = np.array([])
    for i in range(len(M_train)):
        row = M_train.iloc[i, :].to_numpy().reshape(-1, 1).reshape(9, 9, 1)
        arr = np.append(arr, row)
    
    arr2 = np.array([])
    for i in range(len(M_test)):
        row = M_test.iloc[i, :].to_numpy().reshape(-1, 1).reshape(9, 9, 1)
        arr2 = np.append(arr2, row)
    
    M_train_img = arr.reshape(M_train.shape[0], 9, 9, 1)
    M_test_img = arr2.reshape(M_test.shape[0], 9, 9, 1)
    
    model.fit(M_train_img, g_train, batch_size=32, epochs=10, validation_split=0.2)
    
    M_test_img_fit = M_test_img.copy()
    M_test_img_fit[:, 8, 7, 0] = 1 # fitSA
    M_test_img_fit[:, 8, 8, 0] = 1 # fitAC
    preds = model.predict(M_test_img_fit)
    
    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, p))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41554 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41554 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


(0.11711674330004748, 0.8757062518261869, 0.007177004873765598)

### Lifetime

In [12]:
Z = X.copy()
X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=0.3, random_seed=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [13]:
rf = RandomForestRegressor(random_state=42, n_estimators=200, max_features='auto', bootstrap=True, min_samples_split=2, min_samples_leaf=2, oob_score=True, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestRegressor(min_samples_leaf=2, n_estimators=200, n_jobs=-1,
                      oob_score=True, random_state=42)

In [14]:
np.sort(rf.feature_importances_)

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       7.52084304e-11, 9.38162876e-09, 1.09693809e-08, 2.69342123e-08,
       8.65996530e-08, 4.25702457e-07, 2.37661734e-06, 2.71065872e-06,
       4.06580669e-06, 4.21743583e-06, 4.62722029e-06, 6.05378519e-06,
       6.32287580e-06, 6.60320490e-06, 8.05179520e-06, 8.32464737e-06,
       9.93244787e-06, 1.17156543e-05, 1.31925514e-05, 1.61741916e-05,
       1.79186545e-05, 2.17040792e-05, 2.20587716e-05, 2.29594757e-05,
       2.33489365e-05, 2.49766481e-05, 2.59708891e-05, 2.61757940e-05,
       2.63634537e-05, 2.64203060e-05, 2.64454619e-05, 2.65590814e-05,
       2.76705290e-05, 2.84390965e-05, 2.88548401e-05, 2.90051257e-05,
       2.92254031e-05, 3.33811974e-05, 3.34976622e-05, 3.43636092e-05,
       3.46404061e-05, 3.65889321e-05, 3.84824966e-05, 3.86909509e-05,
       4.53776871e-05, 4.60149568e-05, 4.72311998e-05, 4.91061249e-05,
       5.12355836e-05, 5.18259897e-05, 5.39634992e-05, 6.39551095e-05,
      

In [19]:
len(Z.columns[np.where(rf.feature_importances_ > 6.60320490e-06)])

81

In [20]:
Z.drop(Z.columns.difference(Z.columns[np.where(rf.feature_importances_ > 6.60320490e-06)]), axis=1, inplace=True)

In [34]:
# test set
Z_train, Z_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(Z, df, data, test_size=0.34, random_seed=42)
Z_train.reset_index(drop=True, inplace=True)
Z_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [24]:
arr = np.array([])
for i in range(len(Z_train)):
    row = Z_train.iloc[i, :].to_numpy().reshape(-1, 1).reshape(9, 9, 1)
    arr = np.append(arr, row)

In [25]:
arr2 = np.array([])
for i in range(len(Z_test)):
    row = Z_test.iloc[i, :].to_numpy().reshape(-1, 1).reshape(9, 9, 1)
    arr2 = np.append(arr2, row)

In [26]:
Z_train_img = arr.reshape(Z_train.shape[0], 9, 9, 1)
Z_test_img = arr2.reshape(Z_test.shape[0], 9, 9, 1)

In [29]:
model.fit(Z_train_img, y_train, batch_size=32, epochs=10, validation_split=0.2)

Train on 34282 samples, validate on 8571 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f50ec0f9400>

In [30]:
Z_test_img_fit = Z_test_img.copy()
Z_test_img_fit[:, 8, 7, 0] = 1 # fitSA
Z_test_img_fit[:, 8, 8, 0] = 1 # fitAC
preds = model.predict(Z_test_img_fit)

In [37]:
X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=0.34, random_seed=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [38]:
p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p)

(0.16385050962627407, 0.832797281993205, 0.0033522083805209513)