In [1]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dropout
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from matplotlib import pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import math
import os
import tensorflow as tf
import time
import random
import warnings
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge, Lars
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge
warnings.filterwarnings('ignore')

Using TensorFlow backend.


## Load data

In [39]:
data = pd.read_csv("s3://ai-diennea/data/export_wonkit_20210630102441.csv.gz")
# convert datetime format
data['EVENT.DATE'] = pd.to_datetime(data['EVENT.DATE'], format='%Y/%m/%d %H:%M')
# add the day of the week column
day_of_week = []
for i in range(len(data)):
    day_of_week.append(data["EVENT.DATE"][i].day_name())
data['day_of_week'] = pd.DataFrame(day_of_week)

In [174]:
sent_open_hour_range = 36
open_click_hour_range = 24

def exp_decay_fit_eval(x, sent_open_hour_range):
    '''Return a value from 0 to 1 following an exponential decreasing function.'''
    if x < 0:
        x = (60*24)+x
    if x > sent_open_hour_range*60:
        return .0
    return math.exp(90*((-math.log(2)/(sent_open_hour_range*60))*x) + math.log(2))/2

def get_all_indexes(hash_mex, hash_contact, data):
    '''Given the hash message, hash contact and raw data, return the indexes
    for the hash message and hash contact in the raw data.ù'''
    return data.index[(data['HashMessaggio'] == hash_mex) & (data['HashContatto'] == hash_contact)]


def from_min_to_hour_and_min(mins):
    '''Given the minutes, return a string that represents the hour and minutes.'''
    hours = int(round(mins)) // 60
    minutes = int(round(mins)) % 60
    return "{}:{}".format(hours, minutes)

    
def compute_fitSA_evaluation(hash_mex, hash_contact, sent_open_hour_range, data, sent_pred, never_opened=True):
    '''Given the hash message, hash contact, raw data and the predicted sent hour,
    return the fitSA for that contact and message.'''
    sent_pred = from_min_to_hour_and_min(sent_pred)
    df2 = data[(data['HashMessaggio'] == hash_mex)]
    df3 = df2[(df2['HashContatto'] == hash_contact)]
    df4 = df3[(df3['EVENT.TYPE'] == 'Open')]
    opens = list(df4['EVENT.DATE'])
    df5 = df3[(df3['EVENT.TYPE'] == 'Click')]
    clicks = list(df5['EVENT.DATE'])
    df6 = df3[(df3['EVENT.TYPE'] == 'Sent')]
    sents = list(df6['EVENT.DATE'])
    
    oldest = None
    if opens != []:
        for i in opens:
            if oldest is None:
                oldest = i
            elif i < oldest:
                oldest = i
    elif clicks != []:
        for i in clicks:
            if oldest is None:
                oldest = i
            elif i < oldest:
                oldest = i
    else: # this means that the mail has never been opened/clicked
        if never_opened == True:
            return 0
        else:
            return -1
    giorni_di_differenza_tra_real_open_and_real_sent = (oldest.year*12*30 + oldest.month*30 + oldest.day) - (sents[0].year*12*30 + sents[0].month*30 + sents[0].day)
    oldest = str(oldest.hour) +":"+ str(oldest.minute)
    oldest = pd.to_datetime(oldest, format='%H:%M')
    sent_pred = pd.to_datetime(sent_pred, format='%H:%M')
    # compute minutes of the distance between sent-open/sent-click
    mins = ((oldest - sent_pred).seconds//3600)*60 + ((oldest - sent_pred).seconds//60)%60
    if giorni_di_differenza_tra_real_open_and_real_sent > 0:
        mins = mins + (giorni_di_differenza_tra_real_open_and_real_sent*60*24)
    return exp_decay_fit_eval(mins, sent_open_hour_range)

def compute_fitSC_evaluation(hash_mex, hash_contact, open_click_hour_range, data, sent_pred, never_opened=True):
    '''Given the message, the hash contact and the data (raw data), return the fitAC
    for that hash message and hash contact.'''
    sent_pred = from_min_to_hour_and_min(sent_pred)
    df2 = data[(data['HashMessaggio'] == hash_mex)]
    df3 = df2[(df2['HashContatto'] == hash_contact)]
    df4 = df3[(df3['EVENT.TYPE'] == 'Click')]
    clicks = list(df4['EVENT.DATE'])
    df5 = df3[(df3['EVENT.TYPE'] == 'Sent')]
    sents = list(df5['EVENT.DATE'])
    
    oldest_click = None

    if clicks == []: # covers cases when a message is sent and is never open and never clicked
        if never_opened == True:
            return 0
        else:
            return -1
    else:
        for i in clicks: # get oldest click
            if oldest_click is None:
                oldest_click = i
            elif i < oldest_click:
                oldest_click = i
    giorni_di_differenza_tra_real_click_and_real_sent = (oldest_click.year*12*30 + oldest_click.month*30 + oldest_click.day) - (sents[0].year*12*30 + sents[0].month*30 + sents[0].day)
    # compute minutes of the distance between sent pred-click
    oldest_click = str(oldest_click.hour) +":"+ str(oldest_click.minute)
    #print(oldest_click)
    oldest_click = pd.to_datetime(oldest_click, format='%H:%M')
    sent_pred = pd.to_datetime(sent_pred, format='%H:%M')
    # compute minutes of the distance between sent-open/sent-click
    mins = ((oldest_click - sent_pred).seconds//3600)*60 + ((oldest_click - sent_pred).seconds//60)%60
    if giorni_di_differenza_tra_real_click_and_real_sent > 0:
        mins = mins + (giorni_di_differenza_tra_real_click_and_real_sent*60*24)
    return exp_decay_fit_eval(mins, sent_open_hour_range)

def split_train_test_by_lifetime(X, df, data, test_size, random_seed):
    d7 = {}
    for i in data["HashContatto"].unique():
        if i not in d7:
            d7[i] = []
    for i in range(len(data)):
        if "nan" not in str(data["EVENT.DATE"][i]) and (str(data["EVENT.TYPE"][i]) == 'Open' or str(data["EVENT.TYPE"][i]) == 'Click'):
            d7[data["HashContatto"][i]].append(data["EVENT.DATE"][i])
    # Here I merge in the same category (assign 0 days) who never opened with the users that opened just once
    for i in data["HashContatto"].unique():
        if len(d7[i]) == 0 or len(d7[i]) == 1:
            d7[i] = 0 # 0 days as lifetime
        else:
        # retain newest and oldest date
            newest_date = d7[i][0] # get the first date
            oldest_date = d7[i][0] # get the first date
            for j in d7[i]:
                if j > newest_date:
                    newest_date = j
                if j < oldest_date:
                    oldest_date = j
            # assign the lifetitme for the contact i
            d7[i] = (newest_date - oldest_date).days
    df['Lifetime'] = 0
    for i in range(len(df)):
        df.loc[i, 'Lifetime'] = d7[df['HashContatto'][i]]
    lt = df['Lifetime'].to_numpy()
    zero = []
    today = []
    between = []
    for i in range(len(lt)):
        if lt[i] == 0:
            zero.append(i)
        elif lt[i] >= 320: # TODO: here I assume that a user still open today whether his lifetime is greater than or equal than 320 (it means that the last time he opened is 1 month ago)
            today.append(i)
        else:
            between.append(i)
            
    random.Random(random_seed).shuffle(zero) # 39%
    random.Random(random_seed).shuffle(today) # 17%
    random.Random(random_seed).shuffle(between) # 43%
    
    percentage_zero_train = round(len(zero) - (test_size * len(zero)))
    percentage_today_train = round(len(today) - (test_size * len(today)))
    percentage_between_train = round(len(between) - (test_size * len(between)))
    
    train_indexes_zero = zero[:percentage_zero_train]
    test_indexes_zero = zero[percentage_zero_train:]

    train_indexes_today = today[:percentage_today_train]
    test_indexes_today = today[percentage_today_train:]

    train_indexes_between = between[:percentage_between_train]
    test_indexes_between = between[percentage_between_train:]
    
    X_train = X.iloc[train_indexes_zero + train_indexes_today + train_indexes_between, :]
    y_train = df.iloc[train_indexes_zero + train_indexes_today + train_indexes_between, :]['Label']
    X_test = X.iloc[test_indexes_zero + test_indexes_today + test_indexes_between, :]
    y_test = df.iloc[test_indexes_zero + test_indexes_today + test_indexes_between, :]['Label']
    
    df_train = df.iloc[train_indexes_zero + train_indexes_today + train_indexes_between, :]
    df_test = df.iloc[test_indexes_zero + test_indexes_today + test_indexes_between, :]
    return X_train, X_test, y_train, y_test, df_train, df_test    


def count_mins(df, i, preds):
    sent_gt = from_min_to_hour_and_min(df['Label'][i]*24*60)
    sent_pred2 = from_min_to_hour_and_min(preds[i])
    sent_gt = pd.to_datetime(sent_gt, format='%H:%M')
    sent_pred2 = pd.to_datetime(sent_pred2, format='%H:%M')
    mins = ((sent_pred2 - sent_gt).days*24*60) + ((sent_pred2 - sent_gt).seconds//3600)*60 + ((sent_pred2 - sent_gt).seconds//60)%60
    return mins 
def evaluate(df, X, data, sent_open_hour_range, preds, never_opened=True):
    '''Given the prediction of the model "preds" returns:
    - how many fitSA are better than the ground truth fitSA
    - how many fitSA are equal than the ground truth fitSA
    - how many fitSA are worse than the ground truth fitSA.
    If never_opened == True, then consider also the sent mails that have never been opened. Not otherwise.
    '''
    # from [0, 1] to mins
    for i in range(len(preds)):
        preds[i] *= 24*60 # this 24*60 is requierd to convert mins from [0, 1] into real mins
    # get fitSA using the predicted sent
    fitSA_preds = []
    fitSC_preds = []
    for i in range(len(df)):
        fitSA_preds.append(compute_fitSA_evaluation(df['HashMessaggio'][i], df['HashContatto'][i], sent_open_hour_range, data, preds[i], never_opened=never_opened))
        fitSC_preds.append(compute_fitSC_evaluation(df['HashMessaggio'][i], df['HashContatto'][i], sent_open_hour_range, data, preds[i], never_opened=never_opened))
    
    total_mex = len(fitSA_preds)
    predicted_sent_better_usual_sent = 0
    predicted_sent_equal_usual_sent = 0
    predicted_sent_worst_usual_sent = 0
    
    mins_better = []
    mins_worst = []
    
    if never_opened == True:
        for i in range(total_mex):
            if fitSA_preds[i] > X.loc[i, 'fitSA']:
                predicted_sent_better_usual_sent += 1
                mins_better.append(count_mins(df, i, preds))
            elif fitSA_preds[i] == X.loc[i, 'fitSA']:
                predicted_sent_equal_usual_sent += 1
            elif fitSC_preds[i] > X.loc[i, 'fitAC']:
                predicted_sent_better_usual_sent += 1
                mins_better.append(count_mins(df, i, preds))
            elif fitSC_preds[i] == X.loc[i, 'fitAC']:
                predicted_sent_equal_usual_sent += 1
            else:
                predicted_sent_worst_usual_sent += 1 # case fitSC predetta < fitAC reale
                mins_worst.append(count_mins(df, i, preds))
        return predicted_sent_better_usual_sent/total_mex, predicted_sent_equal_usual_sent/total_mex, predicted_sent_worst_usual_sent/total_mex, [sum(mins_better)/len(mins_better), sum(mins_worst)/len(mins_worst)]
    else:
        total_mex_different_from_minus_one = len([i for i in fitSA_preds if i > -1])
        total_mex_different_from_minus_one2 = len([i for i in fitSC_preds if i > -1])
        print(total_mex_different_from_minus_one)
        print(total_mex_different_from_minus_one2)
        for i in range(total_mex):
            if fitSA_preds[i] != -1 and fitSA_preds[i] > X.loc[i, 'fitSA']:
                predicted_sent_better_usual_sent += 1
                mins_better.append(count_mins(df, i, preds))
                print("1")
            elif fitSA_preds[i] != -1 and fitSA_preds[i] == X.loc[i, 'fitSA']:
                print("2")
                predicted_sent_equal_usual_sent += 1
            elif fitSC_preds[i] != -1 and fitSC_preds[i] > X.loc[i, 'fitAC']:
                print("3")
                predicted_sent_better_usual_sent += 1
                mins_better.append(count_mins(df, i, preds))
            elif fitSC_preds[i] != -1 and fitSC_preds[i] == X.loc[i, 'fitAC']:
                print("4")
                predicted_sent_equal_usual_sent += 1
            elif (fitSA_preds[i] != -1 and fitSA_preds[i] < X.loc[i, 'fitSA']) and (fitSC_preds[i] != -1 and fitSC_preds[i] < X.loc[i, 'fitSC']):
                predicted_sent_worst_usual_sent += 1# TODO: creare istogramma dove discretizzi per minuto (raggruppo) e faccio media delle percentuali (tra sent e open, provare a vedere cosa acccade anche tra open e click)
                mins_worst.append(count_mins(df, i, preds))
                print("5")
            print(mins_better)
            print(mins_worst)
            return predicted_sent_better_usual_sent/total_mex_different_from_minus_one, predicted_sent_equal_usual_sent/total_mex_different_from_minus_one, predicted_sent_worst_usual_sent/total_mex_different_from_minus_one, [sum(mins_better)/len(mins_better), sum(mins_worst)/len(mins_worst)]

## Algoritmi di learning

## Least Angle Regression

In [170]:
df = pd.read_csv("s3://ai-diennea/data/df_90.csv")
df = df.drop(['Unnamed: 0'], axis=1)
X = pd.read_csv("s3://ai-diennea/data/X_90.csv")
X = X.drop(['Unnamed: 0'], axis=1)
y = pd.read_csv("s3://ai-diennea/data/y_90.csv")
y = y.drop(['Unnamed: 0'], axis=1)

In [176]:
for i in [0.2]:    
    # train set
    df_train, df_test = train_test_split(df, test_size=i, random_state=42)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    # test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.058909594948405974, 0.9252271677190821, 0.015863237332511937, [43.169934640522875, -12.339805825242719])


In [175]:
for i in [0.2]:    
    # train set
    df_train, df_test = train_test_split(df, test_size=i, random_state=42)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    # test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds, never_opened=False))

3060
334
[]
[]


ZeroDivisionError: division by zero

In [177]:
# TODO: with 90 non entra in nessuna funzione.. forse troppo? to check

In [18]:
for i in [0.2, 0.3, 0.34]:    
    # train set
    df_train, df_test = train_test_split(df, test_size=i, random_state=42)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    # test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

[-193, -108, 4, 37, 71, -242, 71, 36, 44, -137, 25, 71, -109, 21, -137, 65, 71, 4, 123, -137, 64, -135, 67, 68, 67, 38, -135, 245, -137, 65, -137, 45, 71, -135, 21, 245, 38, 38, 4, -137, -137, 245, 25, 68, -135, 21, 4, -137, 38, 68, -137, 69, -137, 25, 44, 68, 65, 4, 67, 68, 69, -193, 44, -192, 4, -121, -137, 68, -193, 67, 123, 71, 67, 67, 69, 67, 71, -137, 123, 4, -192, -137, 25, 38, 44, 45, 4, 21, 25, 38, -137, 4, -137, 4, 25, 245, -137, 68, 4, -242, -193, 68, 67, -109, -241, -137, 67, -108, -141, 25, -137, -135, -109, 45, 71, 69, -109, 69, 65, -137, 68, 69, 71, 71, 25, 25, 67, 21, -140, 68, -121, 38, 38, 44, 71, 69, -141, 4, 71, 245, 21, -193, 67, -121, -109, 68, 65, -193, -242, 71, 71, 65, -135, 69, 25, 70, 68, 71, 68, -137, 21, 69, 68, -137, -109, 69, -140, 65, -121, 34, 69, 21, 4, -109, -241, -243, -141, 65, 21, 45, -140, -121, -109, 71, 25, 69, 21, -109, 69, 25, 25, -141, 123, 21, 25, 32, 21, 69, 68, 38, 71, 4, 38, 71, 69, 21, 45, -137, 68, 65, 68, -135, 71, 71, 67, -193, 68, 21

KeyboardInterrupt: 

In [6]:
for i in [0.2, 0.3, 0.34]:    
    # train set
    df_train, df_test = train_test_split(df, test_size=i, random_state=42)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    # test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds, never_opened=False))

(0.7324270557029178, 0.07725464190981432, 0.1903183023872679)
(0.7330406147091109, 0.07618002195389682, 0.1907793633369923)
(0.7336673732896511, 0.07670071304682984, 0.18963191366351898)


### k-fold

In [7]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = Lars(n_nonzero_coefs=1)
    model.fit(M_train, g_train)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, preds))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

(0.17553291625749504, 0.8221260370977399, 0.002341046644764976)

### k-fold with never_opened = True

In [8]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = Lars(n_nonzero_coefs=1)
    model.fit(M_train, g_train)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, preds, never_opened=False))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

(0.7399916349564156, 0.07358961086786106, 0.1864187541757233)

### Lifetime

In [9]:
for i in [0.20, 0.30, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    
    model = model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)
    
    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds))

(0.17010626828892653, 0.8272755274911443, 0.0026182042199291546)
(0.1746496226705683, 0.8229888597977308, 0.002361517531700806)
(0.17549263873159682, 0.8221064552661381, 0.002400906002265006)


In [10]:
for i in [0.20, 0.30, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    
    model = model = Lars(n_nonzero_coefs=1)
    model.fit(X_train, y_train)
    
    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = model.predict(X_test_fit)

    print(evaluate(df_test, X_test, data, sent_open_hour_range, preds, never_opened=False))

(0.734375, 0.07613031914893617, 0.18949468085106383)
(0.7411764705882353, 0.07494553376906318, 0.18387799564270152)
(0.7434273651890232, 0.0734983688351564, 0.18307426597582038)


## Neural Networks

In [7]:
# train set
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [8]:
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               12672     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 177,537
Trainable params: 177,537
Non-trainable params: 0
_________________________________________________________________


In [9]:
def reset_random_seeds(seed):
    os.environ['PYTHONHASHSEED']=str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [14]:
reset_random_seeds(13)
for i in [0.2, 0.3, 0.34]:
    # train set
    df_train, df_test = train_test_split(df, test_size=i, random_state=42)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    # test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=i, random_state=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)

    NN_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split = 0.3)

    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = NN_model.predict(X_test_fit)

    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    print(evaluate(df_test, X_test, data, sent_open_hour_range, p))

Train on 36359 samples, validate on 15583 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.19929154474048977, 0.7987062990913292, 0.002002156168181118, [14.193972179289027, 6.846153846153846])
Train on 31814 samples, validate on 13635 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.20873761486729298, 0.790338313055085, 0.0009240720776220545, [-0.8691588785046729, -15.11111111111111])
Train on 29996 samples, validate on 12856 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.2090052545751042, 0.7897716977713354, 0.0012230476535604277, [-1.2518422193324663, -3.888888888888889])


### k-fold

In [22]:
M = X.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    model = NN_model
    model.fit(M_train, g_train, epochs=10, batch_size=32, validation_split = 0.2)

    M_test_fit = M_test.copy()
    M_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    M_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    
    preds = model.predict(M_test_fit)
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, p))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41554 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 41554 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


(0.08269224503554404, 0.9098226311787091, 0.0074851237857469334)

### Lifetime

In [15]:
for i in [0.2, 0.3, 0.34]:
    X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=i, random_seed=42)
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)

    model = NN_model
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split = 0.2)


    X_test_fit = X_test.copy()
    X_test_fit.loc[:, 'fitSA'] = 1 # fitSA
    X_test_fit.loc[:, 'fitAC'] = 1 # fitAC
    preds = NN_model.predict(X_test_fit)

    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    print(evaluate(df_test, X_test, data, sent_open_hour_range, p))

Train on 41553 samples, validate on 10389 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.2065301093485292, 0.7925458185738488, 0.0009240720776220545, [2.6547352721849364, -2.3333333333333335])
Train on 36359 samples, validate on 9090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.20812156681554494, 0.7910570357821244, 0.0008213974023307152, [-0.6329551060680809, 1.1875])
Train on 34282 samples, validate on 8571 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
(0.20634201585503964, 0.7922536806342015, 0.0014043035107587768, [-6.715477497255763, -5.32258064516129])


## Regression with CNNs

In [141]:
from PIL import Image
import os, glob, sys, numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from keras.layers import Activation, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers
from keras import losses
from keras import backend as K 
import matplotlib.pyplot as plt
import math
from keras.optimizers import SGD, Adam
from keras import metrics
from keras import models, layers, optimizers 

In [156]:
Z = X.copy()

In [157]:
Z['fitSA_copy'] = Z['fitSA']
Z['fitAC_copy'] = Z['fitAC']

In [158]:
droprate=0.25

model = Sequential()
model.add(Conv2D(32, (3,3), padding="same", input_shape=(10, 10, 1), activation="relu")) 
model.add(BatchNormalization())
model.add(Dropout(droprate))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(32, (3,3), padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(droprate))

model.add(Conv2D(64, (3,3), padding="same", activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(droprate))

model.add(Flatten())
model.add(Dense(256, activation="relu"))
model.add(BatchNormalization())
model.add(Dense(1))

def rmsle(y_test, y_pred):
    return K.sqrt(K.mean(K.square(K.log(y_pred) - K.log(y_test))))

model.compile(loss= 'mean_squared_error', optimizer='adam', metrics=[rmsle])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_25 (Conv2D)           (None, 10, 10, 32)        320       
_________________________________________________________________
batch_normalization_33 (Batc (None, 10, 10, 32)        128       
_________________________________________________________________
dropout_25 (Dropout)         (None, 10, 10, 32)        0         
_________________________________________________________________
max_pooling2d_25 (MaxPooling (None, 5, 5, 32)          0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 5, 5, 32)          9248      
_________________________________________________________________
batch_normalization_34 (Batc (None, 5, 5, 32)          128       
_________________________________________________________________
max_pooling2d_26 (MaxPooling (None, 2, 2, 32)         

In [159]:
# train set
df_train, df_test = train_test_split(df, test_size=0.33, random_state=42)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
# test set
Z_train, Z_test, y_train, y_test = train_test_split(Z, y, test_size=0.33, random_state=42)
Z_train.reset_index(drop=True, inplace=True)
Z_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# test set
X_train, X_test, g_train, g_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)


In [160]:
arr = np.array([])
for i in range(len(Z_train)):
    row = Z_train.iloc[i, :].to_numpy().reshape(-1, 1).reshape(10, 10, 1)
    arr = np.append(arr, row)

In [161]:
arr2 = np.array([])
for i in range(len(Z_test)):
    row = Z_test.iloc[i, :].to_numpy().reshape(-1, 1).reshape(10, 10, 1)
    arr2 = np.append(arr2, row)

In [162]:
Z_train_img = arr.reshape(Z_train.shape[0], 10, 10, 1)
Z_test_img = arr2.reshape(Z_test.shape[0], 10, 10, 1)

In [163]:
model.fit(Z_train_img, y_train, batch_size=64, epochs=10, validation_split=0.3)

Train on 30450 samples, validate on 13051 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7ff7f58def28>

In [164]:
Z_test_img_fit = Z_test_img.copy()
Z_test_img_fit[:, 9, 8, 0] = 1 # fitSA
Z_test_img_fit[:, 9, 9, 0] = 1 # fitAC
preds = model.predict(Z_test_img_fit)

In [165]:
preds

array([[0.6783531 ],
       [0.62115383],
       [0.4813221 ],
       ...,
       [0.4718498 ],
       [0.63246495],
       [0.64637977]], dtype=float32)

In [166]:
p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p)

(0.1385634946562748, 0.8544359919727447, 0.007000513370980539)

**never_opened=False**

In [167]:
p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p, never_opened=False)

(0.5866429559375618, 0.06915629322268327, 0.344200750839755)

### k-fold

In [169]:
M = Z.copy()
g = y.copy()
res = []
kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(M):
    M_train, M_test = M.iloc[train_index], M.iloc[test_index]
    g_train, g_test = g.iloc[train_index], g.iloc[test_index]
    
    arr = np.array([])
    for i in range(len(M_train)):
        row = M_train.iloc[i, :].to_numpy().reshape(-1, 1).reshape(10, 10, 1)
        arr = np.append(arr, row)
    
    arr2 = np.array([])
    for i in range(len(M_test)):
        row = M_test.iloc[i, :].to_numpy().reshape(-1, 1).reshape(10, 10, 1)
        arr2 = np.append(arr2, row)
    
    M_train_img = arr.reshape(M_train.shape[0], 10, 10, 1)
    M_test_img = arr2.reshape(M_test.shape[0], 10, 10, 1)
    
    model.fit(M_train_img, g_train, batch_size=64, epochs=10, validation_split=0.3)
    
    M_test_img_fit = M_test_img.copy()
    M_test_img_fit[:, 9, 8, 0] = 1 # fitSA
    M_test_img_fit[:, 9, 9, 0] = 1 # fitAC
    preds = model.predict(M_test_img_fit)
    
    p = []
    for i in range(len(preds)):
        p.append(preds[i][0])
    
    my_df = df.iloc[test_index]
    my_df.reset_index(drop=True, inplace=True)
    M_test.reset_index(drop=True, inplace=True)
    res.append(evaluate(my_df, M_test, data, sent_open_hour_range, p, never_opened=False))
better = 0
equal = 0
worst = 0
for i in range(5):
    better += res[i][0]
    equal += res[i][1]
    worst += res[i][2]
better/5, equal/5, worst/5

Train on 36359 samples, validate on 15583 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ValueError: time data '24:31' does not match format '%H:%M' (match)

### Lifetime

In [197]:
Z = X.copy()
Z['fitSA_copy'] = Z['fitSA']
Z['fitAC_copy'] = Z['fitAC']

In [198]:
# test set
Z_train, Z_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(Z, df, data, test_size=0.34, random_seed=42)
Z_train.reset_index(drop=True, inplace=True)
Z_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [199]:
arr = np.array([])
for i in range(len(Z_train)):
    row = Z_train.iloc[i, :].to_numpy().reshape(-1, 1).reshape(10, 10, 1)
    arr = np.append(arr, row)

In [200]:
arr2 = np.array([])
for i in range(len(Z_test)):
    row = Z_test.iloc[i, :].to_numpy().reshape(-1, 1).reshape(10, 10, 1)
    arr2 = np.append(arr2, row)

In [201]:
Z_train_img = arr.reshape(Z_train.shape[0], 10, 10, 1)
Z_test_img = arr2.reshape(Z_test.shape[0], 10, 10, 1)

In [202]:
model.fit(Z_train_img, y_train, batch_size=64, epochs=10, validation_split=0.3)

Train on 29997 samples, validate on 12856 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7ff7dbffad68>

In [203]:
Z_test_img_fit = Z_test_img.copy()
Z_test_img_fit[:, 9, 8, 0] = 1 # fitSA
Z_test_img_fit[:, 9, 9, 0] = 1 # fitAC
preds = model.predict(Z_test_img_fit)

In [204]:
X_train, X_test, y_train, y_test, df_train, df_test = split_train_test_by_lifetime(X, df, data, test_size=0.34, random_seed=42)
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
df_train.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [205]:
p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p)

(0.11823329558323896, 0.8774178935447339, 0.00434881087202718)

**never_opened=False**

In [206]:
p = []
for i in range(len(preds)):
    p.append(preds[i][0])
evaluate(df_test, X_test, data, sent_open_hour_range, p, never_opened=False)

(0.5006713984270094, 0.12142720122769998, 0.37790140034529063)

In [None]:
# TODO: vedere se altri modelli tipo cnn danno orari molto diversi tra loro o se smili come least angle regr