## Helper functions

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import make_scorer

In [None]:
def rm_feature(X, omitted_features):
    """Return ``X`` without the columns listed in ``omitted_features``.

    The result is the new frame produced by ``DataFrame.drop``; the frame
    passed in is not modified.
    """
    return X.drop(columns=omitted_features)

In [None]:
def get_categorical_features(X):
    """List the names of the columns of ``X`` whose dtype is ``object``.

    Columns are reported in the order in which they appear in ``X``.
    Categorical information stored with any other dtype (for example
    integer codes) is not detected.
    """
    return [name for name, dtype in X.dtypes.items() if dtype == 'object']


In [None]:
def one_hot_encoding(X, categorical_features):
    """One-hot encode the columns of ``X`` named in ``categorical_features``.

    Each listed column is replaced by integer 0/1 indicator columns
    (``dtype=int``) produced by ``pandas.get_dummies``; columns that are
    not listed are passed through unchanged.
    """
    return pd.get_dummies(data=X, dtype=int, columns=categorical_features)


In [None]:
def rmsle(y, y_pred, **kwargs):
    """Root mean squared logarithmic error for values on the log(1 + y) scale.

    Both arguments are expected to be log(1 + value) transformed. They are
    mapped back to the original scale, predictions that fall below zero
    there are clipped to zero, and the RMSLE is computed between the two.

    Parameters
    ----------
    y : array-like
        True targets on the log(1 + y) scale.
    y_pred : array-like
        Predicted targets on the log(1 + y) scale.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        Square root of the mean squared difference of log(1 + value).
    """
    # Plain arrays so that the two inputs are compared position by position.
    # expm1 / log1p stay accurate for values near zero, where exp(x) - 1 and
    # log(1 + x) lose most of their significant digits.
    y_true = np.expm1(np.asarray(y, dtype=float))
    y_hat = np.expm1(np.asarray(y_pred, dtype=float))
    # A negative value on the original scale is not a valid prediction.
    y_hat = np.maximum(0, y_hat)
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_hat)) ** 2))

In [None]:
def stepwise_feature_selection(X, y, estimator, forward,
                               n_features_to_select = 10):
    """Fit a sequential (stepwise) feature selector and return it.

    Parameters
    ----------
    X : feature matrix handed to the selector.
    y : target on its original scale; it is log(1 + y) transformed here
        before fitting.
    estimator : model evaluated on each candidate feature subset.
    forward : passed to the selector's ``forward`` argument
        (True = forward selection).
    n_features_to_select : number of features to keep.

    Returns
    -------
    The fitted ``SequentialFeatureSelector``.
    """
    # rmsle is a loss: greater_is_better=False makes the scorer report its
    # negative, so that a higher score means a better subset.
    neg_rmsle = make_scorer(rmsle, greater_is_better=False)

    selector = SFS(
        estimator,
        k_features=n_features_to_select,
        forward=forward,
        floating=False,
        scoring=neg_rmsle,
        cv=10,
        n_jobs=-1,
    )
    log_target = np.log(y + 1)
    selector.fit(X, log_target)
    return selector


In [None]:
def k_fold_cv(model, X, y):
    """Return the mean ``rmsle`` of ``model`` over 10 cross-validation folds.

    ``y`` is given on its original scale and is log(1 + y) transformed
    before being passed to ``cross_val_score``. The scorer is built with
    ``make_scorer``'s default sign, so the value returned is the error
    itself (lower is better).
    """
    fold_scores = cross_val_score(
        model,
        X,
        np.log(y + 1),
        scoring=make_scorer(rmsle),
        cv=10,
        n_jobs=-1,
    )
    return np.mean(fold_scores)

## Test

In [None]:
# Load the training and validation data
train = pd.read_csv("phone_train.csv")
test = pd.read_csv("phone_validation.csv")

# Separate the target from the predictors. The validation frame is used
# as-is: X_test is the same object as `test`, not a copy.
y_train = train['y']
X_train = train.drop(columns=['y'])
X_test = test

In [None]:
# Object-typed columns, plus three columns that are not of object dtype
# but are to be encoded as categories as well
cat_features = get_categorical_features(X_train) + [
    'activation.zone',
    'activation.channel',
    'tariff.plan',
]
cat_features

['payment.method',
 'sex',
 'vas1',
 'vas2',
 'activation.zone',
 'activation.channel',
 'tariff.plan']

In [None]:
# Tag every row with its origin so that both sets can be one-hot encoded
# together (and therefore end up with the same dummy columns), then be
# separated again through the encoded 'label' column.
X_train['label'] = 'train'
X_test['label'] = 'test'

concat_X = one_hot_encoding(pd.concat([X_train, X_test]),
                            cat_features + ['label'])

# Split back into the two sets and discard the helper indicator columns
X_train = concat_X[concat_X['label_train'] == 1].drop(
    columns=['label_train', 'label_test'])
X_test = concat_X[concat_X['label_test'] == 1].drop(
    columns=['label_train', 'label_test'])


In [None]:
# Print a summary (index, column count, dtypes, memory) of the encoded training set
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Columns: 119 entries, age to tariff.plan_8
dtypes: float64(19), int64(100)
memory usage: 9.2 MB


### Forward stepwise selection

In [None]:
# Forward stepwise selection of 30 features with a linear regression
model = LinearRegression()

sfs = stepwise_feature_selection(
    X_train,
    y_train,
    estimator=model,
    forward=True,
    n_features_to_select=30,
)
# Training matrix restricted to the selected features
X_train_sfs = sfs.transform(X_train)

In [None]:
# Display the training columns whose names were selected by the fitted selector
X_train[list(sfs.k_feature_names_)]

Unnamed: 0,age,q01.out.ch.peak,q01.out.val.offpeak,q01.in.dur.tot,q02.out.val.peak,q04.out.ch.offpeak,q04.out.val.offpeak,q04.ch.sms,q05.in.ch.tot,q07.in.dur.tot,...,q09.ch.cc,payment.method_Bollettino Postale,sex_B,vas1_N,activation.zone_2,activation.channel_8,activation.channel_9,tariff.plan_3,tariff.plan_4,tariff.plan_8
0,34.29,82,0.0000,4154,29.6808,0,0.0000,0,117,9168,...,0,0,0,1,0,0,0,0,0,0
1,22.78,0,0.0000,0,0.0000,0,0.0000,3,42,4549,...,0,0,0,1,1,0,1,0,0,1
2,26.95,244,0.0000,16351,332.0048,0,0.0000,0,211,17145,...,4,0,1,0,0,0,0,0,0,0
3,46.09,331,0.0000,41517,125.3498,0,0.0000,0,440,39850,...,0,0,1,0,1,0,0,0,0,0
4,23.77,0,0.0000,0,0.0000,0,0.0000,0,0,0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23.01,0,0.0000,0,0.0000,0,0.0000,3,45,7047,...,4,0,0,1,1,0,0,0,0,1
9996,44.72,24,2.3788,1113,6.0322,12,4.3372,0,24,939,...,0,0,1,1,1,0,0,0,0,0
9997,17.47,0,0.0000,0,0.0000,0,0.0000,0,0,0,...,0,0,1,1,0,0,0,0,0,1
9998,21.67,0,0.0000,0,0.0000,0,0.0000,0,97,6572,...,0,0,0,1,1,0,1,0,0,1


In [None]:
# Mean 10-fold cross-validated RMSLE (see k_fold_cv) of a linear regression
# fitted on the selected feature subset only
k_fold_cv(LinearRegression(), X_train_sfs, y_train)

2.4346394132516265

In [None]:
# Same cross-validated RMSLE, this time with every feature, for comparison
k_fold_cv(LinearRegression(), X_train, y_train)

2.4643647357926572

### Prediction on X_test

In [None]:
# Apply the selector fitted on the training data to the test set
X_test_sfs = sfs.transform(X_test)

In [None]:
# Fit on the selected features with the target on the log(1 + y) scale
model = LinearRegression().fit(X_train_sfs, np.log(y_train + 1))

# Map the predictions back to the original scale and clip negatives to 0
y_pred = np.maximum(0, np.exp(model.predict(X_test_sfs)) - 1)

In [None]:
# Write the predictions one per line, without header or index column
y_pred = pd.DataFrame(data=y_pred)
y_pred.to_csv('mySubmission.txt', sep='\t', header=False, index=False)