## Helper functions

In [377]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import make_scorer

In [378]:
def rm_feature(X, omitted_features):
    """Return a copy of X without the given columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    omitted_features : label or list of labels
        Column name(s) to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of X except the omitted ones.
    """
    return X.drop(labels=omitted_features, axis=1)

In [379]:
def get_categorical_features(X):
    """List the columns of X that are stored with dtype ``object``.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    list
        Column names, in the frame's column order, whose dtype compares
        equal to ``'object'``.
    """
    return [name for name, dtype in X.dtypes.items() if dtype == 'object']


In [380]:
def one_hot_encoding(X, categorical_features):
    """One-hot encode the given columns of X with integer 0/1 indicators.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    categorical_features : list of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column is replaced by one
        ``<column>_<value>`` indicator column per distinct value.
    """
    return pd.get_dummies(data=X, columns=categorical_features, dtype=int)


In [381]:
def rmsle(y, y_pred, **kwargs):
    """Root mean squared logarithmic error for values given on the log(1 + y) scale.

    Both arguments are expected to already be ``log(1 + value)`` transformed
    (this is how the callers in this notebook pass them). Predictions that
    would be negative on the original scale are clipped to 0 before scoring.

    Parameters
    ----------
    y : array-like
        True targets on the log(1 + y) scale.
    y_pred : array-like
        Predicted targets on the log(1 + y) scale.
    **kwargs
        Accepted for scorer compatibility and ignored.

    Returns
    -------
    float
        ``sqrt(mean((log(1 + y_true) - log(1 + max(0, y_pred_orig)))**2))``.
    """
    # log(1 + (exp(v) - 1)) is just v, so the true values need no round trip
    # through exp/log; skipping it avoids precision loss near zero.
    log_true = np.asarray(y, dtype=float)
    # Clipping the back-transformed prediction at 0 is the same as clipping the
    # log-scale prediction at 0 (exp(v) - 1 is increasing and zero at v == 0).
    # Working on the log scale avoids overflow of exp() for large predictions.
    log_pred = np.maximum(0, np.asarray(y_pred, dtype=float))
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

In [382]:
def stepwise_feature_selection(X, y, estimator, forward,
                               n_features_to_select = 10,
                               cv = 10, n_jobs = -1):
    """Fit a (non-floating) sequential feature selector scored by RMSLE.

    The target is transformed with log(1 + y) before fitting, and candidate
    feature subsets are compared by cross-validated ``rmsle``.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : array-like
        Target on its original scale.
    estimator : scikit-learn estimator
        Model used to evaluate each candidate feature subset.
    forward : bool
        True for forward selection, False for backward elimination.
    n_features_to_select : int, default 10
        Passed to the selector as ``k_features``.
    cv : int, default 10
        Passed to the selector as ``cv``.
    n_jobs : int, default -1
        Passed to the selector as ``n_jobs``.

    Returns
    -------
    SequentialFeatureSelector
        The fitted selector.
    """
    # rmsle is an error, so greater_is_better=False makes the scorer return
    # its negation: a higher score then means a lower error.
    score_func = make_scorer(rmsle, greater_is_better=False)
    sfs = SFS(
        estimator,
        k_features = n_features_to_select,
        forward=forward,
        floating=False,
        scoring = score_func,
        cv=cv,
        n_jobs=n_jobs
    )
    # log1p(y) == log(1 + y), but stays accurate for y close to 0
    sfs.fit(X, np.log1p(y))
    return sfs


In [395]:
def k_fold_cv(model, X, y, cv = 10, n_jobs = -1):
    """Return the mean cross-validated RMSLE of ``model``.

    The model is trained on the log(1 + y) transformed target and each fold
    is scored with ``rmsle``.

    Parameters
    ----------
    model : scikit-learn estimator
        Estimator to evaluate.
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : array-like
        Target on its original scale.
    cv : int, default 10
        Passed to ``cross_val_score`` as ``cv``.
    n_jobs : int, default -1
        Passed to ``cross_val_score`` as ``n_jobs``.

    Returns
    -------
    float
        Mean of the per-fold RMSLE values (lower is better).
    """
    # Default greater_is_better=True: scores are the raw (non-negated) RMSLE
    score_func = make_scorer(rmsle)
    scores = cross_val_score(
        model,
        X,
        np.log1p(y),  # log(1 + y), accurate for y close to 0
        scoring = score_func,
        cv = cv,
        n_jobs = n_jobs,
    )
    return np.mean(scores)

## Test

In [384]:
train = pd.read_csv("phone_train.csv")
test = pd.read_csv("phone_validation.csv")

# Split the training data into features and target; the validation file is
# used as the test feature matrix.
X_train = rm_feature(train, ['y'])
y_train = train['y']
# Copy instead of aliasing, so that columns added to X_test afterwards do not
# silently modify the raw `test` frame.
X_test = test.copy()

In [385]:
# Object-typed columns, plus two columns that are not detected by dtype but
# are treated as categorical here
cat_features = [
    *get_categorical_features(X_train),
    'activation.zone',
    'activation.channel',
]
cat_features

['payment.method',
 'sex',
 'vas1',
 'vas2',
 'activation.zone',
 'activation.channel']

In [386]:
# One-hot encode train and test together so that both end up with the same
# indicator columns. The two parts are tagged through the concat keys rather
# than a temporary 'label' column, so the input frames are not modified and
# no helper column has to be encoded and dropped again.
concat_X = pd.concat([X_train, X_test], keys=['train', 'test'])
concat_X = one_hot_encoding(concat_X, cat_features)

# Selecting on the outer key restores each part with its original row index
X_train = concat_X.loc['train']
X_test = concat_X.loc['test']


In [387]:
# Summary of the encoded training frame (rows, columns, dtypes, memory)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Columns: 115 entries, tariff.plan to activation.channel_9
dtypes: float64(19), int64(96)
memory usage: 8.9 MB


### Forward stepwise selection

In [388]:
# Forward selection of 20 features, each candidate subset being evaluated
# with a linear regression
model = LinearRegression()
sfs = stepwise_feature_selection(
    X_train,
    y_train,
    estimator=model,
    forward=True,
    n_features_to_select=20,
)
X_train_sfs = sfs.transform(X_train)

In [389]:
# Training data restricted to the columns picked by the selector
X_train.loc[:, list(sfs.k_feature_names_)]

Unnamed: 0,tariff.plan,age,q01.out.ch.peak,q01.out.val.offpeak,q04.out.ch.offpeak,q04.out.val.offpeak,q08.out.ch.peak,q08.out.val.offpeak,q09.out.ch.peak,q09.out.val.peak,q09.out.ch.offpeak,q09.in.ch.tot,q09.in.dur.tot,q09.ch.sms,q09.ch.cc,payment.method_Bollettino Postale,sex_B,vas1_N,activation.channel_8,activation.channel_9
0,6,34.29,82,0.0000,0,0.0000,70,0.0,77,31.0132,0,65,8224,0,0,0,0,1,0,0
1,8,22.78,0,0.0000,0,0.0000,23,0.0,16,5.9625,0,32,1154,1,0,0,0,1,0,1
2,7,26.95,244,0.0000,0,0.0000,131,0.0,343,252.4844,0,86,13972,0,4,0,1,0,0,0
3,6,46.09,331,0.0000,0,0.0000,167,0.0,261,113.8906,0,410,51426,0,0,0,1,0,0,0
4,8,23.77,0,0.0000,0,0.0000,0,0.0,0,0.0000,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8,23.01,0,0.0000,0,0.0000,16,0.0,5,1.4172,0,82,6030,4,4,0,0,1,0,0
9996,7,44.72,24,2.3788,12,4.3372,21,0.0,83,32.1288,0,32,2530,0,0,0,1,1,0,0
9997,8,17.47,0,0.0000,0,0.0000,0,0.0,9,4.8129,0,1,7,0,0,0,1,1,0,0
9998,8,21.67,0,0.0000,0,0.0000,24,0.0,32,22.9834,0,30,3976,38,0,0,0,1,0,1


In [399]:
# Test model with subset features: mean cross-validated RMSLE on the columns
# chosen by the feature selector.
# NOTE(review): the selector was fitted on this same training data, so this
# estimate is probably optimistic — confirm on data not used for selection.
k_fold_cv(LinearRegression(), X_train_sfs, y_train)

2.4394050858977243

In [400]:
# Test model with all features: mean cross-validated RMSLE using every
# encoded column, as a baseline for the selected subset.
k_fold_cv(LinearRegression(), X_train, y_train)

2.468890089961442

### Prediction on X_test

In [402]:
# Reduce X_test to the feature subset chosen by the fitted selector
X_test_sfs = sfs.transform(X_test)

In [405]:
# Fit on the log(1 + y) target using the selected features only
model = LinearRegression().fit(X_train_sfs, np.log(y_train + 1))
# Back-transform the predictions to the original scale and clip negatives to 0
y_pred = np.maximum(0, np.exp(model.predict(X_test_sfs)) - 1)

In [406]:
# Write the predictions as a single column: one value per line, without
# header or row index
y_pred = pd.DataFrame(data=y_pred)
y_pred.to_csv('mySubmission.txt', sep='\t', header=False, index=False)