## Helper functions

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold

In [None]:
def rm_feature(X, omitted_features):
    """Return a copy of X without the columns listed in omitted_features.

    The input frame is left untouched; ``DataFrame.drop`` builds a new one.
    """
    return X.drop(columns=omitted_features)

In [None]:
def get_categorical_features(X):
    """Return the names of the columns of X whose dtype is 'object'.

    Columns holding categories as numbers are not detected here and have
    to be added by the caller.
    """
    return [name for name, dtype in X.dtypes.items() if dtype == 'object']


In [None]:
def one_hot_encoding(X, categorical_features):
    """Replace each listed column of X by integer (0/1) dummy columns.

    Columns not named in categorical_features are passed through unchanged.
    """
    return pd.get_dummies(X, columns=categorical_features, dtype=int)


In [None]:
class FeatureSelector():
    """Sequential feature selection and k-fold evaluation scored by RMSLE.

    When ``take_logs`` is true the target is modelled on the log(1 + y)
    scale: it is transformed before fitting, and ``rmsle`` maps both the
    target and the predictions back to the original scale before scoring.
    """

    def __init__(self, cv_size=10, take_logs=True):
        """Create the selector.

        Parameters
        ----------
        cv_size : int
            Number of folds of the shuffled KFold splitter (seeded with
            ``random_state=1`` so the folds are reproducible).
        take_logs : bool
            Whether the estimator is fitted on log(1 + y) instead of y.
        """
        self.k_fold_splitter = KFold(n_splits=cv_size, random_state=1,
                                     shuffle=True)
        self.take_logs = take_logs

    def _transform_target(self, y):
        """Return the target on the scale the estimator is fitted on."""
        # log1p is accurate for targets close to zero, unlike log(y + 1).
        return np.log1p(y) if self.take_logs else y

    def rmsle(self, y, y_pred, **kwargs):
        """Return the root mean squared logarithmic error.

        Parameters
        ----------
        y : array-like
            True target, on the log(1 + y) scale when ``take_logs`` is set.
        y_pred : array-like
            Predictions, on the same scale as ``y``.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        float
            sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2)) computed on
            the original scale, with negative predictions clipped to zero.
        """
        y_true = y
        if self.take_logs:
            # Undo the log(1 + y) transform applied before fitting.
            y_true = np.expm1(y)
            y_pred = np.expm1(y_pred)
        # Negative predictions are clipped to zero before taking the log.
        y_pred = np.maximum(0, y_pred)
        return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))

    def stepwise_feature_selection(self, X, y, estimator, forward,
                                   n_features_to_select=10,
                                   score_function='auto'):
        """Fit a (non-floating) sequential feature selector.

        Parameters
        ----------
        X : features to select from.
        y : target on its original scale.
        estimator : model handed to the selector.
        forward : bool
            Forwarded to the selector's ``forward`` argument.
        n_features_to_select :
            Forwarded to the selector's ``k_features`` argument.
        score_function :
            ``'auto'`` scores with RMSLE, wrapped with
            ``greater_is_better=False``; any other value is passed to the
            selector's ``scoring`` argument unchanged.

        Returns
        -------
        The fitted selector.
        """
        if score_function == 'auto':
            score_func = make_scorer(self.rmsle, greater_is_better=False)
        else:
            score_func = score_function

        sfs = SFS(
            estimator,
            k_features=n_features_to_select,
            forward=forward,
            floating=False,
            scoring=score_func,
            cv=self.k_fold_splitter,
            n_jobs=-1
        )
        sfs.fit(X, self._transform_target(y))
        return sfs

    def k_fold_cv(self, model, X, y):
        """Return the mean RMSLE of ``model`` over the k folds.

        The scorer is built with the default ``make_scorer`` settings, so
        the returned value is the RMSLE itself (lower is better).
        """
        score_function = make_scorer(self.rmsle)
        scores = cross_val_score(model,
                                 X,
                                 self._transform_target(y),
                                 scoring=score_function,
                                 cv=self.k_fold_splitter,
                                 n_jobs=-1)
        return np.mean(scores)

    def select_best_subset(self, X, y, estimator, forward,
                           n_features_to_select=10,
                           score_function='auto'):
        """Run the stepwise selection and return the best-scoring subset.

        Every subset recorded in the selector's ``subsets_`` is considered,
        and the one with the highest ``avg_score`` wins.

        Returns
        -------
        list
            Column indices (``feature_idx``) of the winning subset.
        """
        sfs = self.stepwise_feature_selection(X, y,
                                              estimator,
                                              forward,
                                              n_features_to_select,
                                              score_function)
        best_subset = max(sfs.subsets_.values(),
                          key=lambda subset: subset['avg_score'])
        return list(best_subset['feature_idx'])



## Test

In [None]:
train = pd.read_csv("phone_train.csv")
test = pd.read_csv("phone_validation.csv")

# Separate the predictors from the target column 'y' of the training data
X_train = rm_feature(train, ['y'])
y_train = train['y']
# Copy so that columns added to X_test later on do not end up in `test`
X_test = test.copy()

In [None]:
# These columns are not picked up by the dtype-based detection, so they are
# listed by hand and appended to the detected ones.
manually_listed_categoricals = ['activation.zone', 'activation.channel',
                                'tariff.plan']
cat_features = get_categorical_features(X_train) + manually_listed_categoricals
cat_features

['payment.method',
 'sex',
 'vas1',
 'vas2',
 'activation.zone',
 'activation.channel',
 'tariff.plan']

In [None]:
# Tag every row with its origin so both sets can be one-hot encoded in a
# single pass (giving them the same dummy columns) and split again afterwards.
X_train['label'] = 'train'
X_test['label'] = 'test'

concat_X = one_hot_encoding(pd.concat([X_train, X_test]),
                            cat_features + ['label'])

# The encoded origin tag is only needed to split the rows; drop it afterwards.
label_columns = ['label_train', 'label_test']
X_train = concat_X[concat_X['label_train'] == 1].drop(label_columns, axis=1)
X_test = concat_X[concat_X['label_test'] == 1].drop(label_columns, axis=1)


In [None]:
# Summarise the encoded training frame (row count, column count, dtypes)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Columns: 119 entries, age to tariff.plan_8
dtypes: float64(19), int64(100)
memory usage: 9.2 MB


### Forward stepwise selection

In [None]:
# Forward stepwise selection with a linear model, scored by the selector's
# default RMSLE scorer. "best" is passed through as the selector's k_features.
feature_selector = FeatureSelector()
model = LinearRegression()
sfs = feature_selector.stepwise_feature_selection(
    X_train, y_train, model, forward=True, n_features_to_select="best"
)
# Keep only the selected columns of the training set
X_train_sfs = sfs.transform(X_train)

In [None]:
# Display the training columns retained by the forward selection
X_train[list(sfs.k_feature_names_)]

Unnamed: 0,age,q01.out.ch.peak,q01.out.ch.offpeak,q01.out.val.offpeak,q01.in.dur.tot,q01.ch.sms,q02.out.ch.peak,q02.ch.sms,q03.out.dur.offpeak,q03.in.ch.tot,...,vas1_Y,vas2_N,vas2_Y,activation.zone_0,activation.zone_2,activation.channel_8,activation.channel_9,tariff.plan_3,tariff.plan_4,tariff.plan_8
0,34.29,82,0,0.0000,4154,6,70,0,0,78,...,0,1,0,0,0,0,0,0,0,0
1,22.78,0,0,0.0000,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1
2,26.95,244,0,0.0000,16351,0,392,2,0,263,...,1,1,0,0,0,0,0,0,0,0
3,46.09,331,0,0.0000,41517,0,303,0,0,499,...,1,1,0,0,1,0,0,0,0,0
4,23.77,0,0,0.0000,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23.01,0,0,0.0000,0,0,0,0,0,13,...,0,1,0,0,1,0,0,0,0,1
9996,44.72,24,6,2.3788,1113,0,13,0,153,17,...,0,1,0,0,1,0,0,0,0,0
9997,17.47,0,0,0.0000,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9998,21.67,0,0,0.0000,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1


In [None]:
# Mean k-fold RMSLE of a linear model fitted on the selected features only
feature_selector.k_fold_cv(LinearRegression(), X_train_sfs, y_train)

2.432767864867283

In [None]:
# Mean k-fold RMSLE of a linear model fitted on all features, for comparison
feature_selector.k_fold_cv(LinearRegression(), X_train, y_train)

2.463238898092651

### Prediction on X_test

In [None]:
# Apply the fitted selector to the test set to keep the same columns
X_test_sfs = sfs.transform(X_test)

In [None]:
# Fit on the log(1 + y) scale, then map the predictions back to the original
# scale and clip negative values to zero.
model = LinearRegression()
model.fit(X_train_sfs, np.log(y_train + 1))
log_scale_predictions = model.predict(X_test_sfs)
y_pred = np.maximum(0, np.exp(log_scale_predictions) - 1)

In [None]:
# Write one prediction per line, without header or index column
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv(
    'mySubmission.txt',
    sep='\t',
    header=False,
    index=False,
)

## KNN regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor as KNN

In [None]:
# Standardise the features before KNN, then run forward selection of
# 10 features on the whole pipeline.
feature_selector = FeatureSelector()
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNN(n_neighbors=10)),
])
sfs_knn = feature_selector.stepwise_feature_selection(
    X_train, y_train, pipe, forward=True, n_features_to_select=10
)

In [None]:
# Display the training columns retained by the KNN forward selection
X_train[list(sfs_knn.k_feature_names_)]

Unnamed: 0,q08.out.ch.peak,q09.out.ch.peak,q09.out.ch.offpeak,q09.out.dur.offpeak,q09.out.val.offpeak,q09.ch.sms,vas2_N,vas2_Y,activation.zone_0,tariff.plan_3
0,70,77,0,0,0.0,0,1,0,0,0
1,23,16,0,0,0.0,1,1,0,0,0
2,131,343,0,0,0.0,0,1,0,0,0
3,167,261,0,0,0.0,0,1,0,0,0
4,0,0,0,0,0.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,16,5,0,0,0.0,4,1,0,0,0
9996,21,83,0,0,0.0,0,1,0,0,0
9997,0,9,0,0,0.0,0,0,1,0,0
9998,24,32,0,0,0.0,38,1,0,0,0


In [None]:
# Reduce the training set to the selected columns and report the mean
# k-fold RMSLE of the KNN pipeline on them.
X_train_sfs_knn = sfs_knn.transform(X_train)
feature_selector.k_fold_cv(model=pipe, X=X_train_sfs_knn, y=y_train)

2.085578338894409

In [None]:
# Apply the fitted KNN selector to the test set to keep the same columns
X_test_sfs_knn = sfs_knn.transform(X_test)

In [None]:
# Fit on the log(1 + y) scale, then map the predictions back to the original
# scale and clip negative values to zero.
pipe.fit(X_train_sfs_knn, np.log(y_train + 1))
log_scale_predictions = pipe.predict(X_test_sfs_knn)
y_pred = np.maximum(0, np.exp(log_scale_predictions) - 1)

In [None]:
# Write one prediction per line, without header or index column
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv(
    'mySubmission_knn.txt',
    sep='\t',
    header=False,
    index=False,
)

## Principal Component Regression (PCR)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the training features onto their principal components, then pick
# 20 components by forward selection with a linear model.
# NOTE(review): PCA is fitted on the features as they are, with no scaling
# step in front of it - confirm this is intended.
feature_selector = FeatureSelector()
pca = PCA()
pcr = LinearRegression()
X_train_pca = pca.fit_transform(X_train)

sfs_pcr = feature_selector.stepwise_feature_selection(
    X_train_pca, y_train, pcr, forward=True, n_features_to_select=20
)

In [None]:
# Keep only the principal components chosen by the selector
X_train_sfs_pcr = sfs_pcr.transform(X_train_pca)

In [None]:
# Mean k-fold RMSLE of the linear model on the selected components
feature_selector.k_fold_cv(pcr, X_train_sfs_pcr, y_train)

2.529566742804975

## KNN best subsets

In [None]:
# Run forward selection up to 15 features with the scaled KNN pipeline and
# keep the indices of the best-scoring subset met along the way.
feature_selector = FeatureSelector()
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNN(n_neighbors=10)),
])
best_features = feature_selector.select_best_subset(
    X_train, y_train, pipe, forward=True, n_features_to_select=15
)


In [None]:
# best_features holds column positions, so select the columns by position
X_train_best_knn = X_train.iloc[:, best_features]
X_train_best_knn

Unnamed: 0,q08.out.ch.peak,q09.out.ch.peak,q09.out.ch.offpeak,q09.out.val.offpeak,q09.ch.sms,vas2_N,activation.zone_0,tariff.plan_3
0,70,77,0,0.0,0,1,0,0
1,23,16,0,0.0,1,1,0,0
2,131,343,0,0.0,0,1,0,0
3,167,261,0,0.0,0,1,0,0
4,0,0,0,0.0,0,1,0,0
...,...,...,...,...,...,...,...,...
9995,16,5,0,0.0,4,1,0,0
9996,21,83,0,0.0,0,1,0,0
9997,0,9,0,0.0,0,0,0,0
9998,24,32,0,0.0,38,1,0,0


In [None]:
# Mean k-fold RMSLE of the KNN pipeline on the best subset of features
feature_selector.k_fold_cv(pipe, X_train_best_knn, y_train)

2.0828821366942827

In [None]:
# Select the same column positions in the test set
X_test_best_knn = X_test.iloc[:, best_features]

In [None]:
# Fit on the log(1 + y) scale, then map the predictions back to the original
# scale and clip negative values to zero.
pipe.fit(X_train_best_knn, np.log(y_train + 1))
log_scale_predictions = pipe.predict(X_test_best_knn)
y_pred = np.maximum(0, np.exp(log_scale_predictions) - 1)

In [None]:
# Write one prediction per line, without header or index column
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv(
    'mySubmission_knn_best_subset.txt',
    sep='\t',
    header=False,
    index=False,
)

## TODO:
- Refactor code
- Try different values of K for KNN (via cross-validation? It may take too long to compute)
- Try to perform PCR
- Try Stepwise selection with different regressors
- Analyse residuals