## Helper functions

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import make_scorer

In [2]:
def rm_feature(X, omitted_features):
    """Return ``X`` without the columns named in ``omitted_features``.

    ``X`` itself is left untouched; the result is the new frame produced
    by ``DataFrame.drop``.
    """
    return X.drop(columns=omitted_features)

In [3]:
def get_categorical_features(X):
    """List the names of the columns of ``X`` whose dtype is ``object``.

    The names are returned in the same order as the columns of ``X``.
    """
    return [name for name, dtype in X.dtypes.items() if dtype == 'object']


In [4]:
def one_hot_encoding(X, categorical_features):
    """One-hot encode the listed columns of ``X`` as integer indicator columns.

    Thin wrapper around ``pd.get_dummies`` with ``dtype=int``; only the
    columns named in ``categorical_features`` are expanded.
    """
    return pd.get_dummies(data=X, columns=categorical_features, dtype=int)


In [5]:
def rmsle(y, y_pred, **kwargs):
    """Root mean squared logarithmic error for values given on the log(1 + v) scale.

    Both inputs are mapped back to the original scale, predictions are
    floored at zero, and the RMSLE between the two is returned.

    Parameters
    ----------
    y : array-like
        True targets, expressed as ``log(1 + target)``.
    y_pred : array-like
        Predictions on the same ``log(1 + target)`` scale.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_true) - log1p(y_hat)) ** 2))``.
    """
    # expm1/log1p avoid the cancellation that exp(x) - 1 and log(1 + x)
    # suffer from when x is close to zero.
    y_true = np.expm1(np.asarray(y, dtype=float))
    y_hat = np.maximum(0, np.expm1(np.asarray(y_pred, dtype=float)))
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_hat)) ** 2))

In [6]:
def stepwise_feature_selection(X, y, estimator, forward,
                               n_features_to_select=10, cv=10, n_jobs=-1):
    """Fit an mlxtend sequential feature selector against ``log(1 + y)``.

    Parameters
    ----------
    X : feature matrix passed to ``SFS.fit``.
    y : target on its original scale; it is log1p-transformed before fitting.
    estimator : model evaluated on each candidate feature subset.
    forward : bool
        Passed to ``SFS`` as ``forward`` (``True`` for forward selection).
    n_features_to_select : passed to ``SFS`` as ``k_features``.
    cv : cross-validation setting forwarded to ``SFS`` (default 10).
    n_jobs : parallelism setting forwarded to ``SFS`` (default -1).

    Returns
    -------
    The fitted ``SFS`` instance.
    """
    # greater_is_better=False makes the scorer return the negated RMSLE,
    # so a higher score corresponds to a lower error.
    neg_rmsle = make_scorer(rmsle, greater_is_better=False)
    selector = SFS(
        estimator,
        k_features=n_features_to_select,
        forward=forward,
        floating=False,
        scoring=neg_rmsle,
        cv=cv,
        n_jobs=n_jobs,
    )
    selector.fit(X, np.log1p(y))
    return selector


In [7]:
def k_fold_cv(model, X, y, cv=10, n_jobs=-1):
    """Mean cross-validated RMSLE of ``model`` trained on ``log(1 + y)``.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``.
    X : feature matrix.
    y : target on its original scale; it is log1p-transformed before scoring.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 10).
    n_jobs : parallelism setting forwarded to ``cross_val_score`` (default -1).

    Returns
    -------
    float
        Mean of the per-fold ``rmsle`` scores. The scorer is built with the
        default ``greater_is_better``, so the value is not negated.
    """
    fold_scores = cross_val_score(
        model,
        X,
        np.log1p(y),
        scoring=make_scorer(rmsle),
        cv=cv,
        n_jobs=n_jobs,
    )
    return np.mean(fold_scores)

## Test

In [8]:
# Load the raw data (paths are relative to the working directory).
train = pd.read_csv("phone_train.csv")
test = pd.read_csv("phone_validation.csv")

# Separate the predictors from the target column 'y' in the training set.
X_train = rm_feature(train, ['y'])
y_train = train['y']
# Copy rather than alias, so later changes to X_test never reach the raw `test` frame.
X_test = test.copy()

In [9]:
# Object-dtype columns plus three columns that are treated as categorical
# even though dtype detection does not pick them up.
cat_features = [
    *get_categorical_features(X_train),
    'activation.zone',
    'activation.channel',
    'tariff.plan',
]
cat_features

['payment.method',
 'sex',
 'vas1',
 'vas2',
 'activation.zone',
 'activation.channel',
 'tariff.plan']

In [10]:
# Encode train and test together so both frames end up with the same dummy
# columns, then split the result back by position. pd.concat keeps the rows
# of X_train first, so the first n_train rows are exactly the training rows.
# No helper 'label' column is written into the input frames.
n_train = len(X_train)
concat_X = one_hot_encoding(pd.concat([X_train, X_test]), cat_features)

# .copy() gives each split its own data instead of a slice of concat_X.
X_train = concat_X.iloc[:n_train].copy()
X_test = concat_X.iloc[n_train:].copy()


In [11]:
# Summarise the encoded training frame (row count, column count, dtypes).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Columns: 119 entries, age to tariff.plan_8
dtypes: float64(19), int64(100)
memory usage: 9.2 MB


### Forward stepwise selection

In [12]:
# Forward stepwise selection with a linear model. "best" is forwarded to the
# selector as k_features instead of a fixed number of features.
model = LinearRegression()
sfs = stepwise_feature_selection(
    X_train,
    y_train,
    estimator=model,
    forward=True,
    n_features_to_select="best",
)
X_train_sfs = sfs.transform(X_train)

In [13]:
# Show the training columns kept by the forward selection.
X_train[list(sfs.k_feature_names_)]

Unnamed: 0,age,q01.out.ch.peak,q01.out.val.peak,q01.out.val.offpeak,q01.in.ch.tot,q01.in.dur.tot,q02.out.val.peak,q03.out.ch.peak,q04.out.ch.peak,q04.out.dur.peak,...,activation.zone_1,activation.zone_2,activation.zone_3,activation.zone_4,activation.channel_6,activation.channel_8,activation.channel_9,tariff.plan_3,tariff.plan_4,tariff.plan_8
0,34.29,82,30.1879,0.0000,57,4154,29.6808,107,156,14056,...,0,0,1,0,0,0,0,0,0,0
1,22.78,0,0.0000,0.0000,0,0,0.0000,0,45,2235,...,0,1,0,0,0,0,1,0,0,1
2,26.95,244,220.1713,0.0000,213,16351,332.0048,426,280,62654,...,0,0,1,0,0,0,0,0,0,0
3,46.09,331,119.8330,0.0000,555,41517,125.3498,248,233,20339,...,0,1,0,0,0,0,0,0,0,0
4,23.77,0,0.0000,0.0000,0,0,0.0000,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23.01,0,0.0000,0.0000,0,0,0.0000,6,31,1394,...,0,1,0,0,0,0,0,0,0,1
9996,44.72,24,13.2316,2.3788,13,1113,6.0322,34,76,6824,...,0,1,0,0,0,0,0,0,0,0
9997,17.47,0,0.0000,0.0000,0,0,0.0000,0,0,0,...,1,0,0,0,0,0,0,0,0,1
9998,21.67,0,0.0000,0.0000,0,0,0.0000,0,0,0,...,0,1,0,0,0,0,1,0,0,1


In [14]:
# Cross-validated RMSLE of a linear model on the selected feature subset.
# NOTE(review): the subset was chosen on this same training data, so the
# estimate is presumably optimistic — confirm before comparing models.
k_fold_cv(LinearRegression(), X_train_sfs, y_train)

2.4343114376108934

In [15]:
# Baseline for comparison: cross-validated RMSLE of a linear model on all encoded features.
k_fold_cv(LinearRegression(), X_train, y_train)

2.4643647357926572

### Prediction on X_test

In [16]:
# Apply the fitted selector's column selection to the test features.
X_test_sfs = sfs.transform(X_test)

In [17]:
# Refit on the whole training set against log(1 + y), then map the
# predictions back to the original scale and floor them at zero.
model = LinearRegression().fit(X_train_sfs, np.log(y_train + 1))
y_pred = np.maximum(0, np.exp(model.predict(X_test_sfs)) - 1)

In [18]:
# Write one prediction per line: no header row, no index column.
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv('mySubmission.txt', sep='\t', header=False, index=False)

## KNN regression

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor as KNN

In [35]:
# The scaler is a pipeline step, so it is fitted together with the KNN model
# every time the pipeline is fitted.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', KNN(n_neighbors=5)),
])

sfs_knn = stepwise_feature_selection(
    X_train,
    y_train,
    estimator=pipe,
    forward=True,
    n_features_to_select=10,
)



In [49]:
# Show the training columns kept by the KNN-based forward selection.
X_train[list(sfs_knn.k_feature_names_)]

Unnamed: 0,q04.out.ch.offpeak,q07.ch.sms,q09.out.ch.peak,q09.out.ch.offpeak,q09.out.dur.offpeak,q09.out.val.offpeak,activation.zone_0,activation.channel_6,tariff.plan_3,tariff.plan_4
0,0,1,77,0,0,0.0,0,0,0,0
1,0,0,16,0,0,0.0,0,0,0,0
2,0,1,343,0,0,0.0,0,0,0,0
3,0,0,261,0,0,0.0,0,0,0,0
4,0,0,0,0,0,0.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,0,2,5,0,0,0.0,0,0,0,0
9996,12,0,83,0,0,0.0,0,0,0,0
9997,0,0,9,0,0,0.0,0,0,0,0
9998,0,10,32,0,0,0.0,0,0,0,0


In [50]:
X_train_sfs_knn = sfs_knn.transform(X_train)

# NOTE(review): this pipeline uses n_neighbors=10, whereas the feature
# selection above ran with n_neighbors=5 — confirm the mismatch is intended.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', KNN(n_neighbors=10)),
])
k_fold_cv(model=pipe, X=X_train_sfs_knn, y=y_train)

2.1104713096906798

In [51]:
# Apply the KNN selector's column selection to the test features.
X_test_sfs_knn = sfs_knn.transform(X_test)

In [52]:
# Fit against log(1 + y), then map the predictions back to the original
# scale and floor them at zero.
pipe.fit(X_train_sfs_knn, np.log(y_train + 1))
y_pred = np.maximum(0, np.exp(pipe.predict(X_test_sfs_knn)) - 1)

In [53]:
# Write one prediction per line: no header row, no index column.
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv('mySubmission_knn.txt', sep='\t', header=False, index=False)

## TODO:
- Refactor code
- Try different values of K (of KNN) (cross validation? maybe too long to compute)
- Try to perform PCR
- Try Stepwise selection with different regressors