In [26]:
pd.read_csv('./data/processed/y.csv')

Unnamed: 0,Promoted_or_Not
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,1


In [1]:
import os

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from joblib import dump, load

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [21]:
def load_processed_data():
    return {
        'x': pd.read_csv('./data/processed/x.csv'),
        'y': pd.read_csv('./data/processed/y.csv', index_col=None)
    }

dataset = load_processed_data()
x, y = dataset['x'], dataset['y']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    x, 
    y, 
    test_size=0.33, 
    random_state=42,
    stratify=y
)

ml_dataset = {
    'x_train': X_train,
    'x_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}

for n, d in ml_dataset.items():
    print(n, d.shape)

x_train (25669, 20)
x_test (12643, 20)
y_train (25669, 1)
y_test (12643, 1)


## Simple upsampling of the training data

In [4]:
mask = y_train == 1
pos_x, pos_y = X_train.loc[mask, :], y_train.loc[mask]

from sklearn.utils import resample

x_tr_re, y_tr_re = resample(pos_x, pos_y, n_samples=X_train.shape[0] - pos_x.shape[0])

#  check that we have same number of resampled positive class as negative
assert x_tr_re.shape == X_train.loc[~mask, :].shape

x_tr_re = pd.concat([x_tr_re, X_train.loc[~mask, :]], axis=0)
y_tr_re = pd.concat([y_tr_re, y_train.loc[~mask]], axis=0)

np.testing.assert_allclose(np.mean(y_tr_re), 0.5)

ValueError: Cannot index with multidimensional key

In [None]:
def fit_sklearn(model, X_train, y_train, X_test, y_test):
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    
    model.fit(X_train, y_train)

    predict_train = model.predict(X_train)

    f1_train = f1_score(y_train, predict_train, average='micro')
    print("TRAIN_RMSE: %f" % (f1_train))
    
    predict_test = model.predict(X_test)
    f1_test = f1_score(y_test, predict_test,average='micro')
    print("TEST_RMSE: %f" % (f1_test))
    
    return model, {'train': f1_train, 'test': f1_test}

In [None]:
model, res = fit_sklearn(
    LogisticRegression(solver='liblinear', max_iter=1000, C=0.1, tol=0.00001),
    x_tr_re, 
    y_tr_re,
    X_test,
    y_test
)

predict_test = model.predict(X_test)
np.mean(predict_test)

In [None]:
model, res = fit_sklearn(
    LogisticRegression(solver='liblinear', max_iter=1000, C=0.1, tol=0.00001),
    X_train, 
    y_train,
    X_test,
    y_test
)

#  non resampled model just predicts 0 all the time 
predict_test = model.predict(X_test)
np.mean(predict_test)

## Class weight

In [None]:
model, res = fit_sklearn(
    LogisticRegression(solver='liblinear', max_iter=1000, C=0.1, tol=0.00001, class_weight='balanced'),
    X_train, 
    y_train,
    X_test,
    y_test
)

predict_test = model.predict(X_test)
np.mean(predict_test)

In [None]:
model, res = fit_sklearn(
    LogisticRegression(solver='liblinear', max_iter=1000, C=0.1, tol=0.00001, class_weight='balanced'),
    x_tr_re, 
    y_tr_re,
    X_test,
    y_test
)

predict_test = model.predict(X_test)
np.mean(predict_test)

## Gridsearch

In [None]:
search = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=1000, C=0.1, tol=0.00001),
    param_grid={
        #'C': [0.01, 0.1, 0.5],
        'tol': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1],
    })

search.fit(x_tr_re, y_tr_re)

print(search.cv_results_.keys())
search.cv_results_['mean_test_score']

In [None]:
best = search.best_params_
best

In [None]:
d = pd.DataFrame(search.cv_results_['params'])
d.loc[:, 'mean_test_score'] = search.cv_results_['mean_test_score']

%matplotlib inline
d.plot(x='tol', y='mean_test_score')

## Let's try SMOTE

In [None]:
!pip install imblearn -q

In [None]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_tr_smote, y_tr_smote = sm.fit_sample(X_train, y_train)

In [None]:
fit_sklearn(
    LogisticRegression(solver='liblinear', max_iter=1000, C=10, tol=0.000001),
    x_tr_smote, 
    y_tr_smote,
    X_test,
    y_test
)

## Final model to disk

In [None]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_smote, y_smote = sm.fit_sample(x, y)

final, res = fit_sklearn(
    LogisticRegression(solver='liblinear', max_iter=1000, C=10, tol=0.000001),
    x_smote, 
    y_smote,
    x,
    y
)

In [None]:
os.makedirs('./models', exist_ok=True)

dump(final, './models/lr.joblib') 

In [None]:
!ls models