# Simple Prediction

In [77]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

from tempfile import mkdtemp
from shutil import rmtree

from matplotlib import pyplot as plt
%matplotlib inline

In [78]:
def submit(model, test_data, file_name='prediction.csv', predict_proba=False):
    if predict_proba:
        y_hat = model.predict_proba(test_data)[:, 1]
    else:
        y_hat = model.predict(test_data)
        
    submit = pd.DataFrame(y_hat, index=test.index, columns=['TARGET'])
    submit.to_csv(f'submit/{file_name}', index=True)

In [79]:
train = pd.read_csv('clean_data/train_eda.csv', index_col=0)
test = pd.read_csv('clean_data/test_eda.csv', index_col=0)

In [101]:
estimators = [
    ('impute', Imputer(strategy='median')),
    ('scale', MinMaxScaler(feature_range=(0,1))),
    ('logistic regression', LogisticRegression(penalty='l1'))
]

p1 = Pipeline(estimators)

In [102]:
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=50,
    n_jobs=-1
)

estimators = [
    ('impute', Imputer(strategy='median')),
    ('scale', MinMaxScaler(feature_range=(0,1))),
    ('random_forest', rf)
]

p2 = Pipeline(estimators)

In [103]:
estimators = [
    ('voting', VotingClassifier(estimators=[
        ('p1', p1),
        ('p2', p2)
    ],
    voting='soft',
    weights=[.7, .3]))
]

pipe = Pipeline(estimators)

In [90]:
#cachedir = mkdtemp()
#pipe = Pipeline(estimators, memory=cachedir)

In [91]:
y_train = train.TARGET
X_train = train.drop(columns='TARGET')

In [105]:
scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring='roc_auc')
scores.mean()

0.74377291422214975

In [107]:
y = train.TARGET
X = train.drop(columns='TARGET')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

pipe.fit(X_train, y_train)

y_hat = pipe.predict_proba(X_train)[:, 1]
roc_auc_score(y_train, y_hat)

0.99110593383218792

## Submit

In [74]:
submit(pipe, test, 'logistic_regression_baseline.csv', predict_proba=True)