# Simple Prediction

In [65]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt
%matplotlib inline

In [31]:
train = pd.read_csv('clean_data/train_eda.csv', index_col=0)
test = pd.read_csv('clean_data/test_eda.csv', index_col=0)

## Model Training and Validation

In [32]:
y = train.TARGET
X = train.drop(columns=['TARGET'])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [35]:
X_train_index = X_train.index
X_test_index = X_test.index

In [36]:
# features names
features = list(train.columns)

In [38]:
imputer = Imputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0,1))

In [39]:
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [40]:
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [41]:
print('Train: ', X_train.shape)
print('Test:  ', X_test.shape)

Train:  (52500, 294)
Test:   (22500, 294)


In [58]:
lr = LogisticRegression(penalty='l1')
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
y_hat = lr.predict(X_test)

In [62]:
roc_auc_score(y_test, y_hat)

0.5016996766102978

In [63]:
scores = cross_val_score(lr, X_test, y_test, cv=5, scoring='roc_auc')
scores.mean()

0.74431314966921058

## Write Submission

In [46]:
submit = pd.DataFrame(y_hat, index=X_test_index, columns=['TARGET'])
submit.to_csv('submit/lr_prediction.csv', index=True)