In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fix NumPy's global random seed so repeated runs draw the same random numbers.
SEED = 1
np.random.seed(SEED)

# Pre-process data

In [None]:
# Read the training and test tables from CSV files in the working directory.
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [None]:
# Show (rows, columns) for both tables. The call form of print with a single
# argument behaves the same on Python 2 and Python 3, whereas the bare
# statement form is a SyntaxError on Python 3.
print(train_data.shape)
print(test_data.shape)

In [None]:
# Preview the first rows of the training table.
train_data.head()

In [None]:
# Preview the first rows of the test table.
test_data.head()

In [None]:
# Split the target off the feature table: keep the "eyeDetection" column as the
# label series and remove it from train_data in place.
train_data_label = train_data["eyeDetection"]
del train_data["eyeDetection"]

In [None]:
# Count how many training rows carry each value of the eyeDetection label.
train_data_label.value_counts()

In [None]:
import pandas_profiling

# Build the profiling report for the training features; leaving the report as
# the cell's last expression makes the notebook render it.
raw_profile = pandas_profiling.ProfileReport(train_data)
raw_profile

In [None]:
# Box plot with one box per column of the unscaled training features.
fig, ax = plt.subplots(figsize=(14, 6))
train_data.boxplot(sym='k.', ax=ax)
plt.show()

In [None]:
from sklearn import preprocessing

# Fit a StandardScaler on the training features, then wrap the scaled values
# in a DataFrame that carries the original column names.
scaler_std = preprocessing.StandardScaler()
scaler_std.fit(train_data)
train_scaled_values = scaler_std.transform(train_data)
train_data_norm = pd.DataFrame(train_scaled_values, columns=train_data.columns)

In [None]:
# Box plot with one box per column of the standardised training features.
fig, ax = plt.subplots(figsize=(14, 6))
train_data_norm.boxplot(sym='k.', ax=ax)
plt.show()

In [None]:
from scipy import stats

# Keep only the rows in which every feature has an absolute z-score below 3;
# the same row mask is applied to the labels so features and labels stay in
# step. The shape is printed before and after to show how many rows were
# dropped.
print(train_data_norm.shape)
tmp = (np.abs(stats.zscore(train_data_norm)) < 3).all(axis=1)
# Summarise the mask instead of dumping one boolean per row into the output.
print("rows kept: %d of %d" % (tmp.sum(), len(tmp)))
train_data_norm = train_data_norm[tmp]
train_data_label = train_data_label[tmp]
print(train_data_norm.shape)

In [None]:
# Box plot with one box per column of the training features that remain after
# the outlier rows were removed.
fig, ax = plt.subplots(figsize=(14, 6))
train_data_norm.boxplot(sym='k.', ax=ax)
plt.show()

In [None]:
# Fit a second StandardScaler on the current training features and replace the
# frame with the re-scaled values.
scaler_std2 = preprocessing.StandardScaler()
train_data_norm = pd.DataFrame(
    scaler_std2.fit_transform(train_data_norm),
    columns=train_data.columns,
    # Carry the existing row index over to the rebuilt frame. Without it the
    # new frame gets a fresh 0..n-1 index, which no longer matches the index
    # of train_data_label once rows have been filtered out, so any
    # index-aligned pandas operation between the two would pair wrong rows.
    index=train_data_norm.index,
)

In [None]:
# Box plot with one box per column of the training features after the second
# standardisation pass.
fig, ax = plt.subplots(figsize=(14, 6))
train_data_norm.boxplot(sym='k.', ax=ax)
plt.show()

In [None]:
# Set the "index" column aside (it is removed from test_data in place), then
# push the remaining test columns through the two scalers that were fitted on
# the training data, in the order they were fitted.
test_index = test_data.pop("index")

test_feature_names = test_data.columns
test_first_pass = pd.DataFrame(scaler_std.transform(test_data), columns=test_feature_names)
test_data_norm = pd.DataFrame(scaler_std2.transform(test_first_pass), columns=test_feature_names)

In [None]:
# Box plot with one box per column of the scaled test features.
fig, ax = plt.subplots(figsize=(14, 6))
test_data_norm.boxplot(sym='k.', ax=ax)
plt.show()

In [None]:
# Same box plot of the scaled test features, with the y-axis limited to
# the range -4 to 6.
fig, ax = plt.subplots(figsize=(14, 6))
test_data_norm.boxplot(sym='k.', ax=ax)
ax.set_ylim(-4, 6)
plt.show()

In [None]:
# Profiling report of the training features in their current (filtered and
# re-scaled) form; rendered because it is the cell's last expression.
pandas_profiling.ProfileReport(train_data_norm)

# Model building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict

# Values tried for LogisticRegression's C parameter.
params = {'C':[0.001,0.01,0.1,1,10]}

clf_logistic = LogisticRegression()
GSCV = GridSearchCV(clf_logistic, params, cv=5)

# Accuracy of out-of-fold predictions: the whole grid search (itself 5-fold)
# is run inside each of the 5 outer folds.
predicted = cross_val_predict(GSCV, train_data_norm, train_data_label, cv=5)
print(metrics.accuracy_score(train_data_label, predicted))

# Run the grid search on the full training set. fit() starts from scratch, so
# the search object does not need to be constructed a second time.
GSCV.fit(train_data_norm, train_data_label)

# Single-argument print calls behave the same on Python 2 and Python 3.
print(GSCV.best_score_)
print(GSCV.best_estimator_)

In [None]:
# List each feature column next to the matching coefficient of the best
# logistic-regression model (first row of coef_). The call form of print works
# on both Python 2 and Python 3.
for feature, weight in zip(train_data_norm.columns, GSCV.best_estimator_.coef_[0]):
    print("%s %f" % (feature, weight))

### Make a submission to Kaggle!

In [None]:
# Fit the best estimator found by the grid search on the full training set and
# use it to label the normalised test rows.
best_model = GSCV.best_estimator_
best_model.fit(train_data_norm, train_data_label)
predictions = best_model.predict(test_data_norm)

In [None]:
# Assemble the submission table (row identifier plus predicted label) and
# write it to CSV without the DataFrame's own index. The column order is
# pinned explicitly so it does not depend on dict ordering.
submission = pd.DataFrame(
    {"index": test_index, "eyeDetection": predictions},
    columns=["index", "eyeDetection"],
)

submission.to_csv("mysubmission.csv", index=False)

### Very simple exercise!

In [None]:
from sklearn.linear_model import Perceptron

### Feature engineering!

In [None]:
# Row-wise summary statistics computed from the original feature columns only.
# Existing "newfeat*" columns are excluded from the inputs, so no statistic is
# ever computed over a previously added statistic (taking each one over the
# whole, growing frame would fold the variance into the mean, both into the
# sum, and so on), and re-running the cell yields the same values.
base_features = train_data_norm[
    [col for col in train_data_norm.columns if not str(col).startswith("newfeat")]
]

# ddof=0 gives the population variance, which is what np.var computes.
train_data_norm["newfeat1"] = base_features.var(axis=1, ddof=0)
train_data_norm["newfeat2"] = base_features.mean(axis=1)
# NOTE(review): with no missing values the row sum is just the row mean times
# the number of base columns, so newfeat3 duplicates newfeat2 up to scale.
train_data_norm["newfeat3"] = base_features.sum(axis=1)
train_data_norm["newfeat4"] = base_features.min(axis=1)
train_data_norm["newfeat5"] = base_features.max(axis=1)
# NOTE(review): test_data_norm is not given these columns here, so a model
# fitted on the extended training frame cannot score the test rows as-is.

In [None]:
# Repeat the 5-fold grid search over C on the current training frame.
params = {'C':[0.001,0.01,0.1,1,10]}

GSCV = GridSearchCV(clf_logistic, params, cv=5)
GSCV.fit(train_data_norm, train_data_label)

# Single-argument print calls behave the same on Python 2 and Python 3.
print(GSCV.best_score_)
print(GSCV.best_estimator_)