In [None]:
# Import required modules
import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib as plt

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [None]:
# Load the raw training set: a zip-compressed CSV whose 'id' column becomes the index.
train = pd.read_csv(
    "../data/raw/train.csv.zip",
    index_col='id',
    compression="zip",
)

# Load the metadata CSV; later cells read its 'Variable' and 'Data Type' columns.
meta_data = pd.read_csv("../data/interim/training_metadata.csv")

In [None]:
# Column names grouped by the dtype recorded in the metadata table.
cat_names = meta_data.loc[meta_data['Data Type'] == 'object', 'Variable']
cont_names = meta_data.loc[
    (meta_data['Variable'] != 'loss') & (meta_data['Data Type'] == 'float64'),
    'Variable',
]

# Continuous columns excluded from the final continuous-name list.
cont_names_to_drop = [
    'cont12',  # highly correlated to cont11
    'cont9',   # highly correlated to cont1
]
cont_names_final = cont_names.loc[~cont_names.isin(cont_names_to_drop)]

In [None]:
import category_encoders as ce

# Binary-encode every categorical column listed in `cat_names`.
encoder = ce.BinaryEncoder()
categorical_data = encoder.fit_transform(train[cat_names])

# Use the filtered continuous columns: `cont_names_final` omits the columns in
# `cont_names_to_drop` (cont12, cont9). The previous version selected
# `cont_names` here, so the correlated columns were never actually dropped.
continuous_data = train[cont_names_final]

# Merge continuous features with the encoded categorical features, column-wise.
train_final1 = pd.concat([continuous_data, categorical_data], axis=1)

# NOTE: index=False means the 'id' index is not written to the file; any frame
# re-read from this CSV lines up with `train` by row position only.
train_final1.to_csv("../data/interim/train_binary_encoded.csv", index=False)

# SGD Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

# Define features/target here so this cell runs on a fresh kernel; previously
# X and y were only assigned in a later cell.
X = train_final1
y = train['loss']

# MAE is an error (lower is better). greater_is_better=False makes the scorer
# return the negated MAE, so anything that maximises the score (e.g. a grid
# search) minimises the error instead of maximising it.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)
clf = linear_model.SGDRegressor()

# Negate the scorer output so `scores` holds plain (positive) MAE values,
# one per fold of the 10-fold cross-validation.
scores = -cross_val_score(clf, X, y, cv=10, scoring=scorer)
print("MAE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Hyper-parameter grid for SGDRegressor: every loss/penalty combination.
# NOTE(review): newer scikit-learn releases rename 'squared_loss' and the
# string 'none' penalty — confirm these values against the installed version.
param_grid = [{'loss':['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
               'penalty':['none', 'l2', 'l1', 'elasticnet']
              }]

# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric
# must be supplied negated. The built-in 'neg_mean_absolute_error' scorer does
# that; a plain make_scorer(mean_absolute_error) would select the worst model.
gs = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1,
                  scoring='neg_mean_absolute_error')
gs = gs.fit(X, y)

# best_score_ is the negated MAE; flip the sign to report the error itself.
print("Best MAE: ", -gs.best_score_)
print("Best Parameters: ", gs.best_params_)
print("Best Estimator: ", gs.best_estimator_)

## TPOT

In [None]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

# Reload the binary-encoded features written by the encoding cell.
train_final1 = pd.read_csv("../data/interim/train_binary_encoded.csv")
X=train_final1#.values
y=train['loss']#.values

# X comes back from CSV without the 'id' index, so X and y are paired purely by
# row position; make sure they at least have the same number of rows.
assert len(X) == len(y), "feature matrix and target have different lengths"

# Work on a 10% train / 10% test subsample. A fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.1, test_size=0.1, random_state=42)

# 'neg_mean_absolute_error' is the scikit-learn scorer name for MAE (negated so
# that higher is better); the bare 'mean_absolute_error' name is no longer
# accepted. random_state makes the evolutionary search repeatable.
tpot = TPOTRegressor(generations=5, population_size=20, verbosity=3,
                     scoring='neg_mean_absolute_error', random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

# Write the best pipeline found as a standalone Python script.
tpot.export('../models/tpot_pipeline.py')