In [85]:
import pandas as pd
import numpy as np
import itertools
from sklearn import preprocessing, svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
from sklearn.utils import shuffle


In [86]:
params = [
    {
        'svm__kernel': ['rbf'],
        'svm__gamma': [x ** y for x, y in zip([2] * 31, range(-15, 16, 1))],
        'svm__C': [x ** y for x, y in zip([2] * 31, range(-15, 16, 1))]
    }
]

params_test = [
    {
        'svm__kernel': ['rbf'],
        'svm__gamma': [1],
        'svm__C': [1]
    }
]

pipe = Pipeline(steps=[('normalize', preprocessing.Normalizer()), ('svm', svm.SVC())])

In [3]:
rawdata = pd.read_csv("../csv/expanded_dataset_v1.csv", encoding = 'utf8')

In [4]:
def search_train(data, N, c, pipe, params):
    sample_size = int(c * len(data.index))
    results = []
    search = GridSearchCV(pipe, params, n_jobs=-1, verbose=1, scoring='f1_macro', cv=10)
    for i in xrange(N):
        print("Run {} of {} with {}% of data".format(i+1, N, c))
        rows = np.random.choice(data.index.values, sample_size)
        
        train = data.ix[rows]
        test = data.ix[~data.index.isin(train.index)]
        
        indexes = train.axes[0]
        X = train.drop(['Class'], axis=1)
        Y = train['Class']
        classifier = search.fit(X, Y)
        
        x = test.drop(['Class'], axis=1)
        y = test['Class']
        
        y_pred = classifier.predict(x)
        
        res = {
            'indexes': indexes,
            'classifier': classifier,
            'y': y,
            'y_pred': y_pred,
            'N': N,
            'c': c
        }
        results.append(res)
    return results
        

In [6]:
res5 = search_train(rawdata, 1, 0.8, pipe, params)

Run 1 of 1 with 0.8% of data
Fitting 10 folds for each of 961 candidates, totalling 9610 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1688 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 3088 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 4888 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 7088 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 9466 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 9595 out of 9610 | elapsed:  1.6min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 9610 out of 9610 | elapsed:  1.6min finished


In [7]:
def metric_from_result(res, metric_func, **kwargs):
    return metric_func(res['y'], res['y_pred'], **kwargs)

In [8]:
[metric_from_result(x, accuracy_score) for x in res5]

[0.86864406779661019]

In [9]:
[metric_from_result(x, f1_score, average='macro') for x in res5]

[0.85739599342310768]

In [10]:
[metric_from_result(x, confusion_matrix) for x in res5]

[array([[ 97,  15,   0],
        [ 16, 233,  15],
        [  5,  11,  80]])]

In [11]:
res5[0]['classifier'].best_params_

{'svm__C': 8192, 'svm__gamma': 32, 'svm__kernel': 'rbf'}

In [95]:
def run_experiment(data, N, pipe, params):
    cs = np.arange(0.1,0.2,0.1)
    rows = []
    for c in cs:
        run_res = search_train(data, N, c, pipe, params)
        for r in run_res:
            cm = metric_from_result(r, confusion_matrix)
            
            r_dict = {
                "accuracy": metric_from_result(r, accuracy_score),
                "f1": metric_from_result(r, f1_score, average='macro'),
                "precision": metric_from_result(r, precision_score, average='macro'),
                "recall": metric_from_result(r, recall_score, average='macro'),
                "c": c,
                "best_params": str(r['classifier'].best_params_)
            }
            
            for i in range(len(cm)):
                for j in range(len(cm[0])):
                    key = "cm_{}{}".format(i, j)
                    value = cm[i][j]
                    r_dict.update({key:value})
                    
            rows.append(r_dict)
    return pd.DataFrame(rows)
        

In [96]:
res = run_experiment(rawdata, 1, pipe, params_test)

Run 1 of 1 with 0.1% of data
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


In [97]:
res

Unnamed: 0,accuracy,best_params,c,cm_00,cm_01,cm_02,cm_10,cm_11,cm_12,cm_20,cm_21,cm_22,f1,precision,recall
0,0.707212,"{'svm__C': 1, 'svm__kernel': 'rbf', 'svm__gamm...",0.1,49,189,0,0,510,0,0,83,98,0.611149,0.884058,0.58244


In [78]:
str({'hello': 'there'})

"{'hello': 'there'}"