In [22]:
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
import numpy as np
from sklearn import svm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv("../data/X_train.csv").drop('id', axis=1)
test = pd.read_csv("../data/X_test.csv").drop('id', axis=1)
y = pd.read_csv("../data/y_train.csv").y

In [3]:
scaler = StandardScaler().fit(pd.concat([train,test]))
train_ = scaler.transform(train)
train_ = pd.DataFrame(train_, columns=train.columns)
test_ = scaler.transform(test)
test_ = pd.DataFrame(test_, columns=train.columns)
train_.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x990,x991,x992,x993,x994,x995,x996,x997,x998,x999
0,-1.813592,1.596333,-1.577493,1.31679,1.224979,0.48139,-0.260872,1.040312,1.630821,4.411101,...,-0.41706,1.870623,-1.150704,-0.370867,3.883882,-2.434579,-3.003603,2.826932,-2.295371,2.503751
1,0.621952,1.622341,0.27669,0.507426,0.946101,0.113776,0.648369,0.924533,-0.439037,-0.073765,...,0.311276,-0.489654,0.176729,1.160717,-2.445714,0.433385,0.124162,-1.343699,0.363684,0.397675
2,-0.694772,-1.264971,0.192493,1.198083,1.785248,-0.303032,1.396465,0.768764,0.407531,0.290961,...,-0.284238,-0.612937,0.10679,0.082038,-1.647803,1.015443,1.116481,-0.295231,0.702944,-0.530188
3,1.380466,-1.332979,-0.868456,-0.254339,-0.704426,-1.653814,0.64819,-0.436893,0.92444,1.398371,...,1.229356,1.427286,-0.992263,-0.233272,-1.001601,-1.132879,-0.929947,0.450716,-0.30182,1.32629
4,-0.461229,-0.997129,0.816687,-1.349971,-0.346954,-0.71683,0.950842,-0.604793,-0.86613,-0.634463,...,-0.228485,-2.000328,0.28205,0.640746,-0.82214,0.161272,0.673704,-0.630556,-0.012413,0.127628


In [4]:
# Get balanced accuracy score
def score(true, pred):
    return balanced_accuracy_score(true, pred)

# Oversample using SMOTE
def oversample(x_data, y_data):
    smote = SMOTE(ratio='not majority', random_state=42)
    return smote.fit_sample(x_data, y_data)

# Oversample and fit model for a CV split
def run_fold(x_train, y_train, x_test, y_test, model):
    # Oversample data
    #x_train_sm, y_train_sm = oversample(x_train, y_train)
    x_train_sm, y_train_sm = x_train, y_train
    # Fit model
    model.fit(x_train_sm, y_train_sm)
    y_train_pred = model.predict(x_train_sm)
    train_score = score(y_train_sm, y_train_pred)
    y_test_pred = model.predict(x_test)
    test_score = score(y_test, y_test_pred)
    return test_score, train_score

def cross_validate(x_data, y_data, model, variable):
    # Split data into folds
    n_splits = 10
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = kf.split(x_data)
    test_scores = []
    train_scores = []
    times = []
    split = 0
    for train_index, test_index in folds:
        split += 1
        #print('Running split {}/{}'.format(split, n_splits))
        x_train = x_data[train_index]
        y_train = y_data[train_index]
        x_test = x_data[test_index]
        y_test = y_data[test_index]
        start_time = time.time()
        test_score, train_score = run_fold(x_train, y_train, x_test, y_test, model)
        end_time = time.time()
        total_time = round(end_time-start_time, ndigits=0)
        test_scores.append(test_score)
        train_scores.append(train_score)
        times.append(end_time-start_time)
        #print('Test score = {}\nTrain score = {}\nTime = {}s\n'.format(test_score, train_score, total_time))
    print('Average test score: {}\nAverage train score: {}\nTotal time: {}s'.format(np.mean(test_scores), np.mean(train_scores), np.sum(times)))
    return test_scores, train_scores, times

In [16]:
def cross_validate_pca(x_data, y_data, model, i):
    # Split data into folds
    n_splits = 10
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = kf.split(x_data)
    test_scores = []
    train_scores = []
    times = []
    split = 0
    for train_index, test_index in folds:
        split += 1
        #print('Running split {}/{}'.format(split, n_splits))
        x_train = x_data[train_index]
        y_train = y_data[train_index]
        x_test = x_data[test_index]
        y_test = y_data[test_index]
        pca = PCA(n_components = np.round(i,2))
        pca.fit(pd.concat([pd.DataFrame(x_train),pd.DataFrame(x_test)]).values) 
        x_train_ = pd.DataFrame(pca.transform(x_train))
        x_test_ = pd.DataFrame(pca.transform(x_test))
        start_time = time.time()
        test_score, train_score = run_fold(x_train_, y_train, x_test_, y_test, model)
        end_time = time.time()
        total_time = round(end_time-start_time, ndigits=0)
        test_scores.append(test_score)
        train_scores.append(train_score)
        times.append(end_time-start_time)
        #print('Test score = {}\nTrain score = {}\nTime = {}s\n'.format(test_score, train_score, total_time))
    print('Average test score: {}\nAverage train score: {}\nTotal time: {}s'.format(np.mean(test_scores), np.mean(train_scores), np.sum(times)))
    return test_scores, train_scores, times

In [None]:
rf = RandomForestClassifier(n_estimators=100)
test_scores, train_scores, times = cross_validate(train_.values, y.ravel(), rf)

In [None]:
poly = PolynomialFeatures(2,interaction_only = True)
poly = poly.fit(pd.concat([train_,test_]).values)

In [None]:
poly_tr = pd.DataFrame(poly.transform(train_))
poly_ts = pd.DataFrame(poly.transform(test_))

In [None]:
poly_tr.shape

### PCA

In [5]:
pca = PCA(n_components = 0.74)
pca.fit(pd.concat([train_,test_]).values)  
len(pca.explained_variance_ratio_)
# pca = PCA(n_components = 0.8)
# pca.fit(pd.concat([poly_tr,poly_ts]).values)  
# len(pca.explained_variance_ratio_)

158

In [6]:
components = pca.transform(train_)
components_df = pd.DataFrame(components)

In [None]:
components_df.iloc[:,0:2].plot.scatter(x=0,y=1,c=y)

In [17]:
def scores_pca(tr,ts,y,clf):
    test_scores = list()
    train_scores = list()
    times_iters = list()
    for i in np.arange(0.7,0.9,0.01,dtype=float):
        print('Start of {}'.format(np.round(i,2)))
        test_scores, train_scores, times = cross_validate_pca(tr.values, y.ravel(), clf, i)
        test_scores.append(np.mean(test_scores))
        train_scores.append(np.mean(train_scores))
        times_iters.append(np.sum(times))
        print('End of {}, time: {}s'.format(np.round(i,2),np.sum(times)))
    return test_scores, train_scores, times_iters

In [11]:
clf = svm.SVC(gamma = 'scale' ,kernel = 'rbf', decision_function_shape='ovo', cache_size=3000, random_state = 42, class_weight = 'balanced')

In [18]:
test_scores, train_scores, times_iters = scores_pca(train_,test_,y,clf)

Start of 0.7
Average test score: 0.6956737045285502
Average train score: 0.808815296826331
Total time: 67.50490593910217s
End of 0.7, time: 67.50490593910217s
Start of 0.71
Average test score: 0.6900015698639371
Average train score: 0.8116032885591732
Total time: 72.07505416870117s
End of 0.71, time: 72.07505416870117s
Start of 0.72
Average test score: 0.6912694428369373
Average train score: 0.8158177683694114
Total time: 78.88805198669434s
End of 0.72, time: 78.88805198669434s
Start of 0.73
Average test score: 0.6904153009117893
Average train score: 0.8193354633369117
Total time: 85.02613401412964s
End of 0.73, time: 85.02613401412964s
Start of 0.74
Average test score: 0.6948273467943548
Average train score: 0.8232173592620053
Total time: 93.89329981803894s
End of 0.74, time: 93.89329981803894s
Start of 0.75
Average test score: 0.6961116081260433
Average train score: 0.8280202202017992
Total time: 101.05408191680908s
End of 0.75, time: 101.05408191680908s
Start of 0.76
Average test sc

### Grid Search CV

In [24]:
pipe_steps = [('scaler', StandardScaler()), ('pca',PCA(n_components=0.79)), ('SVM', svm.SVC(kernel = 'rbf', 
                                                                decision_function_shape='ovo', 
                                                                cache_size=3000, random_state = 42,
                                                                class_weight = 'balanced'))]
pipeline = Pipeline(pipe_steps)

check_params = {
    'SVM__C':[0.1,0.5,1,10,30,50],
    'SVM__gamma':[0.001,0.005,0.01,0.05,0.07,0.1,0.5,1,5,10,'scale']
}

In [None]:
create_grid = GridSearchCV(pipeline, param_grid=check_params, cv=10, scoring='balanced_accuracy')
create_grid.fit(train,y)
print("Best fit")
print(create_grid.best_params_)

In [None]:
clf = svm.SVC(gamma = 'scale' ,kernel = 'rbf', decision_function_shape='ovo', cache_size=3000, random_state = 42, class_weight = 'balanced')
test_scores, train_scores, times = cross_validate(components_df.values, y.ravel(), clf)

#WITH OVERSAMPLING
# SVM 240 PCs (80.x%)
# Average test score: 0.6624038730845622
# Average train score: 0.9343450326600132
# Total time: 493.02259135246277s
# SVM PCs (90%)
# Average test score: 0.649990375723468
# Average train score: 0.9632699801869032
# Total time: 975.6408638954163s
# SVM 200 PCs (<80%)
# Average test score: 0.6607910083259538
# Average train score: 0.9231407317122418
# Total time: 410.38625621795654s
# SVM 238 PCs (80%)
# Average test score: 0.6625778192190811
# Average train score: 0.9326657068920167
# Total time: 489.1878070831299s

#WITH CLASS WEIGHTS
#80%
# Average test score: 0.6980772066781805
# Average train score: 0.8506625620915867
# Total time: 172.54561042785645s
#70%
# Average test score: 0.6946741811444423
# Average train score: 0.8101701484126517
# Total time: 80.19191813468933s
#75%
# Average test score: 0.6987988339325073
# Average train score: 0.8314035482105762
# Total time: 115.05265045166016s

In [None]:
# x_ov, y_ov = oversample(components_df, y)
# clf.fit(x_ov, y_ov)
clf.fit(components_df.values, y.ravel())

In [None]:
def plot_contours(ax, clf_, xx, yy, **params):
    Z = np.array([xx.ravel(), yy.ravel()] + [np.repeat(0, xx.ravel().size) for _ in range(236)]).T
    Z = clf_.predict(Z).reshape(xx.shape) 
    out = ax.contourf(xx, yy, Z, **params)
    return out

def make_meshgrid(x, y, h=.1):
    x_min, x_max = x.min() - 1 , x.max() + 1
    y_min, y_max = y.min() - 1 , y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
    return xx, yy


In [None]:
X0 = np.array(components_df.iloc[:,0])
X1 = np.array(components_df.iloc[:,1])
xx, yy = make_meshgrid(X0,X1)
len(xx.ravel())

In [None]:
X0 = np.array(components_df.iloc[:,0])
X1 = np.array(components_df.iloc[:,1])
xx, yy = make_meshgrid(X0,X1)

fig, ax = plt.subplots(figsize=(12,9))
fig.patch.set_facecolor('white')
cdict1={0:'lime',1:'deeppink', 2:'purple'}

Y_tar_list = y.tolist()
labels1 = Y_tar_list

labl1 = {0:'0', 1:'1', 2:'2'}
marker1= {0:'*',1:'d',2:'h'}
alpha1={0:.6, 1:0.3, 2:0.6}

for l1 in np.unique(labels1):
    ix1 = np.where(labels1==l1)
    ax.scatter(X0[ix1],X1[ix1], c=cdict1[l1], label=labl1[l1],s=70, 
               marker=marker1[l1], alpha=alpha1[l1])

ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=40,
          facecolors='none', edgecolors='navy', label='Support Vectors', alpha=0.5)

plot_contours(ax, classify, xx, yy, cmap='seismic', alpha=0.4)
plt.legend(fontsize=15)

plt.xlabel("1st Principal Component", fontsize=14)
plt.ylabel("2nd Principal Component", fontsize=14)

plt.show()

### TEST file

In [None]:
components_test = pca.transform(test_)
components_test_df = pd.DataFrame(components_test)

In [None]:
test_y = clf.predict(components_test_df)
file_test = "svm_pca_cw"

In [None]:
output = pd.DataFrame({'id':[float(i) for i in range(0,len(test_y))],'y':test_y})
output.to_csv("{}.csv".format(file_test),index=False)