In [None]:
import pandas as pd
import pickle
import numpy as np
import os

from google.colab import files
import io
import time

In [None]:
# Make the Google Drive contents available inside the Colab runtime.
from google.colab import drive

_drive_mount_point = '/content/drive'
drive.mount(_drive_mount_point)

# Root folder of all project files. The trailing slash matters: file names are
# appended directly to this string.
baseDir = '/content/drive/My Drive/ETHZ/AML/Project_2/'

# Importing data

In [None]:
# Load the training features, the training labels and the test features.
# The 'id' column of every CSV becomes the DataFrame index.
# (Nothing is imputed here; the files are only read.)
D_X_df, D_y_df, D_test_df = (
    pd.read_csv(f'{baseDir}{csv_name}', index_col='id')
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

# Preview the first 20 training rows.
D_X_df.head(20)

# Balance data

In [None]:
from sklearn.utils import resample

# NOTE(review): the upsampling below runs before any cross-validation split, so
# copies of the same row can end up in both the training and the validation
# part of a fold. CV scores computed on this data are probably optimistic.

# Combine X and y sets. Work on a copy so that adding the label column does not
# mutate the feature frame that D_X_df currently refers to.
D_combined_df = D_X_df.copy()
D_combined_df['y'] = D_y_df.iloc[:, 0]  # aligned on the shared 'id' index

# Separate into class sets
D_class0_df = D_combined_df[D_combined_df['y'] == 0]
D_class1_df = D_combined_df[D_combined_df['y'] == 1]
D_class2_df = D_combined_df[D_combined_df['y'] == 2]

# Upsample classes 0 and 2 (sampling with replacement) to the size of class 1,
# which is left untouched. The target used to be hard-coded as 3600.
n_target = len(D_class1_df)
D_class0_resampled = resample(D_class0_df, replace=True, n_samples=n_target, random_state=123)
D_class2_resampled = resample(D_class2_df, replace=True, n_samples=n_target, random_state=123)

# Recombine. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported way to stack frames.
D_combined_df = pd.concat([D_class0_resampled, D_class1_df, D_class2_resampled])

# Separate X and y again. Sorting once before splitting guarantees that the
# rows of X and y stay in exactly the same order.
D_sorted_df = D_combined_df.sort_index()
D_y_df = D_sorted_df[['y']]
D_X_df = D_sorted_df.drop('y', axis='columns')

# Convert to NumPy arrays & standardize to mean = 0, std = 1

In [None]:
from sklearn import preprocessing

# Plain NumPy arrays for scikit-learn; the label frame is flattened to 1-D.
D_X = np.array(D_X_df)
D_y = np.ravel(np.array(D_y_df))
D_test = np.array(D_test_df)

# Fit the scaler on the training data and standardize it in the same call
# (previously the fit_transform result was thrown away and the data was
# transformed a second time).
scaler = preprocessing.StandardScaler()
D_X_stand = scaler.fit_transform(D_X)

# The test set is scaled with the mean/std learned from the training set, so
# both sets go through the identical transformation and no test-set statistics
# enter the model.
D_test_stand = scaler.transform(D_test)

# SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GridSearchCV

svm = SVC()

# One sub-grid per kernel family so that only parameters the kernel actually
# uses are varied: 'degree' only affects 'poly', and 'gamma' is not used by
# 'linear'. A single flat grid would refit identical models many times
# (32 candidates instead of the 18 distinct ones below).
params = [
    {'kernel': ['linear'], 'C': [1, 5]},
    {'kernel': ['poly'], 'C': [1, 5], 'degree': [1, 5], 'gamma': ['scale', 'auto']},
    {'kernel': ['rbf', 'sigmoid'], 'C': [1, 5], 'gamma': ['scale', 'auto']},
]

# Exhaustive search scored by balanced accuracy with 10-fold cross-validation.
# NOTE: the name `model` is rebound later by the per-fold training loop.
model = GridSearchCV(estimator=svm, param_grid=params, scoring='balanced_accuracy', cv=10)
model.fit(D_X_stand, D_y)

print("Mean test scores:")
print(model.cv_results_['mean_test_score'])

print("Best parameters:")
print(model.best_params_)

# stratified K-fold

In [None]:
from sklearn.model_selection import StratifiedKFold

# Ten shuffled, class-stratified folds; random_state fixes the shuffle seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

# Materialize the (train indices, held-out indices) pair of every fold so the
# same splits can be reused by later cells.
ind_splits_skf = [
    (train_idx, held_out_idx)
    for train_idx, held_out_idx in skf.split(D_X_stand, D_y)
]

# Feature selection - TODO?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Select features based on a fitted logistic-regression model, keeping at most
# 60 of them.
# NOTE(review): the selector is fitted on the complete training set, i.e. also
# on rows that later serve as held-out folds - confirm this is acceptable.
selector = SelectFromModel(
    estimator=LogisticRegression(max_iter=1000),
    max_features=60,
)
selector.fit(D_X_stand, D_y)

# Reduce train and test matrices to the selected columns. Both names are
# overwritten, so re-running this cell would select from the already reduced
# arrays.
D_X_stand, D_test_stand = (
    selector.transform(D_X_stand),
    selector.transform(D_test_stand),
)

# Fitting data - one vs one classifier

In [None]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier

# Number of CV folds. len() is used instead of np.shape(): the split list holds
# index arrays of different lengths (train vs. held-out), and recent NumPy
# versions refuse to turn such a ragged list into an array.
folds = len(ind_splits_skf)
models = []

# Train one one-vs-one linear SVM per fold, using only that fold's training rows.
for fold_train_idx, _fold_held_out_idx in ind_splits_skf:
  X_fold_fit = D_X_stand[fold_train_idx]
  y_fold_fit = D_y[fold_train_idx]

  model = OneVsOneClassifier(LinearSVC(random_state=1, multi_class = 'ovr', class_weight = 'balanced', max_iter = 100000)).fit(X_fold_fit, y_fold_fit)

  # Keep the models in fold order so models[i] belongs to ind_splits_skf[i].
  models.append(model)

# Prediction & BMAC on the stratified K-fold splits - pick the model whose score is closest to the median

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Score every fold model on the rows that were held out while it was trained.
BMAC = []
for fold_model, (_fit_idx, held_out_idx) in zip(models, ind_splits_skf):
  fold_score = balanced_accuracy_score(
      D_y[held_out_idx],
      fold_model.predict(D_X_stand[held_out_idx]),
  )
  BMAC.append(fold_score)

print(BMAC)

# Distance of each fold score from the median score; the first fold with the
# smallest distance is taken as the representative model.
medDif = np.abs(BMAC - np.median(BMAC))
index_best = np.flatnonzero(medDif == medDif.min())[0]

bestModel = models[index_best]

[0.6583333333333333, 0.6416666666666667, 0.6944444444444445, 0.6398148148148147, 0.6657407407407407, 0.6990740740740741, 0.6101851851851853, 0.6361111111111111, 0.6324074074074074, 0.6962962962962963]


In [None]:
# Index of the fold whose BMAC lies closest to the median BMAC.
# (np.where(np.median(BMAC)) only tests whether the median is non-zero, so for
# any non-zero median it returned array([0]) and overwrote index_best with a
# value unrelated to the scores.)
medDif = np.abs(np.asarray(BMAC) - np.median(BMAC))
index_best = int(np.argmin(medDif))  # first fold with the minimal distance
index_best

0

# Predict

In [35]:
# Predict the test labels with the model chosen in the BMAC step (bestModel).
# `model` is just whichever classifier was assigned last (the final CV fold),
# not the selected one.
result_r = np.asarray(bestModel.predict(D_test_stand))

# Sanity check: one prediction per test row, as a 1-D array.
print(result_r.shape)

(4100,)


# Convert NumPy arrays back to a pandas DataFrame

In [36]:
# Wrap the predictions in a DataFrame indexed by the test-set ids (cast to int).
test_ids = D_test_df.index.astype(int)
df_r = pd.DataFrame(result_r, index=test_ids)

# The submission frame is a copy that reuses the column labels of the training
# label frame.
df_out = df_r.copy()
df_out.columns = D_y_df.columns
df_out.head()

Unnamed: 0_level_0,y
id,Unnamed: 1_level_1
0,1
1,1
2,0
3,0
4,1


# Saving dataframe

In [37]:
import datetime

# Build a file name from the current Unix time (rounded to whole seconds) so
# that each run writes to its own file.
now = datetime.datetime.now()
unixTime = round(now.timestamp())

name = f'{baseDir}predictions/pred_{unixTime}.csv'

# Make sure the output folder exists; to_csv does not create missing folders.
os.makedirs(os.path.dirname(name), exist_ok=True)

df_out.to_csv(name, index=True, header = True, float_format='%.3f') #, compression='zip')

# Report success only after the file has actually been written.
print(f'File was saved under {name}')

File was saved under /content/drive/My Drive/ETHZ/AML/Project_2/predictions/pred_1604071245.csv
