In [30]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [31]:
def KFoldSplit(X, y, n_splits=5):
    """Yield train/test partitions of ``X`` and ``y`` for every K-fold split.

    The folds come from ``sklearn.model_selection.KFold`` without shuffling,
    so the test parts are consecutive slices that together cover the samples
    in their original order.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis.
    y : array-like
        Targets, one per sample in ``X``.
    n_splits : int, default 5
        Number of folds. The default reproduces the five folds of the
        previous parameterless ``KFold()`` call.

    Yields
    ------
    tuple of np.ndarray
        ``(X_train, X_test, y_train, y_test)`` for one fold.
    """
    # Convert once so rows are always selected by position, even when a
    # pandas object with a non-default index is passed in.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X):
        # Integer-array indexing returns fresh arrays, one row per index.
        yield X[train_index], X[test_index], y[train_index], y[test_index]
        
def preprocess(filename, sep=',', random_state=None):
    """Load the labelled CSV and split it into a feature matrix and two targets.

    Rows containing any missing value are dropped, the remaining rows are
    shuffled, and the label / bookkeeping columns are removed from the
    feature matrix.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    sep : str, default ','
        Field delimiter passed to ``pd.read_csv``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``sklearn.utils.shuffle``. The default keeps the
        previous unseeded behaviour; pass an integer to get the same row
        order on every run.

    Returns
    -------
    X : np.ndarray
        All columns except the ones dropped below.
    Y_color : np.ndarray
        Values of the column at position 6.
    Y_texture : np.ndarray
        Values of the column at position 7.
    """
    df = pd.read_csv(filename, sep=sep).dropna()
    df = shuffle(df, random_state=random_state)

    # NOTE(review): the targets are picked by position while the same columns
    # are dropped by name below; presumably positions 6 and 7 are 'color' and
    # 'texture' -- confirm against the CSV header.
    Y_color = np.array(df.iloc[:, 6])
    Y_texture = np.array(df.iloc[:, 7])

    # Build the feature frame as a new object instead of mutating ``df``.
    X = df.drop(columns=['color', 'texture', 'image', 'id', 'x', 'y', 'w', 'h'])

    return np.array(X), Y_color, Y_texture

def preprocess_test(filename, sep=','):
    """Read the unlabelled CSV and return its feature columns as an array.

    The bookkeeping columns are removed first; afterwards every row that
    still holds a missing value is discarded, so the result can have fewer
    rows than the file.
    """
    meta_columns = ['image', 'id', 'x', 'y', 'w', 'h']
    features = pd.read_csv(filename, sep=sep).drop(columns=meta_columns)
    return np.array(features.dropna())

def KFold_train(X, y):
    """Cross-validate an RBF SVM and a logistic regression on ``(X, y)``.

    For every fold produced by ``KFoldSplit`` the features are standardised
    with statistics from the training part only, both classifiers are fitted
    with ``class_weight='balanced'``, and their balanced accuracy on the
    held-out part is printed. After the last fold the mean balanced
    accuracies and the confusion matrices over all held-out predictions are
    printed.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels, one per row of ``X``.

    Returns
    -------
    np.ndarray
        The SVM's held-out predictions, concatenated in fold order. As long
        as ``KFoldSplit`` uses unshuffled, consecutive folds this is the row
        order of ``X``. (Previously nothing was returned, so callers storing
        the result received ``None``.)
    """
    balanced_accuracies = []
    lr_accu = []
    all_true = []
    svm_pred = []
    lr_pred = []

    for X_train, X_test, y_train, y_test in KFoldSplit(X, y):
        # Fit the scaler on the training part only so that no statistics of
        # the held-out part leak into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # PCA and a C/gamma grid search were tried at this point earlier;
        # the SVC below runs with its default C and gamma.
        clf = svm.SVC(kernel='rbf', class_weight='balanced')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = round(balanced_accuracy_score(y_test, y_pred), 3)
        print("SVM Balanced Accuracy: ", accuracy)
        balanced_accuracies.append(accuracy)
        svm_pred.extend(y_pred)

        lr = LogisticRegression(class_weight='balanced', max_iter=10000)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        lr_accuracy = round(balanced_accuracy_score(y_test, y_pred), 3)
        print("LR Balanced Accuracy: ", lr_accuracy)
        lr_accu.append(lr_accuracy)
        lr_pred.extend(y_pred)

        all_true.extend(y_test)

    print('SVM Avg Balanced Accuracy: ', round(np.mean(balanced_accuracies), 3))
    print('LR Avg Balanced Accuracy: ', round(np.mean(lr_accu), 3))

    print('SVM Confusion Matrix: ', confusion_matrix(all_true, svm_pred))
    print('LR Confusion Matrix: ', confusion_matrix(all_true, lr_pred))

    return np.array(svm_pred)

In [32]:
def train(X_train, X_test, y_train):
    """Fit a class-balanced RBF SVM on standardised features and predict ``X_test``.

    The scaler is fitted on ``X_train`` only and then applied to both
    matrices. Returns the predicted labels for ``X_test``.
    """
    standardiser = StandardScaler().fit(X_train)
    train_scaled = standardiser.transform(X_train)
    test_scaled = standardiser.transform(X_test)

    model = svm.SVC(kernel='rbf', class_weight='balanced')
    model.fit(train_scaled, y_train)
    return model.predict(test_scaled)

def output(filename, y_pred):
    """Write the predictions to ``filename``, one per line.

    ``y_pred`` is converted to a NumPy array first; every element obtained by
    iterating over that array is formatted with ``str.format`` and followed
    by a newline. The file is opened in ``"w"`` mode, so existing content is
    replaced.
    """
    values = np.array(y_pred)
    with open(filename, "w") as handle:
        handle.writelines("{}\n".format(value) for value in values)


In [33]:
# Load the training features together with both label arrays, then the test
# features. preprocess() shuffles the rows, so the training row order changes
# from run to run unless it is seeded.
X, Y_color, Y_texture = preprocess("data/data_train.csv")
X_test = preprocess_test("data/data_test.csv")

In [34]:
# Cross-validate both classifiers on the color target; KFold_train prints the
# per-fold scores, the averages and the confusion matrices.
Y_color_pred = KFold_train(X, Y_color)
# Same evaluation for the texture target (disabled):
# Y_texture_pred = KFold_train(X, Y_texture)

# Fit on the full training set and predict the test set (disabled):
# y_pred = train(X, X_test, Y_color)
# y_pred = train(X, X_test, Y_texture)


SVM Balanced Accuracy:  0.186
LR Balanced Accuracy:  0.141
SVM Balanced Accuracy:  0.196
LR Balanced Accuracy:  0.137
SVM Balanced Accuracy:  0.182
LR Balanced Accuracy:  0.137
SVM Balanced Accuracy:  0.203
LR Balanced Accuracy:  0.157
SVM Balanced Accuracy:  0.213
LR Balanced Accuracy:  0.217
SVM Avg Balanced Accuracy:  0.196
LR Avg Balanced Accuracy:  0.158
SVM Confusion Matrix:  [[  0   2   0   1   2   1   0   0   0   0   0   0]
 [  0  33   8  14   7   5   0   0   0   0   6   0]
 [  0   8  30   8   5  15   0   0   0   0  21   0]
 [  0  23  23  58  18  74   0   0   0   0  22   0]
 [  0  14  17  21  13  18   0   0   0   0  18   0]
 [  0  20  35  37  23 152   0   0   0   0  29   0]
 [  0   0   1   6   2   4   0   0   0   0   5   0]
 [  0   1   1   3   5   1   0   0   0   0   6   0]
 [  0   3   1   2   0   0   0   0   0   0   0   0]
 [  0   5   0   6   2   0   0   0   0   0   3   0]
 [  0  25  32  49  30  29   0   0   0   0 311   0]
 [  0   3   1   2   2   3   0   0   0   0   9   0]]
LR

In [None]:
# Fit on the full training set and predict the test set right before writing,
# so this cell does not rely on a `y_pred` left over from an earlier kernel
# session. Pass Y_texture instead of Y_color to export texture predictions.
y_pred = train(X, X_test, Y_color)
output("pred.csv", y_pred)

In [None]:
# Display whatever KFold_train returned for the color target.
Y_color_pred