In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.metrics import balanced_accuracy_score 
bmac = balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier 

from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 
%matplotlib notebook
%matplotlib inline
from matplotlib import pyplot as plt

In [3]:
X_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
X_test_file  = 'X_test.csv'

labels_df = pd.read_csv(y_train_file)
bare_df   = pd.read_csv(X_train_file)
test_df   = pd.read_csv(X_test_file)

bare_df.insert(0,'y',labels_df['y'])

labels_df = labels_df.set_index('id')
bare_df   = bare_df  .set_index('id')
test_df   = test_df  .set_index('id')

In [4]:
print('bare_df.shape =', bare_df.shape)
print('test_df.shape  =', test_df.shape)
print('y distribution: ')
print(bare_df['y'].value_counts())
print('We are told that \"Test set has the same class imbalance as the training set.\"')

bare_df.shape = (4800, 1001)
test_df.shape  = (4100, 1000)
y distribution: 
1    3600
2     600
0     600
Name: y, dtype: int64
We are told that "Test set has the same class imbalance as the training set."


In [5]:
def mean_std_normalise(df):
    return (df-df.mean())/df.std()

def min_max_normalise(df):
    return (2*df-df.min()-df.max())/(df.max()-df.min())

In [6]:
def oh(y):
    y0 = y == 0
    y1 = y == 1
    y2 = y == 2
    return np.stack((y0,y1,y2),axis=1)

In [6]:
raise Exception('do the oversampling after train/test split')
# Class count
count_1, count_2, count_0 = bare_df['y'].value_counts()

# Divide by class
df_0 = bare_df[bare_df['y'] == 0]
df_1 = bare_df[bare_df['y'] == 1]
df_2 = bare_df[bare_df['y'] == 2]
# Let's try balancing out the classes with over-sampling
df_0_over = df_0.sample(count_1, replace=True)
df_2_over = df_2.sample(count_1, replace=True)
over_df = pd.concat([df_0_over, df_1, df_2_over], axis=0)
# I expect this to be bigger, count1*3 = 10800
over_df.shape

# so, BMAC didn't change, but we also made a mistake
# we should be oversampling on the test set too
# what to expect? if we get better will BMAC reward us for well handled class imbalance?
# Does NB classifier even care about sample repetitions? 
# I think yes

Exception: do the oversampling after train/test split

In [7]:
#Normalizing and preparing feature and label matrices
nrm_df = mean_std_normalise(bare_df.drop(columns='y'))
nrm_df['y'] = bare_df['y']

# dividing X, y into train and test data 
df_train, df_test = train_test_split(nrm_df)


In [8]:
#Oversampling the training set only
# Class count
count_1, count_2, count_0 = df_train['y'].value_counts()

# Divide by class
df_0 = df_train[df_train['y'] == 0]
df_1 = df_train[df_train['y'] == 1]
df_2 = df_train[df_train['y'] == 2]
# Let's try balancing out the classes with over-sampling
df_0_over = df_0.sample(count_1, replace=True)
df_2_over = df_2.sample(count_1, replace=True)
over_df = pd.concat([df_0_over, df_1, df_2_over], axis=0)
# I expect this to be bigger, count1*3 = 10800
over_df.shape

# so, BMAC didn't change, but we also made a mistake
# we should be oversampling on the test set too
# what to expect? if we get better will BMAC reward us for well handled class imbalance?
# Does NB classifier even care about sample repetitions? 
# I think yes

(8124, 1001)

In [9]:
# ONE HOTS
X_train = df_train.drop(columns='y').values
y_train = oh(df_train['y'].values)
n_train = len(y_train)

X_test = df_test.drop(columns='y').values
y_test = oh(df_test['y'].values)
n_test = len(y_test)
print("train shape =", np.shape(X_train),'to',np.shape(y_train))
print("test shape  =", np.shape(X_test), 'to',np.shape(y_test))
print()
print('n0test = %.1f%%'%(100/n_test*np.sum(y_test[:,0])))
print('n1test = %.1f%%'%(100/n_test*np.sum(y_test[:,1])))
print('n2test = %.1f%%'%(100/n_test*np.sum(y_test[:,2])))
print()
print('n0train = %.1f%%'%(100/n_train*np.sum(y_train[:,0])))
print('n1train = %.1f%%'%(100/n_train*np.sum(y_train[:,1])))
print('n2train = %.1f%%'%(100/n_train*np.sum(y_train[:,2])))
print()

train shape = (3600, 1000) to (3600, 3)
test shape  = (1200, 1000) to (1200, 3)

n0test = 13.2%
n1test = 74.3%
n2test = 12.4%

n0train = 12.2%
n1train = 75.2%
n2train = 12.5%



In [10]:
X_train = nrm_df.drop(columns='y').values
y_train = nrm_df['y'].values
n_train = len(y_train)

X_test = df_test.drop(columns='y').values
y_test = df_test['y'].values
n_test = len(y_test)
print("train shape = ", np.shape(X_train))
print("test shape  = ", np.shape(X_test))
print()
print('n0test = %.1f%%'%(100/n_test*np.sum((y_test == 0))))
print('n1test = %.1f%%'%(100/n_test*np.sum((y_test == 1))))
print('n2test = %.1f%%'%(100/n_test*np.sum((y_test == 2))))
print()
print('n0train = %.1f%%'%(100/n_train*np.sum((y_train == 0))))
print('n1train = %.1f%%'%(100/n_train*np.sum((y_train == 1))))
print('n2train = %.1f%%'%(100/n_train*np.sum((y_train == 2))))
print()

train shape =  (4800, 1000)
test shape  =  (1200, 1000)

n0test = 13.2%
n1test = 74.3%
n2test = 12.4%

n0train = 12.5%
n1train = 75.0%
n2train = 12.5%



In [28]:
# training a DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier(presort = True, max_depth = 8, class_weights=(0.125,0.75,0.125)).fit(X_train, y_train) 

TypeError: __init__() got an unexpected keyword argument 'class_weights'

In [11]:
# training a linear SVM classifier 
from sklearn.svm import SVC 
svm_model_linear = SVC(gamma='scale',class_weight={0:1/.125, 1:1/.75, 2:1/.125}, probability = True, kernel = 'rbf', C = 1).fit(X_train, y_train) 

In [None]:
# training a KNN classifier 
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train)

In [None]:
# training a Naive Bayes classifier 
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB().fit(X_train, y_train) 

In [45]:
dtree_predictions = dtree_model.predict(X_test) 
dtree_train_pred  = dtree_model.predict(X_train)
BMAC = balanced_accuracy_score(y_test, dtree_predictions)
tsBMAC = balanced_accuracy_score(y_train, dtree_train_pred )
print('BMAC =',BMAC)
print('trainset BMAC =', tsBMAC)

NameError: name 'dtree_model' is not defined

In [12]:
svm_predictions = svm_model_linear.predict(X_test)
svm_train_pred  = svm_model_linear.predict(X_train)
BMAC = balanced_accuracy_score(y_test, svm_predictions)
tsBMAC = balanced_accuracy_score(y_train, svm_train_pred)
print('BMAC =',BMAC)
print('trainset BMAC =', tsBMAC)

BMAC = 0.9801943198804185
trainset BMAC = 0.9798148148148149


In [26]:
svm_predictions = svm_model_linear.predict(X_test)
svm_train_pred  = svm_model_linear.predict(X_train)
BMAC = balanced_accuracy_score(y_test, svm_predictions)
tsBMAC = balanced_accuracy_score(y_train, svm_train_pred)
print('BMAC =',BMAC)
print('trainset BMAC =', tsBMAC)
# SVC with RBF kernel, oversampled, prob = true

BMAC = 0.7065175567113098
trainset BMAC = 0.9743873907154291


In [23]:
svm_predictions = svm_model_linear.predict(X_test)
svm_train_pred  = svm_model_linear.predict(X_train)
BMAC = balanced_accuracy_score(y_test, svm_predictions)
tsBMAC = balanced_accuracy_score(y_train, svm_train_pred)
print('BMAC =',BMAC)
print('trainset BMAC =', tsBMAC)
# SVC with RBF kernel, imbalanced, prob = true

BMAC = 0.5670984409081278
trainset BMAC = 0.705071269823505


In [None]:
knn_predictions = knn.predict(X_test) 
knn_train_pred  = knn.predict(X_train) 
BMAC = balanced_accuracy_score(y_test, knn_predictions)
tsBMAC = balanced_accuracy_score(y_train, knn_train_pred)
print('BMAC =',BMAC)
print('trainset BMAC =', tsBMAC)

In [None]:
gnb_predictions = gnb.predict(X_test) 
gnb_train_pred  = gnb.predict(X_train)
BMAC = balanced_accuracy_score(y_test, gnb_predictions)
tsBMAC = balanced_accuracy_score(y_train, gnb_train_pred)
print('BMAC =',BMAC)
print('trainset BMAC =', tsBMAC)

In [None]:
dtree_proba       = dtree_model.predict_proba(X_test)
svm_proba         = svm_model_linear.predict_proba(X_test)
knn_proba         = knn.predict_proba(X_test)
gnb_proba         = gnb.predict_proba(X_test)
dtree_train_proba = dtree_model.predict_proba(X_train)
svm_train_proba   = svm_model_linear.predict_proba(X_train)
knn_train_proba   = knn.predict_proba(X_train)
gnb_train_proba   = gnb.predict_proba(X_train)


ensemble_pred = dtree_proba      +svm_proba      +knn_proba      +gnb_proba
ensemble_train_pred = dtree_train_proba+svm_train_proba+knn_train_proba+gnb_train_proba
print(bmac(oh(y_test,ensemble_pred)))
print(bmac(oh(y_train,ensemble_train_pred)))

In [None]:
#histograms show that NBClassifier is always very confident
proba = gnb.predict_proba(X_train)
y_test_oh = np.zeros((y_test.size, y_test.max()+1))
y_test_oh[np.arange(y_test.size),y_test] = 1
y_train_oh = np.zeros((y_train.size, y_train.max()+1))
y_train_oh[np.arange(y_train.size),y_train] = 1
print(proba - y_train_oh)

#plt.hist(proba-y_test_oh)


#plt.hist(y_train)
gnb.score(X_test, y_test)

In [49]:
X_graded = mean_std_normalise(test_df).values
y_graded = svm_model_linear.predict(X_graded)

In [37]:
print(y_graded)

[1 0 1 ... 1 0 1]


In [40]:
y_out_template = np.genfromtxt ('sample.csv', delimiter=",")[1:]

11.12.00.48_seb_vanilla_svm_rbf.csv


In [50]:
filename = datetime.now().strftime("%m.%d.%H.%M")+"_seb_vanilla_svm_rbf.csv"
print(filename)
y_out=y_out_template
y_out[:,1] = y_graded
np.savetxt(filename, y_out, delimiter=",",header="id,y",  comments='')

11.12.01.04_seb_vanilla_svm_rbf.csv


In [None]:
### Ignore BELOW here

In [None]:

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (7,5)

In [None]:
train_df.plot.scatter(x='x0', y='x1', c='y',colormap='viridis')
plt.show()

ok, so we have tons of dimensions and I want to get the useful ones.

PCA seems like a good idea, but remember that you have imbalanced classes!
Intuitively for me imbalanced classes will also imbalance PCA.
Does PCA make sense for classification? YES, but not for multiclass, as there is no induced ordering.

We have to take a 1-vs-X approach ALREADY...

To make things simple for my self I will start with

### PCA 1 vs 1 (between the two balanced classes, y =  0 and y = 2)

In [None]:
full_02 = train_df[train_df['y'] != 1]
print(full_02.shape)

In [None]:
def mean_std_normalise(df):
    return (df-df.mean())/df.std()

def min_max_normalise(df):
    return (2*df-df.min()-df.max())/(df.max()-df.min())

In [None]:
full_02 = mean_std_normalise(full_02)
full_02['y'] = min_max_normalise(full_02['y'])
full_02.describe()

In [None]:
X = full_02.drop(columns = 'y').values
y = full_02['y'].values
print('shape(X) = ', np.shape(X))
print('shape(y) = ', np.shape(y))

In [None]:
# PCA seems like a good idea, but remember that you have imbalanced classes!
# Intuitively for me imbalanced classes will also imbalance PCA.

from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
pca.fit(X)
princX = pca.transform(X)
plt.scatter(princX[:, 0], princX[:, 1], c=y, cmap=plt.cm.nipy_spectral,
           edgecolor='k')
plt.show()

In [None]:
from sklearn.cross_decomposition import CCA
princX = CCA(n_components=2).fit(X, y).transform(X)
print(princX)
plt.scatter(princX[:, 0], princX[:, 1], c=y)
plt.show()

In [None]:
import numpy as np
#from fancyimpute import simple_fill
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score


In [None]:
### 1. normalise

#X_in = X_imp;
#X_out = Xt_imp;
X_tot = X_imp
#X_tot = np.concatenate((X_in, X_out), axis=0)
scaler = StandardScaler()
X_tot = scaler.fit_transform(X_tot)
X_in = X_tot[:X_in.shape[0],:]
X_out = X_tot[X_in.shape[0]:,:]
print(X_in.shape)
print(X_out.shape)

In [None]:
import datetime
print(datetime.datetime.now())