In [None]:
# Library Import Section
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
from sklearn.neural_network import MLPClassifier

In [None]:
# Install the Kaggle command-line client (-q keeps pip's output quiet); the
# `kaggle` shell command is used further down to download the competition data.
!pip install -q kaggle

In [None]:
# Upload the Kaggle API token and install it as ~/.kaggle/kaggle.json with
# owner-only permissions, which is where the Kaggle CLI reads it from.
from pathlib import Path

from google.colab import files

# files.upload() opens the browser upload dialog and returns a dict that maps
# each uploaded file name to its raw bytes.
uploaded = files.upload()

kaggle_dir = Path.home() / ".kaggle"
# exist_ok keeps the cell re-runnable: a plain `mkdir` fails with "File exists"
# on every run after the first one.
kaggle_dir.mkdir(parents=True, exist_ok=True)
token_path = kaggle_dir / "kaggle.json"

if uploaded:
    # Write the uploaded bytes directly instead of copying ./kaggle.json:
    # Colab renames a repeated upload (e.g. "kaggle (1).json"), so the file
    # called kaggle.json in the working directory may be an older token.
    # Only the first uploaded file is used.
    token_path.write_bytes(next(iter(uploaded.values())))
elif Path("kaggle.json").exists():
    # Nothing was uploaded this time: fall back to a token that is already
    # present in the working directory.
    token_path.write_bytes(Path("kaggle.json").read_bytes())

if token_path.exists():
    # Restrict the token to the owner (equivalent to `chmod 600`).
    token_path.chmod(0o600)

Saving kaggle.json to kaggle (1).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Download the data archive of the `rki-competition-01` competition with the
# Kaggle CLI. The CLI skips the download when a more recently modified local
# copy already exists (pass --force to download again).
!kaggle competitions download -c rki-competition-01

rki-competition-01.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip ./rki-competition-01.zip

Archive:  ./rki-competition-01.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sample_submission.csv   
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


In [None]:
# Paths of the extracted competition files.
datafile = "./train.csv"
datafile2 = "./test.csv"

# Read the training table and discard its first column.
df = pd.read_csv(datafile)
df = df.drop(columns=df.columns[0])

# Same treatment for the test table.
df2 = pd.read_csv(datafile2)
df2 = df2.drop(columns=df2.columns[0])

In [None]:
# Build the three inputs needed to predict the test labels:
# the training targets, the training features and the test features.
y_train = df['Label']
X_train = df.drop(columns='Label')
X_test = df2

In [None]:
# Summarise the training and testing sets: sample counts, feature count and
# the distinct class labels found in the training targets.
class_ids = np.unique(y_train)

n_samples_train, n_features = X_train.shape
n_samples_test = X_test.shape[0]
n_classes = len(class_ids)

print(f"Number of samples in training set: {n_samples_train:d} ")
print(f"Number of samples in the testing set: {n_samples_test:d} ")
print(f"Number of features: {n_features!s}")
print(f"Number of classes: {n_classes!s}")
print(f"IDs for class labels: {class_ids!s}")

Number of samples in training set: 1983 
Number of samples in the testing set: 496 
Number of features: 988
Number of classes: 3
IDs for class labels: [0 1 2]


In [None]:
# Standardise the features and reduce their dimensionality with Principal
# Component Analysis before training the classifier.
# A float n_components in (0, 1) makes PCA keep the smallest number of
# components whose cumulative explained variance reaches that fraction (69%).
pca = PCA(n_components=.69)

sc = StandardScaler()
# The scaler learns its mean/variance from the training set only and the very
# same statistics are applied to the test set. It must not be re-fit on the
# test data: the PCA below is fit on training-standardised features, so both
# sets have to go through the identical scaling for the projection to be valid.
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

pca.fit(X_train_std)

# Kept under their historical names; these are the standardised matrices that
# get projected onto the principal components.
X_train_proj_sklearn = X_train_std
X_train2 = pca.transform(X_train_proj_sklearn)
print("original X_train shape:   ", X_train.shape)
print("transformed X_train shape:", X_train2.shape)

X_test_proj_sklearn = X_test_std
X_test2 = pca.transform(X_test_proj_sklearn)
print("original X_test shape:   ", X_test.shape)
print("transformed X_test shape:", X_test2.shape)

original X_train shape:    (1983, 988)
transformed X_train shape: (1983, 74)
original X_test shape:    (496, 988)
transformed X_test shape: (496, 74)


In [None]:
# Train a multi-layer perceptron on the PCA-reduced training set, predict the
# test labels into y_pred, and estimate the accuracy with stratified 10-fold
# cross-validation on the training set.
# One hidden layer of 300 units; alpha is the strength of the L2 penalty.
# random_state fixes the weight initialisation and batch shuffling so that the
# predictions and the cross-validation score are reproducible across runs.
model = MLPClassifier(hidden_layer_sizes=(300,), alpha=.5, random_state=43)

model.fit(X_train2, y_train)
y_pred = model.predict(X_test2)

cv = StratifiedKFold(n_splits=10, random_state=43, shuffle=True)

# cross_val_score clones `model` for every fold, so the estimator fitted above
# is left untouched. NOTE: the PCA projection was fit on the full training set
# before the folds were split, so this estimate is somewhat optimistic.
cv_acc = cross_val_score(estimator=model, X=X_train2, y=y_train, cv=cv, n_jobs=-1)

print("Cross-validation: {:.5f}".format(np.mean(cv_acc)))



Cross-validation: 0.98083


In [None]:
# Preview of the predicted values for the test set. Only the first entries are
# shown; the complete array is written to the submission file below.
y_pred[:20]

array([2, 2, 2, 1, 0, 2, 2, 0, 0, 0, 1, 1, 2, 0, 2, 1, 1, 0, 0, 0, 2, 0,
       1, 2, 1, 0, 1, 0, 0, 1, 2, 0, 2, 1, 1, 2, 1, 1, 2, 0, 2, 0, 0, 2,
       2, 2, 2, 1, 2, 0, 0, 2, 1, 1, 0, 1, 1, 2, 2, 1, 1, 0, 2, 2, 0, 0,
       2, 1, 2, 1, 1, 2, 0, 0, 1, 2, 1, 2, 0, 0, 2, 2, 2, 2, 1, 2, 2, 2,
       0, 2, 1, 2, 2, 0, 0, 1, 1, 2, 0, 2, 0, 0, 1, 1, 1, 2, 0, 0, 1, 2,
       1, 0, 0, 2, 1, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 2, 1, 0, 0, 2,
       1, 2, 1, 0, 2, 2, 0, 2, 1, 2, 0, 1, 2, 2, 2, 0, 0, 1, 2, 2, 1, 1,
       0, 1, 2, 0, 2, 1, 0, 2, 2, 0, 1, 0, 0, 2, 0, 1, 1, 1, 1, 0, 2, 2,
       0, 0, 2, 0, 1, 0, 0, 0, 2, 1, 0, 0, 2, 2, 1, 0, 2, 1, 2, 1, 2, 1,
       2, 1, 2, 1, 2, 1, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 1, 2, 0, 1, 0,
       2, 0, 1, 1, 1, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 1, 1, 2, 2, 0, 1, 0,
       1, 2, 0, 1, 0, 1, 0, 2, 0, 2, 1, 2, 0, 0, 1, 2, 1, 2, 2, 2, 0, 0,
       2, 1, 2, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 2, 2, 1, 0, 2, 1, 1, 1, 1,
       2, 2, 0, 2, 1, 0, 1, 1, 2, 1, 1, 2, 1, 0, 2,

In [None]:
# Build the submission file: take the sample submission, swap its Label column
# for the predicted values in y_pred, and save it without the index.
datafile3 = "./sample_submission.csv"

submission_ids = pd.read_csv(datafile3).drop(columns='Label')
df3 = submission_ids.assign(Label=y_pred)

df3.to_csv('submission.csv', index=False)