In [1]:
!git clone https://github.com/splendidcomputer/cora_project

Cloning into 'cora_project'...
remote: Enumerating objects: 240, done.[K
remote: Counting objects: 100% (150/150), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 240 (delta 94), reused 119 (delta 74), pack-reused 90[K
Receiving objects: 100% (240/240), 422.75 KiB | 5.87 MiB/s, done.
Resolving deltas: 100% (146/146), done.


In [2]:
import pandas as pd
import numpy as np

from keras.utils import np_utils

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from xgboost import XGBClassifier as XGB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.naive_bayes import MultinomialNB as MNB

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score



In [3]:
cora = pd.read_csv("./cora_project/cora.content", delimiter="\t", header=None)
y_labels = ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]

for i,label in enumerate(y_labels):
    cora[1434][cora[1434]==label] = i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cora[1434][cora[1434]==label] = i


In [4]:
X = np.asarray(cora.iloc[:,1:-1]).astype('float32')
y = np.asarray(cora.iloc[:,-1]).astype('float32')

In [5]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

In [6]:
list_models = [RF(), LDA(), QDA(), XGB(), SVC(kernel="linear"), SVC(kernel="rbf"), KNN(), LR(), GNB(), MNB()]

In [7]:
list_models_name = ["RF", "LDA", "QDA", "XGB", "SVM_Linear", "SVM_RBF", "KNN", "LR", "GNB", "MNB"]

In [None]:
kFold = StratifiedKFold(n_splits=10)
list_all_models_predictions = []
total_model_scores = []
for i,model in enumerate(list_models):
  model_predictions = []
  model_score = []
  counter = 1
  for train, test in kFold.split(X, encoded_Y):
    print("   Fold " + str(counter))
    counter+=1
    model.fit(X[train], encoded_Y[train])
    model_predictions.append(model.predict(X[test]))
    model_score.append([accuracy_score(encoded_Y[train],model.predict(X[train])), accuracy_score(encoded_Y[test],model.predict(X[test]))])

  list_all_models_predictions.append(model_predictions)
  total_model_scores.append(model_score)

  print("Model " + str(i+1) + " Trained!")

   Fold 1
   Fold 2
   Fold 3
   Fold 4
   Fold 5
   Fold 6
   Fold 7
   Fold 8
   Fold 9
   Fold 10
Model 1 Trained!
   Fold 1
   Fold 2
   Fold 3
   Fold 4
   Fold 5
   Fold 6
   Fold 7
   Fold 8
   Fold 9
   Fold 10
Model 2 Trained!
   Fold 1




   Fold 2




   Fold 3




   Fold 4




   Fold 5




   Fold 6




   Fold 7




   Fold 8




   Fold 9




   Fold 10




Model 3 Trained!
   Fold 1
   Fold 2
   Fold 3
   Fold 4
   Fold 5
   Fold 6
   Fold 7
   Fold 8
   Fold 9
   Fold 10


In [None]:
counter = 0
for list_predictions in list_all_models_predictions:
  final_predictions = []
  for fold_prediction in list_predictions:
    for item in fold_prediction:
      final_predictions.append(y_labels[item])
  list_ids = list(cora.iloc[:,0])
  d = {'IDs': list_ids, 'Predictions': final_predictions}
  df = pd.DataFrame(data=d)
  df.to_csv(list_models_name[counter] + "_final_predictions.tsv", sep="\t")
  counter+=1

In [None]:
counter = 0
d = {'Model': [], 'Fold_1': [], 'Fold_2': [], 'Fold_3': [], 'Fold_4': [], 'Fold_5': [], 'Fold_6': [], 'Fold_7': [], 'Fold_8': [], 'Fold_9': [], 'Fold_10': [],'Average': []}
df_test = pd.DataFrame(data=d)
df_train = pd.DataFrame(data=d)
for fold_scores in total_model_scores:
  fold_val_score_list_test = [list_models_name[counter]]
  fold_val_score_list_train = [list_models_name[counter]]
  for fold_score in fold_scores:
    fold_val_score_list_test.append(fold_score[1])
    fold_val_score_list_train.append(fold_score[0])
  fold_val_score_list_test.append(np.mean(np.array(fold_scores), axis=0)[1])
  fold_val_score_list_train.append(np.mean(np.array(fold_scores), axis=0)[0])
  # print(fold_val_score_list)
  df_test.loc[len(df_test)] = fold_val_score_list_test
  df_train.loc[len(df_train)] = fold_val_score_list_train
  counter+=1

In [None]:
df_test

In [None]:
df_train

In [None]:
df_train.to_csv("ML_Train_ACC.tsv", sep="\t")
df_test.to_csv("ML_Test_ACC.tsv", sep="\t")