In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
import sklearn.preprocessing as pre # need OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA # Use to check feature importance?
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.neighbors import NearestNeighbors
#from sklearn.model_selection import GridSearchCV
from sktime.datasets import load_osuleaf
from tslearn.metrics import dtw
from sklearn.neighbors import KNeighborsClassifier # This is only used in part 2, since it's not allowed in part 1.

## First let's Prepare the data

In [2]:
df = pd.read_csv('cleveland.csv').replace({'ca': '?', 'thal': '?'},{'ca': '0.0', 'thal': '3.0'}).astype(float, errors='ignore')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1.0
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2.0
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3.0
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1.0


In [3]:
# categories: sex (1,0), cp (1,2,3,4), fbs (1,0), exang (1,0), thal (3,6,7), num (0,{1,2,3,4})

In [4]:
# Change num to either 0 or 1.

df['num'] = [0.0 if df['num'][i] == 0.0 else 1.0 for i in range(df.shape[0])]

In [5]:
df2 = pd.get_dummies(df, columns=['sex','cp','fbs','exang','thal'])

In [6]:
# create the X and y pairs.
Xdf = df2.drop(columns=['num'])
Ydf = df['num']
X = Xdf.to_numpy()
y = Ydf.to_numpy()

In [7]:
# Prepare to standardize the data.
myscaler = pre.StandardScaler(copy=False)

In [8]:
# scale the data
myscaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
# What are the more important features?
pca1 = PCA(n_components=X_train.shape[1])

In [10]:
pca1.fit(X_train)

PCA(n_components=21)

In [11]:
pca1.singular_values_

array([3.20079218e+01, 2.29636027e+01, 2.23143922e+01, 1.88413585e+01,
       1.77355047e+01, 1.71818558e+01, 1.62948592e+01, 1.55678708e+01,
       1.46068733e+01, 1.42664357e+01, 1.38081854e+01, 1.30447824e+01,
       1.24200321e+01, 1.15874077e+01, 9.43488566e+00, 8.88520073e+00,
       5.29431176e-15, 4.21042736e-15, 3.44436481e-15, 2.91416101e-15,
       2.35420553e-15])

In [12]:
# Here's how we'll predict using NearestNeighbors.
def kPredict(indices,y):
    ypreds = []
    for mylist in indices:
        preds = [y[i] for i in mylist]
        pred = max(set(preds), key=preds.count)
        ypreds += [pred]
        
    return ypreds

In [13]:
# Here is our monte carlo cross validation.
def myCV(Xdf,Ydf,k,p,cv=10):
    # already perfomed 1-hot encoding, gotten rid of '?' and converted to float.
    X = Xdf.to_numpy()
    y = Ydf.to_numpy()
    myscaler = pre.StandardScaler(copy=False)
    myscaler.fit_transform(X)
    scores=[]
    for v in range(cv):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/cv)
        pca = PCA(n_components=p)
        pca.fit(X_train) # fit according to the training data
        Xtr = pca.transform(X_train) # transform the training data
        Xte = pca.transform(X_test) # transform the test data for later use.
        knn = NearestNeighbors(n_neighbors=k)
        knn.fit(Xtr, y_train)
        dists, neighs = knn.kneighbors(Xte) # find the test sets' nearest neighbors in the training set.
        preds = kPredict(neighs,y_train)
        #PrecisionScore = precision_score(preds,y_test)
        #RecallScore = recall_score(preds,y_test)
        F1Score = f1_score(preds,y_test) # compute the f1 score.
        #AccuracyScoreScore = accuracy_score(preds,y_test)
        scores += [F1Score]
        
    avg_score = sum(scores)/len(scores)
    return [k,p,avg_score]

In [14]:
# Test it:
myCV(Xdf,Ydf,5,5)

[5, 5, 0.7417161040610198]

In [15]:
# Let's see how many principal components and number "k" is ideal.
def myGridSearch(Xdf,Ydf,list1,list2,cv=10):
    output = []
    for k in list1:
        for p in list2:
            output += [myCV(Xdf,Ydf,k,p,cv=cv)]
            
    return output

In [16]:
#We'll run our grid search three times to see if one comes out on top consistently.

In [17]:
myGridSearch(Xdf,Ydf,[3,5,7],[3,5,7])

[[3, 3, 0.7943788778834417],
 [3, 5, 0.7338794235002013],
 [3, 7, 0.7658991157558407],
 [5, 3, 0.7834670076088841],
 [5, 5, 0.7633071431605913],
 [5, 7, 0.8011946694255541],
 [7, 3, 0.7900075034171143],
 [7, 5, 0.7868963010424112],
 [7, 7, 0.7497906403940886]]

In [18]:
myGridSearch(Xdf,Ydf,[3,5,7],[3,5,7])

[[3, 3, 0.7880995163531315],
 [3, 5, 0.7097583708262241],
 [3, 7, 0.7525971400164949],
 [5, 3, 0.7498981099164309],
 [5, 5, 0.7658084843561517],
 [5, 7, 0.8155179228567533],
 [7, 3, 0.7419556996393577],
 [7, 5, 0.832059675225693],
 [7, 7, 0.8127521236400383]]

In [19]:
myGridSearch(Xdf,Ydf,[3,5,7],[3,5,7])

[[3, 3, 0.7409332238642584],
 [3, 5, 0.8217372346477546],
 [3, 7, 0.7555031611178538],
 [5, 3, 0.7925694617971855],
 [5, 5, 0.7545959008717629],
 [5, 7, 0.7773345844669374],
 [7, 3, 0.7727429052429052],
 [7, 5, 0.7923869855096155],
 [7, 7, 0.8063446081116858]]

In [20]:
# It looks like we should choose k=7 and p=5.

## Now let's do a test run for the in-class competition

In [21]:
def myEval(name,Xdf,Ydf,k,p):
    dfTest = pd.read_csv(name).replace({'ca': '?', 'thal': '?'},{'ca': '0.0', 'thal': '3.0'}).astype(float, errors='ignore') # read the data
    dfTest = dfTest.drop(columns=['Unnamed: 0']) # not even sure what this column is doing here.
    dfTest.iloc[-1] = df.iloc[0] # this makes it so the thal value of 6.0 is included.
    df2 = pd.get_dummies(dfTest, columns=['sex','cp','fbs','exang','thal']) # one-hot encode.
    df2 = df2.drop([df2.shape[0]-1]) # take out what was artificially inserted.
    Xdftest = df2.drop(columns=['disease']) # inputs
    #return Xdf
    #return Xdftest
    Ydftest = df2['disease'] # outputs
    #Xtestp = Xdftest.to_numpy() # input to numpy
    #ytestp = Ydftest.to_numpy() # outputs to numpy
    myscaler = pre.StandardScaler(copy=False) # prepare to standardize. We'll need to standardize the train and test
    bigXdf = pd.concat([Xdf,Xdftest]) # temporarily merge the two for standardization.
    #return bigXdf
    bigX = bigXdf.to_numpy()
    #return bigX
    myscaler.fit_transform(bigX)
    #return bigX.shape
    Xtrain = bigX[:Xdf.shape[0]]
    Xtest = bigX[Xdf.shape[0]:]
    Ytrain = Ydf.to_numpy()
    Ytest = Ydftest.to_numpy()
    
    pca = PCA(n_components=p)
    pca.fit(Xtrain)
    Xtr = pca.transform(Xtrain) # transform the training data
    Xte = pca.transform(Xtest) # transform the test data for later use.
    
    knn = NearestNeighbors(n_neighbors=k)
    knn.fit(Xtr, Ytrain)
    dists, neighs = knn.kneighbors(Xte) # find the test sets' nearest neighbors in the training set.
    preds = kPredict(neighs,Ytrain)
    F1Score = f1_score(preds,Ytest)
    
    return F1Score

In [22]:
myscore = myEval("cleveland-test-sample.csv",Xdf,Ydf,7,5)
myscore

0.918918918918919

## Part 2

In [23]:
Xl, yl = load_osuleaf(return_X_y=True)

In [24]:
Xl2=Xl.to_numpy()

In [25]:
Xl_train, Xl_test, yl_train, yl_test = train_test_split(Xl2, yl, test_size=0.25, random_state=42)

In [26]:
Xl_train = np.array([list(Xl_train[i][0]) for i in range(len(Xl_train))])
Xl_test = np.array([list(Xl_test[i][0]) for i in range(len(Xl_test))])

In [27]:
Lknn = KNeighborsClassifier(n_neighbors=5, metric=dtw)

In [28]:
Lknn.fit(Xl_train, yl_train)

KNeighborsClassifier(metric=<function dtw at 0x7f1b67fe8040>)

In [29]:
Lknn.score(Xl_test, yl_test)

0.6486486486486487

## Results of in-class competition

In [31]:
myscore2 = myEval("cleveland-challenge.csv",Xdf,Ydf,7,5)
myscore2

0.8205128205128205