# Parkinson’s data cross-validation

In [53]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [54]:
# into a dataframe object
df = pd.DataFrame(pd.read_csv("dataset1.csv")) 

In [55]:
df.head()

Unnamed: 0,rand1,rand2,rand3,class
0,0.322113,9.937013,-1.0134,classA
1,0.859447,2.423768,-1.312457,classB
2,0.086464,9.181621,-1.088611,classA
3,0.502293,2.817131,-1.081987,classB
4,0.93065,4.359159,-1.676343,classB


In [56]:
#creating labelEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
label_encoded=le.fit_transform(df['class'])
df['class']= (label_encoded)

In [57]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Kappa Score: ",cohen_kappa_score(y_test, y_pred))
print("% Correct: ",accuracy_score(y_test, y_pred))
print("Precision Score: ",precision_score(y_test, y_pred))
print("Recall Score:",recall_score(y_test, y_pred))

Kappa Score:  0.12395709177592373
% Correct:  0.6
Precision Score:  0.7014925373134329
Recall Score: 0.6811594202898551


In [58]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(min_samples_split = 10)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Kappa Score: ",cohen_kappa_score(y_test, y_pred))
print("% Correct: ",accuracy_score(y_test, y_pred))
print("Precision Score: ",precision_score(y_test, y_pred))
print("Recall Score:",recall_score(y_test, y_pred))

Kappa Score:  0.12797281993204979
% Correct:  0.580952380952381
Precision Score:  0.711864406779661
Recall Score: 0.6086956521739131


The new performance is certainly worse than previous one when we compare the Kappa score, recall and the accuracy

In [59]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
model2 = DecisionTreeClassifier()
kf = KFold(n_splits=4, random_state=1, shuffle=True)
count=0
for train_index, test_index in kf.split(X):
    count=count+1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model2.fit(X_train, y_train)
    y_pred=model2.predict(X_test)
    print("Fold:",count)
    print("Kappa Score: ",cohen_kappa_score(y_test, y_pred))
    print("% Correct: ",accuracy_score(y_test, y_pred))
    print("Precision Score: ",precision_score(y_test, y_pred))
    print("Recall Score:",recall_score(y_test, y_pred))
    print("--")

Fold: 1
Kappa Score:  -0.05660377358490565
% Correct:  0.5227272727272727
Precision Score:  0.6727272727272727
Recall Score: 0.6065573770491803
--
Fold: 2
Kappa Score:  -0.07013574660633481
% Correct:  0.5113636363636364
Precision Score:  0.5692307692307692
Recall Score: 0.7115384615384616
--
Fold: 3
Kappa Score:  -0.23166023166023164
% Correct:  0.4942528735632184
Precision Score:  0.6666666666666666
Recall Score: 0.625
--
Fold: 4
Kappa Score:  0.046222222222222276
% Correct:  0.5747126436781609
Precision Score:  0.7407407407407407
Recall Score: 0.6349206349206349
--


4 fold cross validation is giving the worst performance as compared to the previous once, hence the model performance decreased significantly.If I want to improve the accuracy, there are several ways to try to improve, for example:
a) get more data/better data
b) try other classifiers - svm, random forest, etc
c) try which combinations of features seems to work best
d) generate better features

2: Cross-validating at the group level

In [60]:
# into a dataframe object 
df2 = pd.DataFrame(pd.read_csv("dataset2.csv")) 
#creating labelEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
label_encoded=le.fit_transform(df2['class'])
df2['class']= (label_encoded)

In [61]:
df2 = df2.drop(columns=['group_id'])

In [62]:
df2.head()

Unnamed: 0,rand1,rand2,class
0,0.407873,1.03736,1
1,0.763715,-1.033221,0
2,0.85457,5.388808,1
3,0.553711,-3.11861,0
4,0.336519,-1.849318,0


In [63]:
df2.head()

Unnamed: 0,rand1,rand2,class
0,0.407873,1.03736,1
1,0.763715,-1.033221,0
2,0.85457,5.388808,1
3,0.553711,-3.11861,0
4,0.336519,-1.849318,0


In [64]:
X = df2.iloc[:, :-1].values
y = df2.iloc[:, -1].values
model2 = DecisionTreeClassifier()
kf = KFold(n_splits=4, random_state=1, shuffle=True)
count=0
for train_index, test_index in kf.split(X):
    count=count+1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model2.fit(X_train, y_train)
    y_pred=model2.predict(X_test)
    print("Fold:",count)
    print("Kappa Score: ",cohen_kappa_score(y_test, y_pred))
    print("% Correct: ",accuracy_score(y_test, y_pred))
    print("Precision Score: ",precision_score(y_test, y_pred))
    print("Recall Score:",recall_score(y_test, y_pred))
    print("--")

Fold: 1
Kappa Score:  0.8407643312101911
% Correct:  0.92
Precision Score:  1.0
Recall Score: 0.8571428571428571
--
Fold: 2
Kappa Score:  0.13793103448275856
% Correct:  0.6
Precision Score:  0.5
Recall Score: 0.4
--
Fold: 3
Kappa Score:  0.6688741721854304
% Correct:  0.84
Precision Score:  0.8125
Recall Score: 0.9285714285714286
--
Fold: 4
Kappa Score:  0.9201277955271565
% Correct:  0.96
Precision Score:  0.9230769230769231
Recall Score: 1.0
--


In [65]:
# into a dataframe object 
df2_1 = pd.DataFrame(pd.read_csv("dataset2.csv")) 
#creating labelEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
label_encoded=le.fit_transform(df2_1['class'])
df2_1['class']= (label_encoded)

In [66]:
df2_1.head()

Unnamed: 0,group_id,rand1,rand2,class
0,group6,0.407873,1.03736,1
1,group3,0.763715,-1.033221,0
2,group10,0.85457,5.388808,1
3,group1,0.553711,-3.11861,0
4,group3,0.336519,-1.849318,0


In [67]:
array = ['group1','group2','group3','group4','group5','group6','group7']
dd_train=df2_1.loc[df2_1['group_id'].isin(array)]
dd_train= dd_train.drop(columns=['group_id'])

In [68]:
dd_train.head()

Unnamed: 0,rand1,rand2,class
0,0.407873,1.03736,1
1,0.763715,-1.033221,0
3,0.553711,-3.11861,0
4,0.336519,-1.849318,0
5,0.664234,-0.311497,1


In [69]:
array2 = ['group8','group9','group10']
dd_test=df2_1.loc[df2_1['group_id'].isin(array2)]

In [73]:
dd_test.head()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
label_encoded=le.fit_transform(dd_test['group_id'])
dd_test['group_id']= (label_encoded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [71]:
X_train = dd_train.iloc[:, :-1].values
y_train = dd_train.iloc[:, -1].values
X_test = dd_test.iloc[:, :-1].values
y_test = dd_test.iloc[:, -1].values
model3 = DecisionTreeClassifier()
model3.fit(X_train, y_train)
y_pred=model3.predict(X_test)
print("Fold:",count)
print("Kappa Score: ",cohen_kappa_score(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("% Correct: ",accuracy_score(y_test, y_pred))
print("Precision Score: ",precision_score(y_test, y_pred))
print("Recall Score:",recall_score(y_test, y_pred))
print("--")

ValueError: Number of features of the model must match the input. Model n_features is 2 and input n_features is 3 

In [72]:
X_test = dd_test.iloc[:, :-1].values
X_test

array([[0.        , 0.85456989, 5.38880777],
       [1.        , 0.642096  , 3.15798817],
       [0.        , 0.60294252, 5.67269928],
       [0.        , 0.92659472, 5.96861805],
       [1.        , 0.38807233, 3.14457666],
       [1.        , 0.99442889, 3.08109685],
       [2.        , 0.19467343, 4.05246898],
       [1.        , 0.01207719, 3.33648443],
       [2.        , 0.03529576, 4.251166  ],
       [2.        , 0.62249701, 4.65159461],
       [0.        , 0.9135455 , 5.97148754],
       [2.        , 0.93126139, 4.90587903],
       [2.        , 0.81484056, 4.67791757],
       [1.        , 0.9499174 , 3.43101084],
       [0.        , 0.62287883, 5.13638527],
       [2.        , 0.01236369, 4.98844646],
       [0.        , 0.91108338, 5.16840087],
       [0.        , 0.95754235, 5.3831428 ],
       [1.        , 0.52971316, 3.63669159],
       [1.        , 0.52217852, 3.74414188],
       [1.        , 0.70986259, 3.85125327],
       [1.        , 0.86772212, 3.87220198],
       [0.

The feature space for the train and text is different, in pyhton the feature space needs to be same. The test feature spacfe can be less in dimensions and theere are a few way to balance it to get appropriate to match the dimension size of the train feature set. But unforuntaley is not the other way around, I have researched but I filaed to find a solution. 

3: Custom cross-validation on Parkinson’s data

In [130]:
f=open("parkinsons.names", "r")
print(f.read())

Title: Parkinsons Disease Data Set

Abstract: Oxford Parkinson's Disease Detection Dataset

-----------------------------------------------------	

Data Set Characteristics: Multivariate
Number of Instances: 197
Area: Life
Attribute Characteristics: Real
Number of Attributes: 23
Date Donated: 2008-06-26
Associated Tasks: Classification
Missing Values? N/A

-----------------------------------------------------	

Source:

The dataset was created by Max Little of the University of Oxford, in 
collaboration with the National Centre for Voice and Speech, Denver, 
Colorado, who recorded the speech signals. The original study published the 
feature extraction methods for general voice disorders.

-----------------------------------------------------

Data Set Information:

This dataset is composed of a range of biomedical voice measurements from 
31 people, 23 with Parkinson's disease (PD). Each column in the table is a 
particular voice measure, and each row corresponds one of 195 voice 
rec

In [4]:
dataset = pd.read_csv("parkinsons.data")

In [5]:
y=dataset['status']
cols=['MDVP:RAP','Jitter:DDP','DFA','NHR','MDVP:Fhi(Hz)','name','status']
x=dataset.drop(cols,axis=1)

I have removed all the above columns because after reading through the metadeta I have drop irrelavant column values from our dataset so that we can get better accuracy

In [10]:
x.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:PPQ,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,HNR,RPDE,spread1,spread2,D2,PPE
0,119.992,74.997,0.00784,7e-05,0.00554,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,21.033,0.414783,-4.813031,0.266482,2.301442,0.284654
1,122.4,113.819,0.00968,8e-05,0.00696,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,19.085,0.458359,-4.075192,0.33559,2.486855,0.368674
2,116.682,111.555,0.0105,9e-05,0.00781,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,20.651,0.429895,-4.443179,0.311173,2.342259,0.332634
3,116.676,111.366,0.00997,9e-05,0.00698,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,20.644,0.434969,-4.117501,0.334147,2.405554,0.368975
4,116.014,110.655,0.01284,0.00011,0.00908,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,19.649,0.417356,-3.747787,0.234513,2.33218,0.410335


Actually there is no straight answer to the choice of K in k-fold cross validation. An higher k will give you more but smaller subsets on which run testing. An adopted choice is to select the K that gives you a testing set with the size of 15% of your total dataset.

In [44]:
t_size=0.80
test=0.20
seed=245
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=t_size,test_size=test,random_state=seed)

leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
knn_2 = KNeighborsClassifier()
clf = GridSearchCV(knn_2, hyperparameters, cv=10)
best_model = clf.fit(x, y)
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])




Best leaf_size: 1
Best p: 2
Best n_neighbors: 1


In [51]:
n=1
models=[]
models.append(('KNN',KNeighborsClassifier(n_neighbors=n)))
models.append(("Decision_Tree",DecisionTreeClassifier()))
models.append(('Naive Bayes',GaussianNB()))
names=[]
predictions=[]
for name,model in models:
    fold=KFold(n_splits=6,random_state=0)
    result=cross_val_score(model,x_train,y_train,cv=fold)
    predictions.append(result)
    names.append(name)
    msg="%s : %f (%f)"%(name,result.mean(),result.std())
    print(msg)

KNN : 0.891026 (0.068142)
Decision_Tree : 0.865385 (0.053248)
Naive Bayes : 0.717949 (0.079031)


After comparing KNN, Decision tree and Naive Bayes, I got to the conclusion that the best result is given by knn, hence I will be using KNN to do compare it to 4-fold cross-validation

In [52]:
n=1
models=[]
models.append(('KNN',KNeighborsClassifier(n_neighbors=n)))
names=[]
predictions=[]
for name,model in models:
    fold=KFold(n_splits=4,random_state=0)
    result=cross_val_score(model,x_train,y_train,cv=fold)
    predictions.append(result)
    names.append(name)
    msg="%s : %f (%f)"%(name,result.mean(),result.std())
    print(msg)

KNN : 0.903846 (0.052470)


The 4-fold cross-validation is better as comapred to the previously used 6 fold cross validation