#  Importing Required functions

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
Folds = 10
Alpha = 0.01

## Reading the Data File

In [2]:
data = pd.read_csv("C:/Users/STSC/Desktop/HW1.csv")
data

Unnamed: 0,NB,DT,NN
0,0.6809,0.7524,0.7164
1,0.7017,0.8694,0.8883
2,0.7012,0.6803,0.841
3,0.6913,0.9102,0.6825
4,0.6333,0.7758,0.7599
5,0.6415,0.8154,0.8479
6,0.7216,0.6224,0.7012
7,0.7214,0.7585,0.4959
8,0.6578,0.938,0.9279
9,0.7865,0.7524,0.7455


## NB - Naive Bayes, DT - Decision Tree, NN- Nearest Neighbor

In [3]:
for i,val in enumerate(data['NB']):
    data.at[i,'NB'] = 1 - val
for i,val in enumerate(data['DT']):
    data.at[i,'DT'] = 1 - val
for i,val in enumerate(data['NN']):
    data.at[i,'NN'] = 1 - val
data

Unnamed: 0,NB,DT,NN
0,0.3191,0.2476,0.2836
1,0.2983,0.1306,0.1117
2,0.2988,0.3197,0.159
3,0.3087,0.0898,0.3175
4,0.3667,0.2242,0.2401
5,0.3585,0.1846,0.1521
6,0.2784,0.3776,0.2988
7,0.2786,0.2415,0.5041
8,0.3422,0.062,0.0721
9,0.2135,0.2476,0.2545


## Q1: Use ANOVA to determine if the three classifiers have equal error rates.

In [4]:
d_melt = pd.melt(data.reset_index(), id_vars=['index'], value_vars=['NB','DT','NN'])
d_melt.columns = ['index','BetweenGroups','value']
model = ols('value ~ BetweenGroups', data=d_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
BetweenGroups,0.046635,2.0,2.561818,0.095781
Residual,0.245751,27.0,,


##### From the table the P value is .096, which is greater than 0.05. So we accept H0(Null hypothesis) and say the three classifiers have equal errors

## Q2a) Use Cross-Validated Paired t-test to determine if NB and DecTree have equal errors

In [5]:
data[['NB','DT']].describe()

Unnamed: 0,NB,DT
count,10.0,10.0
mean,0.30628,0.21252
std,0.044857,0.098536
min,0.2135,0.062
25%,0.283525,0.1441
50%,0.30375,0.23285
75%,0.336425,0.2476
max,0.3667,0.3776


In [6]:
val = stats.ttest_rel(data['NB'], data['DT'])
print(val)
print('Pvalue: ',val.pvalue)
if val.pvalue < 0.5:
    print("H0: Null Hypothesis Rejected")
else:
    print("H0: Null Hypothesis Accepted")

Ttest_relResult(statistic=2.41980164672186, pvalue=0.03862319304217036)
Pvalue:  0.03862319304217036
H0: Null Hypothesis Rejected


#####  From the result the P value is .039, which is lesser than 0.05. So we reject H0(Null hypothesis) and say there is significant difference between error rates of NB and DT

## Q2b) Use Cross-Validated Paired t-test to determine if DecTree and Knearest Neighbor have equal errors

In [7]:
data[['DT','NN']].describe()

Unnamed: 0,DT,NN
count,10.0,10.0
mean,0.21252,0.23935
std,0.098536,0.124837
min,0.062,0.0721
25%,0.1441,0.153825
50%,0.23285,0.2473
75%,0.2476,0.295
max,0.3776,0.5041


In [8]:
val = stats.ttest_rel(data['DT'], data['NN'])
print(val)
print('Pvalue: ',val.pvalue)
if val.pvalue < 0.5:
    print("H0: Null Hypothesis Rejected")
else:
    print("H0: Null Hypothesis Accepted")

Ttest_relResult(statistic=-0.6599461576035529, pvalue=0.525815214488946)
Pvalue:  0.525815214488946
H0: Null Hypothesis Accepted


##### From the result the P value is .052, which is greater than 0.05. So we accept H0(Null hypothesis) and say there is no significant difference between error rates of DT and NN

## Storing NB,DT,NN values in arrays to perform easy operations later

In [9]:
nb = []
dt = []
nn = []
for i in data.NB:
    nb.append(i)
for i in data.DT:
    dt.append(i)
for i in data.NN:
    nn.append(i)

## Q3)  For each classifier (Naive Bayes, Decision Tree, Knearest Neighbor), determine if the error of the classifier less than p0 (=0.1, 0.2, 0.3) with level of significance (alpha) (=0.01 or 0.025) 

### For Naive Bayes with p0 = 0.1,0.2,0.3 and alpha = 0.025

In [10]:
mean = np.mean(nb)
stdev = np.std(nb)
p0 = 0.1
while p0 < 0.4:
    t1 = np.sqrt(Folds)*(mean-p0)/stdev
    f = Folds - 1
    t = stats.t.ppf(Alpha,f)
    print("For p0: ",round(p0,1))
    print("T-value:",t1)
    print("T-critical:",t)
    if t1 > t:
        print('Error of classifier is less than P0')
        p0 += 0.1
    else:
        print('Error of classifier is greater than P0')
        p0 += 0.1

For p0:  0.1
T-value: 15.328778125485574
T-critical: -2.8214379233005498
Error of classifier is less than P0
For p0:  0.2
T-value: 7.897724157342479
T-critical: -2.8214379233005498
Error of classifier is less than P0
For p0:  0.3
T-value: 0.46667018919938275
T-critical: -2.8214379233005498
Error of classifier is less than P0


### For Decision Tree with p0 = 0.1,0.2,0.3 and alpha = 0.025

In [11]:
mean = np.mean(dt)
stdev = np.std(dt)
p0 = 0.1
while p0 < 0.4:
    t1 = np.sqrt(Folds)*(mean-p0)/stdev
    f = Folds - 1
    t = stats.t.ppf(Alpha,f)
    print("For p0: ",round(p0,1))
    print("T-value:",t1)
    print("T-critical:",t)
    if t1 > t:
        print('Error of classifier is less than P0')
        p0 += 0.1
    else:
        print('Error of classifier is greater than P0')
        p0 += 0.1

For p0:  0.1
T-value: 3.806409099448852
T-critical: -2.8214379233005498
Error of classifier is less than P0
For p0:  0.2
T-value: 0.42353574409082595
T-critical: -2.8214379233005498
Error of classifier is less than P0
For p0:  0.3
T-value: -2.9593376112672014
T-critical: -2.8214379233005498
Error of classifier is greater than P0


### For Nearest Neighborwith p0 = 0.1,0.2,0.3 and alpha = 0.01

In [12]:
mean = np.mean(nn)
stdev = np.std(nn)
p0 = 0.1
while p0 < 0.4:
    t1 = np.sqrt(Folds)*(mean-p0)/stdev
    f = Folds - 1
    t = stats.t.ppf(Alpha,f)
    print("For p0: ",round(p0,1))
    print("T-value:",t1)
    print("T-critical:",t)
    if t1 > t:
        print('Error of classifier is less than P0')
        p0 += 0.1
    else:
        print('Error of classifier is greater than P0')
        p0 += 0.1

For p0:  0.1
T-value: 3.7208557596586833
T-critical: -2.8214379233005498
Error of classifier is less than P0
For p0:  0.2
T-value: 1.050704514837239
T-critical: -2.8214379233005498
Error of classifier is less than P0
For p0:  0.3
T-value: -1.6194467299842061
T-critical: -2.8214379233005498
Error of classifier is less than P0
