In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics
from io import StringIO
import graphviz
from sklearn import naive_bayes

### Load Data

In [2]:
file = pd.read_csv('house-votes-84.data', header=None)
print(file.head())
col_names = "Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa"


file.to_csv('house-votes-84-data.csv', header=col_names.split(","), index=False)

           0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16
0  republican  n  y  n  y  y  y  n  n  n  y  ?  y  y  y  n  y
1  republican  n  y  n  y  y  y  n  n  n  n  n  y  y  y  n  ?
2    democrat  ?  y  y  ?  y  y  n  n  n  n  y  n  y  y  n  n
3    democrat  n  y  y  n  ?  y  n  n  n  n  y  n  y  n  n  y
4    democrat  y  y  y  n  y  y  n  n  n  n  y  ?  y  y  y  y


In [3]:
df = pd.read_csv('house-votes-84-data.csv')
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


### Understanding Data

In [4]:
df.columns, df.shape

(Index(['Class Name', 'handicapped-infants', 'water-project-cost-sharing',
        'adoption-of-the-budget-resolution', 'physician-fee-freeze',
        'el-salvador-aid', 'religious-groups-in-schools',
        'anti-satellite-test-ban', 'aid-to-nicaraguan-contras', 'mx-missile',
        'immigration', 'fuels-corporation-cutback', 'education-spending',
        'superfund-right-to-sue', 'crime', 'duty-free-exports',
        'export-administration-act-south-africa'],
       dtype='object'),
 (435, 17))

In [5]:
df.isnull().sum()

Class Name                                0
handicapped-infants                       0
water-project-cost-sharing                0
adoption-of-the-budget-resolution         0
physician-fee-freeze                      0
el-salvador-aid                           0
religious-groups-in-schools               0
anti-satellite-test-ban                   0
aid-to-nicaraguan-contras                 0
mx-missile                                0
immigration                               0
fuels-corporation-cutback                 0
education-spending                        0
superfund-right-to-sue                    0
crime                                     0
duty-free-exports                         0
export-administration-act-south-africa    0
dtype: int64

In [6]:
df.dtypes

Class Name                                object
handicapped-infants                       object
water-project-cost-sharing                object
adoption-of-the-budget-resolution         object
physician-fee-freeze                      object
el-salvador-aid                           object
religious-groups-in-schools               object
anti-satellite-test-ban                   object
aid-to-nicaraguan-contras                 object
mx-missile                                object
immigration                               object
fuels-corporation-cutback                 object
education-spending                        object
superfund-right-to-sue                    object
crime                                     object
duty-free-exports                         object
export-administration-act-south-africa    object
dtype: object

In [7]:
df_org = df.copy()

In [8]:
df.isna().sum()

Class Name                                0
handicapped-infants                       0
water-project-cost-sharing                0
adoption-of-the-budget-resolution         0
physician-fee-freeze                      0
el-salvador-aid                           0
religious-groups-in-schools               0
anti-satellite-test-ban                   0
aid-to-nicaraguan-contras                 0
mx-missile                                0
immigration                               0
fuels-corporation-cutback                 0
education-spending                        0
superfund-right-to-sue                    0
crime                                     0
duty-free-exports                         0
export-administration-act-south-africa    0
dtype: int64

### Preprocessing Data

In [9]:
df.replace(('n', 'y'), (-1, 1), inplace=True)
df.replace(('democrat', 'republican'), (1, 0), inplace=True)

In [10]:
df.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0,-1,1,-1,1,1,1,-1,-1,-1,1,?,1,1,1,-1,1
1,0,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,?
2,1,?,1,1,?,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1
3,1,-1,1,1,-1,?,1,-1,-1,-1,-1,1,-1,1,-1,-1,1
4,1,1,1,1,-1,1,1,-1,-1,-1,-1,1,?,1,1,1,1


#### i) discard instances that have missing feature values

In [11]:
df1 = df.copy()
df1 = df1[~df1.isin(['?']).any(axis=1)]

In [12]:
df1.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
5,1,-1,1,1,-1,1,1,-1,-1,-1,-1,-1,-1,1,1,1,1
8,0,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,1
19,1,1,1,1,-1,-1,-1,1,1,1,-1,1,-1,-1,-1,1,1
23,1,1,1,1,-1,-1,-1,1,1,1,-1,-1,-1,-1,-1,1,1
25,1,1,-1,1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1,1,1


#### ii) treat “missing” as if it is a value (and thus a binary feature becomes a ternary, or three-valued, feature) : replaced '?' values with 0, 'n' with -1 and 'y' with 1

In [13]:
df2 = df.copy()
df2.replace(('?'), (0), inplace=True)

In [14]:
df2.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0,-1,1,-1,1,1,1,-1,-1,-1,1,0,1,1,1,-1,1
1,0,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,0
2,1,0,1,1,0,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1
3,1,-1,1,1,-1,0,1,-1,-1,-1,-1,1,-1,1,-1,-1,1
4,1,1,1,1,-1,1,1,-1,-1,-1,-1,1,0,1,1,1,1


In [15]:
df2.dtypes

Class Name                                int64
handicapped-infants                       int64
water-project-cost-sharing                int64
adoption-of-the-budget-resolution         int64
physician-fee-freeze                      int64
el-salvador-aid                           int64
religious-groups-in-schools               int64
anti-satellite-test-ban                   int64
aid-to-nicaraguan-contras                 int64
mx-missile                                int64
immigration                               int64
fuels-corporation-cutback                 int64
education-spending                        int64
superfund-right-to-sue                    int64
crime                                     int64
duty-free-exports                         int64
export-administration-act-south-africa    int64
dtype: object

In [16]:
df2.describe().round(2)

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
count,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0
mean,0.61,-0.11,0.01,0.19,-0.16,0.01,0.28,0.13,0.15,0.0,0.01,-0.26,-0.14,0.02,0.18,-0.14,0.48
std,0.49,0.98,0.94,0.97,0.98,0.98,0.95,0.98,0.97,0.98,0.99,0.94,0.95,0.97,0.96,0.96,0.73
min,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,1.0,-1.0,0.0,1.0,-1.0,0.0,1.0,1.0,1.0,0.0,0.0,-1.0,-1.0,0.0,1.0,-1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
df2.corr().round(2)

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
Class Name,1.0,0.41,-0.0,0.74,-0.91,-0.71,-0.43,0.51,0.65,0.62,-0.08,0.37,-0.69,-0.54,-0.61,0.52,0.22
handicapped-infants,0.41,1.0,0.02,0.4,-0.42,-0.37,-0.4,0.36,0.4,0.34,-0.09,0.11,-0.41,-0.35,-0.37,0.2,0.12
water-project-cost-sharing,-0.0,0.02,1.0,-0.05,0.08,0.13,0.15,-0.2,-0.1,-0.19,-0.12,0.19,-0.02,0.22,-0.02,-0.11,-0.09
adoption-of-the-budget-resolution,0.74,0.4,-0.05,1.0,-0.73,-0.65,-0.43,0.58,0.7,0.6,0.02,0.22,-0.65,-0.53,-0.59,0.48,0.31
physician-fee-freeze,-0.91,-0.42,0.08,-0.73,1.0,0.75,0.48,-0.58,-0.69,-0.64,0.04,-0.28,0.69,0.59,0.65,-0.54,-0.27
el-salvador-aid,-0.71,-0.37,0.13,-0.65,0.75,1.0,0.62,-0.69,-0.83,-0.78,0.01,-0.15,0.63,0.65,0.7,-0.56,-0.27
religious-groups-in-schools,-0.43,-0.4,0.15,-0.43,0.48,0.62,1.0,-0.54,-0.55,-0.57,0.09,0.01,0.51,0.55,0.58,-0.45,-0.2
anti-satellite-test-ban,0.51,0.36,-0.2,0.58,-0.58,-0.69,-0.54,1.0,0.72,0.66,0.03,0.04,-0.54,-0.57,-0.52,0.48,0.35
aid-to-nicaraguan-contras,0.65,0.4,-0.1,0.7,-0.69,-0.83,-0.55,0.72,1.0,0.74,0.03,0.14,-0.62,-0.61,-0.62,0.54,0.32
mx-missile,0.62,0.34,-0.19,0.6,-0.64,-0.78,-0.57,0.66,0.74,1.0,0.03,0.04,-0.59,-0.53,-0.56,0.48,0.29


#### iii) impute missing values (i.e., for each feature, replace missing values with the most common value for that feature), so that they are no longer missing or unknown: replaced '?' with mode of each column

In [18]:
df3 = df.copy()
for col in df3.columns:
    df3[col].replace(("?"), df3[col].mode()[0], inplace=True)

In [19]:
df3.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,fuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,0,-1,1,-1,1,1,1,-1,-1,-1,1,-1,1,1,1,-1,1
1,0,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,1
2,1,-1,1,1,-1,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1
3,1,-1,1,1,-1,1,1,-1,-1,-1,-1,1,-1,1,-1,-1,1
4,1,1,1,1,-1,1,1,-1,-1,-1,-1,1,-1,1,1,1,1


In [20]:
df3.shape

(435, 17)

## DecisionTree Classifier

Implement a decision tree classifier for classification, with each of the above three ways of dealing with missing values  
### i

In [21]:
X = df1.drop(['Class Name'], axis=1)
y = df1['Class Name']
tree_one_enp = tree.DecisionTreeClassifier(criterion='entropy')
tree_one_gini = tree.DecisionTreeClassifier(criterion='gini')

print(tree_one_enp)
print(tree_one_gini)

DecisionTreeClassifier(criterion='entropy')
DecisionTreeClassifier()


In [22]:
tree_one_enp.fit(X, y)
tree_one_gini.fit(X, y)

DecisionTreeClassifier()

In [23]:
#predict
y_pred_enp = tree_one_enp.predict(X)
y_pred_gini = tree_one_gini.predict(X)
print(tree_one_enp.predict(X[-2:]))
print(tree_one_gini.predict(X[-2:]))
print(y[-2:])

[0 1]
[0 1]
430    0
431    1
Name: Class Name, dtype: int64


#### performing 5-fold cross validation and report precision, recall, and F1-scores for the first scenario

In [24]:
# k-fold on entropy crietrion
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df1):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    tree_one_enp.fit(X_train, y_train)
    y_pred = tree_one_enp.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [47 48 49 50 51] TEST: [0 1 2 3 4]
precision=0.9608510638297871
recall=   0.9574468085106383
f1=       0.95740826395313

TRAIN: [0 1 2 3 4] TEST: [47 48 49 50 51]
precision=0.961499493414387
recall=   0.9574468085106383
f1=       0.9577226162332546

TRAIN: [0 1 2 3 4] TEST: [94 95 96 97 98]
precision=0.9790104947526237
recall=   0.9782608695652174
f1=       0.978141004685627

TRAIN: [0 1 2 3 4] TEST: [140 141 142 143 144]
precision=0.9359834368530021
recall=   0.9347826086956522
f1=       0.9349385565466907

TRAIN: [0 1 2 3 4] TEST: [186 187 188 189 190]
precision=0.8922495274102079
recall=   0.8913043478260869
f1=       0.8913557405694317



In [25]:
tree_one_enp_scores = model_selection.cross_validate(tree_one_enp, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
tree_one_enp_scores

{'fit_time': array([0.00177121, 0.00195217, 0.0015831 , 0.00124574, 0.00110793]),
 'score_time': array([0.00277996, 0.002913  , 0.00262189, 0.00192308, 0.00202608]),
 'test_precision_weighted': array([0.96099291, 1.        , 0.97909699, 0.93536789, 0.89224953]),
 'test_recall_weighted': array([0.95744681, 1.        , 0.97826087, 0.93478261, 0.89130435]),
 'test_f1_weighted': array([0.95748535, 1.        , 0.97820889, 0.93462666, 0.89135574])}

In [26]:
for i in tree_one_enp_scores:
    tree_one_enp_scores_ms = {
        i : (f'mean= {np.mean(tree_one_enp_scores[i]).round(3)}, std = {np.std(tree_one_enp_scores[i]).round(4)}')
    }
    print(tree_one_enp_scores_ms)
    

{'fit_time': 'mean= 0.002, std = 0.0003'}
{'score_time': 'mean= 0.002, std = 0.0004'}
{'test_precision_weighted': 'mean= 0.954, std = 0.0373'}
{'test_recall_weighted': 'mean= 0.952, std = 0.0374'}
{'test_f1_weighted': 'mean= 0.952, std = 0.0374'}


In [27]:
# k-fold on gini crietrion
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df1):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    tree_one_gini.fit(X_train, y_train)
    y_pred = tree_one_gini.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [47 48 49 50 51] TEST: [0 1 2 3 4]
precision=0.9608510638297871
recall=   0.9574468085106383
f1=       0.95740826395313

TRAIN: [0 1 2 3 4] TEST: [47 48 49 50 51]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 1 2 3 4] TEST: [94 95 96 97 98]
precision=0.9790104947526237
recall=   0.9782608695652174
f1=       0.978141004685627

TRAIN: [0 1 2 3 4] TEST: [140 141 142 143 144]
precision=0.9604743083003953
recall=   0.9565217391304348
f1=       0.956687370600414

TRAIN: [0 1 2 3 4] TEST: [186 187 188 189 190]
precision=0.8922495274102079
recall=   0.8913043478260869
f1=       0.8913557405694317



In [28]:
tree_one_gini_scores = model_selection.cross_validate(tree_one_gini, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
tree_one_gini_scores

{'fit_time': array([0.00272107, 0.00142622, 0.00146818, 0.00122499, 0.00186181]),
 'score_time': array([0.00270796, 0.00233698, 0.00225687, 0.00195503, 0.00288296]),
 'test_precision_weighted': array([0.96099291, 1.        , 0.97909699, 0.93536789, 0.89224953]),
 'test_recall_weighted': array([0.95744681, 1.        , 0.97826087, 0.93478261, 0.89130435]),
 'test_f1_weighted': array([0.95748535, 1.        , 0.97820889, 0.93462666, 0.89135574])}

In [29]:
for i in tree_one_gini_scores:
    tree_one_gini_scores_ms = {
        i : (f'mean= {np.mean(tree_one_gini_scores[i]).round(3)}, std = {np.std(tree_one_gini_scores[i]).round(4)}')
    }
    print(tree_one_gini_scores_ms)

{'fit_time': 'mean= 0.002, std = 0.0005'}
{'score_time': 'mean= 0.002, std = 0.0003'}
{'test_precision_weighted': 'mean= 0.954, std = 0.0373'}
{'test_recall_weighted': 'mean= 0.952, std = 0.0374'}
{'test_f1_weighted': 'mean= 0.952, std = 0.0374'}


It is noticed that the accuracy scores with both gini and entropy as our criterions are the same on this model

## Naïve Bayes Classifier

Implement a decision Naïve Bayes classifier for classification, with each of the above three ways of dealing with missing values  
### i

In [30]:
bayes_one_bernoulli = naive_bayes.BernoulliNB()

In [31]:
bayes_one_bernoulli.fit(X, y)

BernoulliNB()

In [32]:
# print("predicted:", bayes_one_bernoulli.predict(X[-2:]))
# print("truth", y[-2:])

In [33]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df1):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bayes_one_bernoulli.fit(X_train, y_train)
    y_pred = bayes_one_bernoulli.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [47 48 49 50 51] TEST: [0 1 2 3 4]
precision=0.9003974748655599
recall=   0.8936170212765957
f1=       0.8933275437834709

TRAIN: [0 1 2 3 4] TEST: [47 48 49 50 51]
precision=0.8350089626685371
recall=   0.8297872340425532
f1=       0.8308904649330181

TRAIN: [0 1 2 3 4] TEST: [94 95 96 97 98]
precision=0.9790104947526237
recall=   0.9782608695652174
f1=       0.978141004685627

TRAIN: [0 1 2 3 4] TEST: [140 141 142 143 144]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 1 2 3 4] TEST: [186 187 188 189 190]
precision=0.8819397993311037
recall=   0.8695652173913043
f1=       0.8690711462450593



In [34]:
bayes_one_bernoulli_scores = model_selection.cross_validate(bayes_one_bernoulli, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
bayes_one_bernoulli_scores

{'fit_time': array([0.00266814, 0.00235081, 0.0016799 , 0.00132513, 0.00144601]),
 'score_time': array([0.00362802, 0.00224924, 0.00208306, 0.0019691 , 0.00181389]),
 'test_precision_weighted': array([0.90092843, 0.83310207, 0.97909699, 1.        , 0.8819398 ]),
 'test_recall_weighted': array([0.89361702, 0.82978723, 0.97826087, 1.        , 0.86956522]),
 'test_f1_weighted': array([0.89361702, 0.82994141, 0.97820889, 1.        , 0.86907115])}

In [35]:
for i in bayes_one_bernoulli_scores:
    bayes_one_bernoulli_scores_ms = {
        i : (f'mean= {np.mean(bayes_one_bernoulli_scores[i]).round(3)}, std = {np.std(bayes_one_bernoulli_scores[i]).round(4)}')
    }
    print(bayes_one_bernoulli_scores_ms)

{'fit_time': 'mean= 0.002, std = 0.0005'}
{'score_time': 'mean= 0.002, std = 0.0007'}
{'test_precision_weighted': 'mean= 0.919, std = 0.0621'}
{'test_recall_weighted': 'mean= 0.914, std = 0.0648'}
{'test_f1_weighted': 'mean= 0.914, std = 0.0648'}


In [36]:
bayes_one_gaussian = naive_bayes.GaussianNB()

In [37]:
bayes_one_gaussian.fit(X, y)

GaussianNB()

In [38]:
print("predicted:", bayes_one_gaussian.predict(X[-2:]))
print("truth", y[-2:])

predicted: [0 1]
truth 430    0
431    1
Name: Class Name, dtype: int64


In [39]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df1):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bayes_one_gaussian.fit(X_train, y_train)
    y_pred = bayes_one_gaussian.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [47 48 49 50 51] TEST: [0 1 2 3 4]
precision=0.937018193031144
recall=   0.9361702127659575
f1=       0.9361702127659575

TRAIN: [0 1 2 3 4] TEST: [47 48 49 50 51]
precision=0.942347288949897
recall=   0.9361702127659575
f1=       0.9350574416567924

TRAIN: [0 1 2 3 4] TEST: [94 95 96 97 98]
precision=0.9790104947526237
recall=   0.9782608695652174
f1=       0.978141004685627

TRAIN: [0 1 2 3 4] TEST: [140 141 142 143 144]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 1 2 3 4] TEST: [186 187 188 189 190]
precision=0.916501976284585
recall=   0.9130434782608695
f1=       0.9130434782608695



In [40]:
bayes_one_gaussian_scores = model_selection.cross_validate(bayes_one_gaussian, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
bayes_one_gaussian_scores

{'fit_time': array([0.00256205, 0.00190902, 0.00236201, 0.00152802, 0.00124884]),
 'score_time': array([0.00243783, 0.00261712, 0.00241113, 0.00191212, 0.00209093]),
 'test_precision_weighted': array([0.93713383, 0.94300912, 0.97909699, 1.        , 0.91650198]),
 'test_recall_weighted': array([0.93617021, 0.93617021, 0.97826087, 1.        , 0.91304348]),
 'test_f1_weighted': array([0.93622811, 0.93564148, 0.97820889, 1.        , 0.91304348])}

In [41]:
for i in bayes_one_gaussian_scores:
    bayes_one_gaussian_scores_ms = {
        i : (f'mean= {np.mean(bayes_one_gaussian_scores[i]).round(3)}, std = {np.std(bayes_one_gaussian_scores[i]).round(4)}')
    }
    print(bayes_one_gaussian_scores_ms)

{'fit_time': 'mean= 0.002, std = 0.0005'}
{'score_time': 'mean= 0.002, std = 0.0003'}
{'test_precision_weighted': 'mean= 0.955, std = 0.0302'}
{'test_recall_weighted': 'mean= 0.953, std = 0.0317'}
{'test_f1_weighted': 'mean= 0.953, std = 0.0317'}


## DecisionTree Classifier

### ii

In [42]:
X = df2.drop(['Class Name'], axis=1)
y = df2['Class Name']
tree_two_enp = tree.DecisionTreeClassifier(criterion='entropy')
tree_two_gini = tree.DecisionTreeClassifier(criterion='gini')

print(tree_two_enp)
print(tree_two_gini)

DecisionTreeClassifier(criterion='entropy')
DecisionTreeClassifier()


In [43]:
tree_two_enp.fit(X, y)
tree_two_gini.fit(X, y)

DecisionTreeClassifier()

In [44]:
# predict
y_pred_enp = tree_two_enp.predict(X)
y_pred_gini = tree_two_gini.predict(X)

In [45]:
print(tree_two_enp.predict(X[:2]))
print(tree_two_gini.predict(X[:2]))
print(y[:2])

[0 0]
[0 0]
0    0
1    0
Name: Class Name, dtype: int64


#### performing 5-fold cross validation and report precision, recall, and F1-scores for the second scenario

In [46]:
# k-fold on entropy crietrion
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df2):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    tree_two_enp.fit(X_train, y_train)
    y_pred = tree_two_enp.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9656129205097784
recall=   0.9655172413793104
f1=       0.965431569929321

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.9413793103448277
recall=   0.9310344827586207
f1=       0.9317614165890029

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9770114942528736
recall=   0.9770114942528736
f1=       0.9770114942528736

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9335349868925186
recall=   0.9310344827586207
f1=       0.9301332288401254

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.9080459770114943
recall=   0.9080459770114943
f1=       0.9080459770114943



In [47]:
tree_two_enp_scores = model_selection.cross_validate(tree_two_enp, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
tree_two_enp_scores

{'fit_time': array([0.00194097, 0.00126219, 0.00153923, 0.00141168, 0.0010922 ]),
 'score_time': array([0.00237489, 0.00244403, 0.0023489 , 0.00185537, 0.00178719]),
 'test_precision_weighted': array([0.96559464, 0.94137931, 0.96559464, 0.90804598, 0.93103448]),
 'test_recall_weighted': array([0.96551724, 0.93103448, 0.96551724, 0.90804598, 0.93103448]),
 'test_f1_weighted': array([0.96542104, 0.93176142, 0.96542104, 0.90804598, 0.93103448])}

In [48]:
for i in tree_two_enp_scores:
    tree_two_enp_scores_ms = {
        i : (f'mean= {np.mean(tree_two_enp_scores[i]).round(3)}, std = {np.std(tree_two_enp_scores[i]).round(4)}')
    }
    print(tree_two_enp_scores_ms)

{'fit_time': 'mean= 0.001, std = 0.0003'}
{'score_time': 'mean= 0.002, std = 0.0003'}
{'test_precision_weighted': 'mean= 0.942, std = 0.0218'}
{'test_recall_weighted': 'mean= 0.94, std = 0.0223'}
{'test_f1_weighted': 'mean= 0.94, std = 0.0222'}


In [49]:
# k-fold on gini crietrion
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df2):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    tree_two_gini.fit(X_train, y_train)
    y_pred = tree_two_gini.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9540229885057471
recall=   0.9540229885057471
f1=       0.9540229885057471

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.9498968464485705
recall=   0.9425287356321839
f1=       0.9430743735686747

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9770114942528736
recall=   0.9770114942528736
f1=       0.9770114942528736

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9335349868925186
recall=   0.9310344827586207
f1=       0.9301332288401254

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.9361457831771548
recall=   0.9310344827586207
f1=       0.9315979265269326



In [50]:
tree_two_gini_scores = model_selection.cross_validate(tree_two_gini, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
tree_two_gini_scores

{'fit_time': array([0.00235987, 0.00154018, 0.00138378, 0.00156021, 0.00119305]),
 'score_time': array([0.00300717, 0.00206494, 0.00212526, 0.0023191 , 0.00190592]),
 'test_precision_weighted': array([0.95402299, 0.93614578, 0.96559464, 0.89615987, 0.91366263]),
 'test_recall_weighted': array([0.95402299, 0.93103448, 0.96551724, 0.89655172, 0.90804598]),
 'test_f1_weighted': array([0.95402299, 0.93159793, 0.96542104, 0.8962305 , 0.90890489])}

In [51]:
for i in tree_two_gini_scores:
    tree_two_gini_scores_ms = {
        i : (f'mean= {np.mean(tree_two_gini_scores[i]).round(3)}, std = {np.std(tree_two_gini_scores[i]).round(4)}')
    }
    print(tree_two_gini_scores_ms)

{'fit_time': 'mean= 0.002, std = 0.0004'}
{'score_time': 'mean= 0.002, std = 0.0004'}
{'test_precision_weighted': 'mean= 0.933, std = 0.0255'}
{'test_recall_weighted': 'mean= 0.931, std = 0.0262'}
{'test_f1_weighted': 'mean= 0.931, std = 0.0261'}


This time it is noticed that the accuracy scores are slightly higher with entropy as our criterion on this model

## Naïve Bayes Classifier

### ii

In [52]:
bayes_two_gaussian = naive_bayes.GaussianNB()

In [53]:
bayes_two_gaussian.fit(X, y)

GaussianNB()

In [54]:
bayes_two_gaussian.predict(X)

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,

In [55]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df2):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bayes_two_gaussian.fit(X_train, y_train)
    y_pred = bayes_two_gaussian.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9454547699297504
recall=   0.9425287356321839
f1=       0.9428561183940785

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.9540229885057471
recall=   0.9540229885057471
f1=       0.9540229885057471

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9544193420531115
recall=   0.9540229885057471
f1=       0.9536600120992136

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9673645320197043
recall=   0.9655172413793104
f1=       0.9651960187866546

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.9000745573159367
recall=   0.896551724137931
f1=       0.89723072280351



In [56]:
bayes_two_gaussian_scores = model_selection.cross_validate(bayes_two_gaussian, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
bayes_two_gaussian_scores

{'fit_time': array([0.00231099, 0.00136614, 0.001122  , 0.00159001, 0.00104499]),
 'score_time': array([0.00238585, 0.00208783, 0.00200105, 0.00197601, 0.00190902]),
 'test_precision_weighted': array([0.94556695, 0.95402299, 0.94387944, 0.98871473, 0.90027045]),
 'test_recall_weighted': array([0.94252874, 0.95402299, 0.94252874, 0.98850575, 0.89655172]),
 'test_f1_weighted': array([0.94290596, 0.95402299, 0.94199336, 0.98847006, 0.89732277])}

In [57]:
for i in bayes_two_gaussian_scores:
    bayes_two_gaussian_scores_ms = {
        i : (f'mean= {np.mean(bayes_two_gaussian_scores[i]).round(3)}, std = {np.std(bayes_two_gaussian_scores[i]).round(4)}')
    }
    print(bayes_two_gaussian_scores_ms)

{'fit_time': 'mean= 0.001, std = 0.0005'}
{'score_time': 'mean= 0.002, std = 0.0002'}
{'test_precision_weighted': 'mean= 0.946, std = 0.0282'}
{'test_recall_weighted': 'mean= 0.945, std = 0.0294'}
{'test_f1_weighted': 'mean= 0.945, std = 0.0292'}


In [58]:
bayes_two_bernoulli = naive_bayes.BernoulliNB()

In [59]:
bayes_two_bernoulli.fit(X, y)

BernoulliNB()

In [60]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df2):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bayes_two_bernoulli.fit(X_train, y_train)
    y_pred = bayes_two_bernoulli.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9183213271413074
recall=   0.9080459770114943
f1=       0.9088608220057534

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.8649516705062104
recall=   0.8505747126436781
f1=       0.8522651805410425

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9544193420531115
recall=   0.9540229885057471
f1=       0.9536600120992136

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9424706838499942
recall=   0.9425287356321839
f1=       0.9423684029097185

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.8421764727392401
recall=   0.8275862068965517
f1=       0.8295367467781262



In [61]:
bayes_two_bernoulli_scores = model_selection.cross_validate(bayes_two_bernoulli, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
bayes_two_bernoulli_scores

{'fit_time': array([0.002496  , 0.00200176, 0.00123692, 0.00166726, 0.00119615]),
 'score_time': array([0.00301003, 0.00211406, 0.00185204, 0.00194716, 0.00199986]),
 'test_precision_weighted': array([0.91864759, 0.86495167, 0.94387944, 0.95402299, 0.842865  ]),
 'test_recall_weighted': array([0.90804598, 0.85057471, 0.94252874, 0.95402299, 0.82758621]),
 'test_f1_weighted': array([0.90901522, 0.85226518, 0.94199336, 0.95402299, 0.82987789])}

In [62]:
for i in bayes_two_bernoulli_scores:
    bayes_two_bernoulli_scores_ms = {
        i : (f'mean= {np.mean(bayes_two_bernoulli_scores[i]).round(3)}, std = {np.std(bayes_two_bernoulli_scores[i]).round(4)}')
    }
    print(bayes_two_bernoulli_scores_ms)

{'fit_time': 'mean= 0.002, std = 0.0005'}
{'score_time': 'mean= 0.002, std = 0.0004'}
{'test_precision_weighted': 'mean= 0.905, std = 0.0437'}
{'test_recall_weighted': 'mean= 0.897, std = 0.0498'}
{'test_f1_weighted': 'mean= 0.897, std = 0.0488'}


## DecisionTree Classifier

### iii

In [63]:
X = df3.drop(['Class Name'], axis=1)
y = df3['Class Name']
tree_three_enp = tree.DecisionTreeClassifier(criterion='entropy')
tree_three_gini = tree.DecisionTreeClassifier(criterion='gini')

print(tree_three_enp)
print(tree_three_gini)

DecisionTreeClassifier(criterion='entropy')
DecisionTreeClassifier()


In [64]:
tree_three_enp.fit(X, y)
tree_three_gini.fit(X, y)

DecisionTreeClassifier()

In [65]:
# predict
y_pred_enp = tree_three_enp.predict(X)
y_pred_gini = tree_three_gini.predict(X)

In [66]:
print(tree_three_enp.predict(X[:2]))
print(tree_three_gini.predict(X[:2]))
print(y[:2])

[0 0]
[0 0]
0    0
1    0
Name: Class Name, dtype: int64


#### performing 5-fold cross validation and report precision, recall, and F1-scores for the third scenario

In [67]:
# on entropy criterion
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df3):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    tree_three_enp.fit(X_train, y_train)
    y_pred = tree_three_enp.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9770114942528736
recall=   0.9770114942528736
f1=       0.9770114942528736

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.9588626739261947
recall=   0.9540229885057471
f1=       0.9543986176846216

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9556871395951856
recall=   0.9540229885057471
f1=       0.9543234064785788

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.931269592476489
recall=   0.9310344827586207
f1=       0.9306281202832927

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.9455669462566014
recall=   0.9425287356321839
f1=       0.942905957113061



In [68]:
tree_three_enp_scores = model_selection.cross_validate(tree_three_enp, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
tree_three_enp_scores

{'fit_time': array([0.00211692, 0.00149298, 0.00119877, 0.00112677, 0.00137496]),
 'score_time': array([0.00278306, 0.00222778, 0.00179601, 0.00180006, 0.00198627]),
 'test_precision_weighted': array([0.97701149, 0.95886267, 0.96559464, 0.96733212, 0.93632805]),
 'test_recall_weighted': array([0.97701149, 0.95402299, 0.96551724, 0.96551724, 0.93103448]),
 'test_f1_weighted': array([0.97701149, 0.95439862, 0.96542104, 0.96516221, 0.93167867])}

In [69]:
for i in tree_three_enp_scores:
    tree_three_enp_scores_ms = {
        i : (f'mean = {np.mean(tree_three_enp_scores[i]).round(3)}, std= {np.std(tree_three_enp_scores[i]).round(3)}')
    }
    print(tree_three_enp_scores_ms)

{'fit_time': 'mean = 0.001, std= 0.0'}
{'score_time': 'mean = 0.002, std= 0.0'}
{'test_precision_weighted': 'mean = 0.961, std= 0.014'}
{'test_recall_weighted': 'mean = 0.959, std= 0.016'}
{'test_f1_weighted': 'mean = 0.959, std= 0.015'}


In [70]:
# on gini criterion
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df3):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    tree_three_gini.fit(X_train, y_train)
    y_pred = tree_three_gini.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9770114942528736
recall=   0.9770114942528736
f1=       0.9770114942528736

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.9498968464485705
recall=   0.9425287356321839
f1=       0.9430743735686747

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9556871395951856
recall=   0.9540229885057471
f1=       0.9543234064785788

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9380479251899474
recall=   0.9310344827586207
f1=       0.9295447322421738

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.9326872511456689
recall=   0.9310344827586207
f1=       0.9313565744600227



In [71]:
tree_three_gini_scores = model_selection.cross_validate(tree_three_gini, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
tree_three_gini_scores

{'fit_time': array([0.00174785, 0.00130701, 0.0013628 , 0.00117612, 0.00107789]),
 'score_time': array([0.00264978, 0.00186014, 0.00205588, 0.00179291, 0.00183082]),
 'test_precision_weighted': array([0.97701149, 0.95886267, 0.95402299, 0.93119339, 0.93277757]),
 'test_recall_weighted': array([0.97701149, 0.95402299, 0.95402299, 0.93103448, 0.93103448]),
 'test_f1_weighted': array([0.97701149, 0.95439862, 0.95402299, 0.93058386, 0.93139806])}

In [72]:
for i in tree_three_gini_scores:
    tree_three_gini_scores_ms = {
        i : (f'mean = {np.mean(tree_three_gini_scores[i]).round(3)}, std= {np.std(tree_three_gini_scores[i]).round(3)}')
    }
    print(tree_three_gini_scores_ms)

{'fit_time': 'mean = 0.001, std= 0.0'}
{'score_time': 'mean = 0.002, std= 0.0'}
{'test_precision_weighted': 'mean = 0.951, std= 0.017'}
{'test_recall_weighted': 'mean = 0.949, std= 0.017'}
{'test_f1_weighted': 'mean = 0.949, std= 0.017'}


## Naïve Bayes Classifier

### iii

In [73]:
bayes_three_bernoulli = naive_bayes.BernoulliNB()

In [74]:
bayes_three_bernoulli.fit(X, y)

BernoulliNB()

In [75]:
print(bayes_three_bernoulli.predict(X[-2:]))
print(y[-2:])

[0 0]
433    0
434    0
Name: Class Name, dtype: int64


In [76]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df3):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bayes_three_bernoulli.fit(X_train, y_train)
    y_pred = bayes_three_bernoulli.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.913203654582965
recall=   0.9080459770114943
f1=       0.9086921404162783

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.8731841526045487
recall=   0.8620689655172413
f1=       0.8635228331780057

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9544193420531115
recall=   0.9540229885057471
f1=       0.9536600120992136

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9424706838499942
recall=   0.9425287356321839
f1=       0.9423684029097185

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.8570333880678709
recall=   0.8390804597701149
f1=       0.8409817647567195



In [77]:
bayes_three_bernoulli_scores = model_selection.cross_validate(bayes_three_bernoulli, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
bayes_three_bernoulli_scores

{'fit_time': array([0.00176215, 0.00153399, 0.00170994, 0.00131011, 0.00143909]),
 'score_time': array([0.0024929 , 0.00252199, 0.00224519, 0.00192499, 0.001755  ]),
 'test_precision_weighted': array([0.91342889, 0.87318415, 0.94387944, 0.9659765 , 0.85775405]),
 'test_recall_weighted': array([0.90804598, 0.86206897, 0.94252874, 0.96551724, 0.83908046]),
 'test_f1_weighted': array([0.90879724, 0.86352283, 0.94199336, 0.96561344, 0.84134203])}

In [78]:
for i in bayes_three_bernoulli_scores:
    bayes_three_bernoulli_scores_ms = {
        i : (f'mean= {np.mean(bayes_three_bernoulli_scores[i]).round(3)}, std = {np.std(bayes_three_bernoulli_scores[i]).round(4)}')
    }
    print(bayes_three_bernoulli_scores_ms)

{'fit_time': 'mean= 0.002, std = 0.0002'}
{'score_time': 'mean= 0.002, std = 0.0003'}
{'test_precision_weighted': 'mean= 0.911, std = 0.0409'}
{'test_recall_weighted': 'mean= 0.903, std = 0.0474'}
{'test_f1_weighted': 'mean= 0.904, std = 0.0465'}


In [79]:
bayes_three_gaussian = naive_bayes.GaussianNB()

In [80]:
bayes_three_gaussian.fit(X, y)

GaussianNB()

In [81]:
print(bayes_three_gaussian.predict(X[-2:]))
print(y[-2:])

[0 0]
433    0
434    0
Name: Class Name, dtype: int64


In [82]:
kf = model_selection.KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(df3):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bayes_three_gaussian.fit(X_train, y_train)
    y_pred = bayes_three_gaussian.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()

TRAIN: [87 88 89 90 91] TEST: [0 1 2 3 4]
precision=0.9454547699297504
recall=   0.9425287356321839
f1=       0.9428561183940785

TRAIN: [0 1 2 3 4] TEST: [87 88 89 90 91]
precision=0.9000745573159367
recall=   0.896551724137931
f1=       0.89723072280351

TRAIN: [0 1 2 3 4] TEST: [174 175 176 177 178]
precision=0.9544193420531115
recall=   0.9540229885057471
f1=       0.9536600120992136

TRAIN: [0 1 2 3 4] TEST: [261 262 263 264 265]
precision=0.9655946437555633
recall=   0.9655172413793104
f1=       0.9654210417458309

TRAIN: [0 1 2 3 4] TEST: [348 349 350 351 352]
precision=0.8679951109300345
recall=   0.8620689655172413
f1=       0.8631958530538654



In [83]:
bayes_three_gaussian_scores = model_selection.cross_validate(bayes_three_gaussian, X, y, cv=5, 
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
bayes_three_gaussian_scores

{'fit_time': array([0.00163078, 0.00196505, 0.00112581, 0.00096488, 0.00112009]),
 'score_time': array([0.00303912, 0.00257683, 0.00180721, 0.00180197, 0.0023067 ]),
 'test_precision_weighted': array([0.94556695, 0.90007456, 0.94387944, 0.95402299, 0.86833178]),
 'test_recall_weighted': array([0.94252874, 0.89655172, 0.94252874, 0.95402299, 0.86206897]),
 'test_f1_weighted': array([0.94290596, 0.89723072, 0.94199336, 0.95402299, 0.86335733])}

In [84]:
for i in bayes_three_gaussian_scores:
    bayes_three_gaussian_scores_ms = {
        i : (f'mean= {np.mean(bayes_three_gaussian_scores[i]).round(3)}, std = {np.std(bayes_three_gaussian_scores[i]).round(4)}')
    }
    print(bayes_three_gaussian_scores_ms)

{'fit_time': 'mean= 0.001, std = 0.0004'}
{'score_time': 'mean= 0.002, std = 0.0005'}
{'test_precision_weighted': 'mean= 0.922, std = 0.0329'}
{'test_recall_weighted': 'mean= 0.92, std = 0.0349'}
{'test_f1_weighted': 'mean= 0.92, std = 0.0343'}
