In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA

In [107]:
#######################
# Your code goes here #
#######################
# Load the raw CSV; it has no header row, so columns are labelled 0..n-1.
raw_frame = pd.read_csv('./spambase.data', header=None)
# The last column is used as the class label, every other column as a feature.
label_column = raw_frame.columns[-1]
# Convert straight to NumPy arrays instead of round-tripping through Python
# lists, and build the feature matrix without mutating the loaded frame.
labels = raw_frame[label_column].values
data = raw_frame.drop(columns=label_column).values
# Shuffle rows and labels together; the fixed seed keeps the order reproducible.
X, y = shuffle(data, labels, random_state=0)

In [108]:
# Drop feature column 46 exactly once. `data` still holds the full-width
# feature matrix from the loading cell, so comparing the widths turns a
# re-run of this cell into a no-op instead of silently deleting one more
# column every time it is executed.
# NOTE(review): index 46 is presumably chosen because it has the smallest
# value in the `temp` table further down - confirm.
if X.shape[1] == data.shape[1]:
    X = np.delete(X, [46], axis=1)

In [109]:
#######################
# Your code goes here #
#######################
def norm1(X):
    '''
    Min-max scale X along axis 0 so every column spans [0, 1].

    X : array-like of numbers (integer input is converted to float).

    Returns a new float array; X itself is not modified. A constant
    column (max == min) is mapped to all zeros rather than NaN.
    '''
    values = np.asarray(X, dtype=float)
    col_min = values.min(axis=0)
    col_range = values.max(axis=0) - col_min
    # A zero range would give 0/0 = NaN; divide by 1 there so the column
    # simply becomes zeros.
    col_range = np.where(col_range == 0, 1.0, col_range)
    return (values - col_min) / col_range
def norm2(X):
    '''
    Standardise X along axis 0: subtract the column mean and divide by
    the column standard deviation (z-score).

    X : array-like of numbers.

    Returns a new float array; X itself is not modified. A column with
    zero standard deviation is mapped to all zeros rather than NaN.
    '''
    values = np.asarray(X, dtype=float)
    col_std = values.std(axis=0)
    # Guard against division by zero for constant columns.
    col_std = np.where(col_std == 0, 1.0, col_std)
    return (values - values.mean(axis=0)) / col_std


# Build both scaled versions of the feature matrix: min-max and z-score.
norm1_x, norm2_x = norm1(X), norm2(X)


In [110]:
#######################
# Your code goes here #
#######################
test_precentage = 20


def split(X, y, test_precentage):
    '''
    Hold out `test_precentage` percent of the samples as a test set.

    Returns the tuple (x_train, x_test, y_train, y_test). The split uses
    a fixed random_state of 0.
    '''
    parts = train_test_split(
        X,
        y,
        test_size=test_precentage / 100,
        random_state=0,
    )
    return tuple(parts)


# The z-score-normalised features (norm2) are the ones carried forward.
x_train, x_test, y_train, y_test = split(norm2_x, y, test_precentage)

In [111]:
def k_fold_validation(k, x, y):
    '''
    Partition (x, y) into k folds.

    Returns a list with one dict per fold, holding the training and
    validation data and labels under the keys "x_train", "x_val",
    "y_train" and "y_val".
    '''
    folder = KFold(n_splits=k)
    return [
        {
            "x_train": x[fit_rows],
            "x_val": x[held_rows],
            "y_train": y[fit_rows],
            "y_val": y[held_rows],
        }
        for fit_rows, held_rows in folder.split(x, y)
    ]

def k_fold_performance(splits, test_x=None, test_y=None):
    '''
    Fit one decision tree per fold and record its accuracy in percent.

    splits : list of dicts in the format returned by k_fold_validation,
        i.e. with the keys "x_train", "y_train", "x_val" and "y_val".
    test_x, test_y : optional held-out test data and labels. When one is
        omitted, the notebook-level x_test / y_test is used in its place,
        which keeps existing single-argument calls working.

    Returns (performance, trees): `performance` maps each metric name to
    a list with one entry per fold, and `trees` holds the fitted
    classifiers in fold order.
    '''
    # Backward-compatible fallback to the globals created by the split cell;
    # pass the test data explicitly to avoid depending on notebook state.
    if test_x is None:
        test_x = x_test
    if test_y is None:
        test_y = y_test

    performance = {
        'Training Set Accuracy': [],
        'Validation Set Accuracy': [],
        'Testing Set Accuracy': [],
    }
    trees = []
    for fold in splits:
        clf = DecisionTreeClassifier()
        clf.fit(fold['x_train'], fold['y_train'])
        trees.append(clf)
        # clf.score returns a fraction; store it as a percentage.
        performance['Training Set Accuracy'].append(
            clf.score(fold['x_train'], fold['y_train']) * 100)
        performance['Validation Set Accuracy'].append(
            clf.score(fold['x_val'], fold['y_val']) * 100)
        performance['Testing Set Accuracy'].append(
            clf.score(test_x, test_y) * 100)
    return performance, trees

# Number of folds used for cross-validation

k = 15

# Build the folds first, then fit and score one tree per fold

fold_splits = k_fold_validation(k, x_train, y_train)
performance, trees = k_fold_performance(fold_splits)
print(f"Average Training Accuracy, over {k} validations: {sum(performance['Training Set Accuracy'])/len(performance['Training Set Accuracy'])}%")

# Tabulate the per-fold accuracies: one row per fold, one column per metric

df = pd.DataFrame(performance)
df.index.name = 'Validation'
df.columns.name = 'Metrics'
df

#Run the K fold Validation and report the scores

#######################
# Your code goes here #
#######################


Average Training Accuracy, over 15 validations: 99.92236043537565%


Metrics,Training Set Accuracy,Validation Set Accuracy,Testing Set Accuracy
Validation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,99.941759,89.430894,91.530945
1,99.941759,91.463415,91.313789
2,99.912638,92.682927,90.770901
3,99.912638,89.837398,91.096634
4,99.912638,89.837398,90.553746
5,99.912664,92.653061,90.879479
6,99.912664,91.836735,90.336591
7,99.912664,90.204082,90.010858
8,99.970888,90.204082,92.290988
9,99.912664,91.020408,91.422367


In [112]:
def ensemble_components(components, x_train, y_train, n_prime, max_features=5, random_state=None):
    '''
    Train `components` decision trees, each on its own bootstrap sample.

    components : number of trees to fit.
    x_train, y_train : training data and labels; rows are selected with
        integer-array indexing, so NumPy arrays are expected.
    n_prime : number of rows drawn, with replacement, for every tree.
    max_features : value forwarded to DecisionTreeClassifier(max_features=...).
        Defaults to 5, the value that used to be hard-coded.
    random_state : optional int seed that makes the bootstrap samples and
        the trees reproducible. None (the default) keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns the list of fitted DecisionTreeClassifier objects.
    '''
    # The np.random module and RandomState both expose randint, so the
    # unseeded path is unchanged from the original implementation.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    tree_random_state = None if random_state is None else rng

    ensemble = []
    for _ in range(components):
        clf = DecisionTreeClassifier(max_features=max_features,
                                     random_state=tree_random_state)
        # Bagging: sample n_prime row indices with replacement.
        rand_idx = rng.randint(0, x_train.shape[0], n_prime)
        clf.fit(x_train[rand_idx], y_train[rand_idx])
        ensemble.append(clf)
    return ensemble


def random_forest_algorithm(number_of_trees, x_train, y_train, n_prime, max_features=5, random_state=None):
    '''
    Return the individual trees that make up the random-forest ensemble.

    Thin wrapper around ensemble_components; see that function for the
    meaning of every argument.
    '''
    return ensemble_components(number_of_trees, x_train, y_train, n_prime,
                               max_features=max_features,
                               random_state=random_state)


def test(ensemble, test_set, test_labels):
    preds = [i.predict(test_set) for i in ensemble]
    preds = np.array(preds)
    preds = np.mean(preds, axis = 0)
    for i in range(len(preds)):
        if(preds[i] >= 0.5):
            preds[i] = 1
        else:
            preds[i] = 0
    preds = preds.astype(int)
    score = accuracy_score(test_labels, preds)
    return preds, score

# Repeat the whole random-forest experiment and collect each test accuracy.
temp_1 = []
n_prime = (len(x_train) * 9) // 10  # rows drawn per tree: 90% of the training set size
for _ in range(50):
    ensembles = random_forest_algorithm(100, x_train, y_train, n_prime)
    test_preds, score = test(ensembles, x_test, y_test)
    print(f"Accuracy on the testing Set (Random Forests): {score*100}%")
    temp_1.append(score)
print(np.mean(temp_1))




#######################
# Your code goes here #
#######################

Accuracy on the testing Set (Random Forests): 95.33116178067318%
Accuracy on the testing Set (Random Forests): 95.65689467969598%
Accuracy on the testing Set (Random Forests): 95.33116178067318%
Accuracy on the testing Set (Random Forests): 95.98262757871878%
Accuracy on the testing Set (Random Forests): 95.43973941368078%
Accuracy on the testing Set (Random Forests): 95.43973941368078%
Accuracy on the testing Set (Random Forests): 95.33116178067318%
Accuracy on the testing Set (Random Forests): 96.09120521172639%
Accuracy on the testing Set (Random Forests): 95.43973941368078%
Accuracy on the testing Set (Random Forests): 95.33116178067318%
Accuracy on the testing Set (Random Forests): 95.54831704668838%
Accuracy on the testing Set (Random Forests): 95.65689467969598%
Accuracy on the testing Set (Random Forests): 95.65689467969598%
Accuracy on the testing Set (Random Forests): 96.19978284473399%
Accuracy on the testing Set (Random Forests): 95.65689467969598%
Accuracy on the testing S

In [113]:
# Best test accuracy seen across the repeated random-forest runs
np.asarray(temp_1).max()

0.9619978284473398

In [114]:
# One value per dataset column (58 entries).
# NOTE(review): presumably per-attribute spread statistics copied from the
# Spambase documentation - the source is not shown here, confirm.
temp = [
    0.30536, 1.2906, 0.50414, 1.3952, 0.67251, 0.27382,
    0.39144, 0.40107, 0.27862, 0.64476, 0.20154, 0.8617,
    0.30104, 0.33518, 0.25884, 0.82579, 0.44406, 0.53112,
    1.7755, 0.50977, 1.2008, 1.0258, 0.35029, 0.44264,
    1.6713, 0.88696, 3.3673, 0.53858, 0.59333, 0.45668,
    0.40339, 0.32856, 0.55591, 0.32945, 0.53226, 0.40262,
    0.42345, 0.22065, 0.43467, 0.34992, 0.3612, 0.76682,
    0.22381, 0.62198, 1.0117, 0.91112, 0.076274, 0.28573,
    0.24347, 0.27036, 0.10939, 0.81567, 0.24588, 0.42934,
    31.729, 194.89, 606.35, 0.4887,
]

In [115]:
# Column indices ordered from the smallest to the largest value in `temp`
np.argsort(np.asarray(temp))

array([46, 50, 10, 37, 42, 48, 52, 14, 49,  5,  8, 47, 12,  0, 31, 33, 13,
       39, 22, 40,  6,  7, 35, 30, 36, 53, 38, 23, 16, 29, 57,  2, 19, 17,
       34, 27, 32, 28, 43,  9,  4, 41, 51, 15, 11, 25, 45, 44, 21, 20,  1,
        3, 24, 18, 26, 54, 55, 56])