# Importing the necessary packages

In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Loading the data to a pandas dataframe

In [20]:
df = pd.read_csv('spambase.data', header=None)

X = df.iloc[:,0:56].values # Values in Columns - 1 are stored as features
y = df.iloc[:,57].values # Last column is stored as labels

print (X.shape, y.shape)

(4601, 56) (4601,)


In [21]:
# Creating a pandas dataframe to store the results as a table
result_table = pd.DataFrame(columns=['False Positive','False Negative','Overall error rate']) 

# Performing mean zero normalization on data

In [22]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Initializing an ensemble machine learning classifier (Random Forests)

In [23]:
clf = RandomForestClassifier(n_estimators=30,n_jobs=-2)

# Performing classification on each of the 10 folds of KFold cross-validation

In [24]:
# Initializing KFold and setting no. of splits as 10
cv = KFold(n_splits=10, random_state=42, shuffle=True) 

for i, (train_index, test_index) in enumerate(cv.split(X)):
    clf.fit(X[train_index],y[train_index]) # Training the model on a particular fold
    
    y_pred = clf.predict(X[test_index]) # predicting the labels
    
    # Creating confusion matrix to get False positives and False negatives
    cm = confusion_matrix(y[test_index],y_pred) 
    
    # Storing the result of each fold as a row of result table
    result_table.loc[i] = [cm[0,1],cm[1,0],accuracy_score(y[test_index], y_pred)]  
    
# The mean of each column is calculated and stored as the final row   
result_table.loc[i+1] = [result_table['False Positive'].mean(),result_table['False Negative'].mean(),
                         result_table['Overall error rate'].mean()]
result_table

Unnamed: 0,False Positive,False Negative,Overall error rate
0,4.0,28.0,0.930586
1,7.0,8.0,0.967391
2,7.0,10.0,0.963043
3,6.0,14.0,0.956522
4,12.0,13.0,0.945652
5,7.0,12.0,0.958696
6,12.0,14.0,0.943478
7,7.0,8.0,0.967391
8,8.0,15.0,0.95
9,8.0,10.0,0.96087
