### import

In [575]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [576]:
## load the spambase dataset from the csv file that sits next to the notebook:
data = pd.read_csv(filepath_or_buffer='spambase_csv.csv')
## preview the first 5 rows:
data.head(n=5)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Unnamed: 0,word_freq_make,word_freq_all,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,word_freq_receive,word_freq_people,...,word_freq_pm,word_freq_meeting,word_freq_original,word_freq_re,word_freq_edu,char_freq_%21,char_freq_%24,capital_run_length_longest,capital_run_length_total,class
1209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.706,0.0,55,143,1
591,0.45,0.68,1.92,0.0,0.56,0.45,0.0,0.45,0.22,0.0,...,0.0,0.0,0.0,0.0,0.11,0.574,0.134,94,385,1
1188,0.0,0.0,0.0,0.53,0.53,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,81,1
95,0.0,0.46,0.0,0.0,0.46,0.0,0.0,0.0,0.46,0.0,...,0.0,0.0,0.0,0.46,0.0,0.0,0.0,32,91,1
323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,19,1


### shuffling the dataset:

In [577]:
## shuffle the rows with a fixed seed so that the balancing and the train/val/test split
## below are reproducible after Restart & Run All:
rand_permutations = np.random.RandomState(seed=42).permutation(data.shape[0])
## the permutation holds row *positions*, so it is applied with iloc (loc would only be
## equivalent while the index is the default 0..n-1 range):
shuffeled_data = data.iloc[rand_permutations, :]
shuffeled_data.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
114,0.12,1.76,0.63,0.0,0.88,0.0,0.12,0.5,0.25,3.9,...,0.019,0.379,0.159,0.0,0.119,0.0,4.155,38,507,1
4507,0.23,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.11,...,0.0,0.126,0.0,0.021,0.0,0.0,1.198,5,145,0
1029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.04,0.0,0.0,...,0.0,0.0,0.0,0.539,0.269,0.0,5.787,47,272,1
631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,13,1
1011,0.0,0.0,0.51,0.0,0.51,0.51,0.51,0.0,0.0,0.0,...,0.0,0.09,0.0,0.18,0.0,0.0,1.773,17,94,1


### balancing the dataset:

In [578]:
## count the records of each class; the sum of the 0/1 `class` column is used as the spam count:
num_of_spams = data['class'].sum()
num_of_zeroes = len(data) - num_of_spams
print(f'number of spam emails: {num_of_spams} . number_of_regular_emails: {num_of_zeroes}')
## how many class=0 records have to be dropped for both classes to have the same size:
num_examples_to_remove = num_of_zeroes - num_of_spams

number of spam emails: 1813 . number_of_regular_emails: 2788


We need to remove 975 records of class=0 (2788 − 1813) so that both classes have 1813 records.

In [579]:
## index labels of the first `num_examples_to_remove` class=0 records, in shuffled order.
## the count comes from the class counts computed above instead of a hard-coded 975, so the
## cell stays correct if the dataset changes.
## NOTE: assumes class=0 is the majority class (num_examples_to_remove >= 0).
index_to_remove = shuffeled_data[shuffeled_data['class'] == 0].index.values[:num_examples_to_remove]
## drop those rows; every spam record and the remaining regular records are kept:
balanced_data = shuffeled_data.drop(index_to_remove, axis=0)

now the data is equally balanced:

In [580]:
## re-count both classes on the balanced data to verify the result:
num_of_spams = balanced_data['class'].sum()
num_of_zeroes = len(balanced_data) - num_of_spams
print(f'number of spam emails: {num_of_spams} . number_of_regular_emails: {num_of_zeroes}')


number of spam emails: 1813 . number_of_regular_emails: 1813


### checkpoint:

In [581]:
## work on an independent (deep) copy from here on, so balanced_data itself stays untouched.
## note: this rebinds `data`, which held the raw csv contents in the cells above.
data = balanced_data.copy(deep=True)

### splitting the dataset into train, val and test:

In [582]:
## total number of records in the balanced data:
samples_count = len(data)
## the first 85% of the rows are reserved for train + val together ...
train_samples_count = int(0.85*samples_count)
## ... and the val size is 15% of that 85% portion:
val_samples_count = int(0.15*train_samples_count)
## whatever is left after the first 85% is the test set:
test_samples_count = samples_count - train_samples_count

In [583]:
## positional slices of the (already shuffled) data:
##   [0, val_samples_count)                   -> val
##   [val_samples_count, train_samples_count) -> train
##   [train_samples_count, end)               -> test
val = data.iloc[:val_samples_count]
train = data.iloc[val_samples_count:train_samples_count]
test = data.iloc[train_samples_count:]


### creating inputs and targets:

In [584]:
## train:
train_inputs = train.drop(['class'],axis=1)
train_targets = train['class']

## val:
val_inputs = val.drop(['class'],axis=1)
val_targets = val['class']

## test:
test_inputs = test.drop(['class'],axis=1)
test_targets = test['class']


### standardizing the inputs using statistics computed from the train set only!

In [585]:
## creating scaler from the train dataset only:
scaler = StandardScaler()

scaler.fit(train_inputs)
scaled_train_inputs = scaler.transform(train_inputs)

## using the scaler to standardize the val inputs and test inputs:
scaled_val_inputs = scaler.transform(val_inputs)
scaled_test_inputs = scaler.transform(test_inputs)


### modeling:

setting parameters:

In [680]:
## number of input features (columns of the train inputs):
input_size = train_inputs.shape[1]
output_size = 2 ## since we have only 2 classes
hidden_layer_size = 50
batch_size = 20
## stop training once val_loss has not improved for 2 consecutive epochs, and roll the model
## back to the weights of the best epoch. without restore_best_weights=True the model keeps
## the weights of the last epoch, i.e. after val_loss has already been getting worse.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                                  restore_best_weights=True)
## upper bound on the number of epochs; early stopping normally ends training sooner:
num_of_epochs = 100


model:

In [681]:
## feed-forward classifier: two relu hidden layers followed by a softmax output layer
## that has one unit per class.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
model.add(tf.keras.layers.Dense(output_size, activation='softmax'))

In [682]:
## sparse_categorical_crossentropy is used because the targets are integer class labels
## and the output layer is a softmax over the classes:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [683]:
## train on the scaled train set, monitoring the val set after every epoch;
## the early_stopping callback decides when to stop.
model.fit(
    x=scaled_train_inputs,
    y=train_targets,
    validation_data=(scaled_val_inputs, val_targets),
    epochs=num_of_epochs,
    batch_size=batch_size,
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/100
131/131 - 1s - loss: 0.3769 - accuracy: 0.8630 - val_loss: 0.3284 - val_accuracy: 0.8593
Epoch 2/100
131/131 - 0s - loss: 0.2012 - accuracy: 0.9290 - val_loss: 0.3100 - val_accuracy: 0.8810
Epoch 3/100
131/131 - 0s - loss: 0.1639 - accuracy: 0.9412 - val_loss: 0.2965 - val_accuracy: 0.8788
Epoch 4/100
131/131 - 0s - loss: 0.1454 - accuracy: 0.9500 - val_loss: 0.2661 - val_accuracy: 0.9026
Epoch 5/100
131/131 - 0s - loss: 0.1352 - accuracy: 0.9511 - val_loss: 0.2120 - val_accuracy: 0.9221
Epoch 6/100
131/131 - 0s - loss: 0.1240 - accuracy: 0.9592 - val_loss: 0.2864 - val_accuracy: 0.9004
Epoch 7/100
131/131 - 0s - loss: 0.1124 - accuracy: 0.9592 - val_loss: 0.3275 - val_accuracy: 0.8853


<tensorflow.python.keras.callbacks.History at 0x1d8b2732130>

### test the model:

In [679]:
## model.evaluate returns the loss followed by the compiled metrics ([loss, accuracy]),
## not per-sample predictions, so the two values are unpacked under accurate names:
test_loss, test_accuracy = model.evaluate(scaled_test_inputs, test_targets)

## the actual class predictions are the argmax over the softmax outputs of each sample:
predictions = np.argmax(model.predict(scaled_test_inputs), axis=1)
## cross-check the accuracy reported by evaluate with sklearn:
accuracy_score(test_targets, predictions)



In [612]:
## baseline for comparison: logistic regression trained on the same scaled train inputs.
reg = LogisticRegression().fit(scaled_train_inputs, train_targets)
## mean accuracy of the baseline on the scaled test set:
reg.score(scaled_test_inputs, test_targets)


0.9264705882352942