In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [2]:
# Load the pre-cleaned e-mail corpus and discard every row with a missing value.
# The index is rebuilt *after* the drop so the surviving rows are numbered
# contiguously (0..n-1); resetting before the drop would leave gaps.
df = (
    pd.read_csv('emails_cleaned.csv')
    .dropna()
    .reset_index(drop=True)
)
df.head(5)

Unnamed: 0,file,subject,subject_length,text,text_length,class,subject_stripped,text_stripped,subject_lemmatized,text_lemmatized,text_no_stops,subject_no_stops,topic_number,topic
0,9-208msg1.txt,"a workshop on text , speech and dialog ( tsd '...",52,first announcement and call for papers a works...,6327,0,a workshop on text speech and dialog tsd 98,first announcement and call for papers a works...,a workshop on text speech and dialog tsd 98,first announcement and call for paper a worksh...,announcement paper workshop text speech dia...,workshop text speech dialog tsd 98,2,science
1,6-863msg1.txt,re : toc,9,"dear colleague , i would like to send you the ...",1265,0,re toc,dear colleague i would like to send you the f...,re toc,dear colleague i would like to send -PRON- t...,dear colleague like send -PRON- follow anno...,toc,1,language
2,spmsga152.txt,free internet services & unique shopping,41,here 's a great directory for free and interes...,553,1,free internet services unique shopping,here s a great directory for free and interest...,free internet service unique shopping,here s a great directory for free and interest...,s great directory free interesting internet si...,free internet service unique shopping,3,money
3,6-977msg1.txt,summary,8,"dear all , i send you a summary of the answers...",6793,0,summary,dear all i send you a summary of the answers ...,summary,dear all i send -PRON- a summary of the answ...,dear send -PRON- summary answer -PRON- quer...,summary,1,language
4,8-1096msg1.txt,summary : vowel deletion between two like cons...,53,"quite some time ago , i wrote requesting infor...",5793,0,summary vowel deletion between two like conso...,quite some time ago i wrote requesting inform...,summary vowel deletion between two like cons...,quite some time ago i write request informat...,time ago write request information concern ...,summary vowel deletion like consonant,1,language


In [3]:
# (rows, columns) of the frame after rows with missing values were dropped.
df.shape

(2809, 14)

In [4]:
# Restrict the frame to the columns used for modelling: the two length
# features, the lemmatized subject and body, the topic label and the target.
model_columns = [
    'subject_length',
    'text_length',
    'subject_lemmatized',
    'text_lemmatized',
    'topic',
    'class',
]
emails = df[model_columns]
emails.head(5)

Unnamed: 0,subject_length,text_length,subject_lemmatized,text_lemmatized,topic,class
0,52,6327,a workshop on text speech and dialog tsd 98,first announcement and call for paper a worksh...,science,0
1,9,1265,re toc,dear colleague i would like to send -PRON- t...,language,0
2,41,553,free internet service unique shopping,here s a great directory for free and interest...,money,1
3,8,6793,summary,dear all i send -PRON- a summary of the answ...,language,0
4,53,5793,summary vowel deletion between two like cons...,quite some time ago i write request informat...,language,0


In [5]:
# (rows, columns) of the reduced modelling frame.
emails.shape

(2809, 6)

## Dummy Encode Topics

In [6]:
# One-hot encode the categorical `topic` column into `topic_<value>` indicator
# columns; every other column is passed through unchanged.
encoded = pd.get_dummies(emails, columns=['topic'])
encoded.head(5)

Unnamed: 0,subject_length,text_length,subject_lemmatized,text_lemmatized,class,topic_Academics,topic_administration,topic_language,topic_money,topic_science
0,52,6327,a workshop on text speech and dialog tsd 98,first announcement and call for paper a worksh...,0,0,0,0,0,1
1,9,1265,re toc,dear colleague i would like to send -PRON- t...,0,0,0,1,0,0
2,41,553,free internet service unique shopping,here s a great directory for free and interest...,1,0,0,0,1,0
3,8,6793,summary,dear all i send -PRON- a summary of the answ...,0,0,0,1,0,0
4,53,5793,summary vowel deletion between two like cons...,quite some time ago i write request informat...,0,0,0,1,0,0


## Normalise Numeric Features

In [7]:
# Min-max scale both length columns into [0, 1] as (x - min) / (max - min).
# The minimum and range are computed once per column and applied as a single
# vectorised expression, instead of being recomputed for every row inside a
# per-element lambda. Note that the statistics are taken over the whole frame,
# i.e. before the train/test split performed further down.
for source_col, target_col in (
    ('subject_length', 'subject_normalised'),
    ('text_length', 'text_normalised'),
):
    col_min = encoded[source_col].min()
    col_range = encoded[source_col].max() - col_min
    encoded[target_col] = (encoded[source_col] - col_min) / col_range

In [8]:
# Inspect the frame with the two new normalised length columns appended.
encoded.head(5)

Unnamed: 0,subject_length,text_length,subject_lemmatized,text_lemmatized,class,topic_Academics,topic_administration,topic_language,topic_money,topic_science,subject_normalised,text_normalised
0,52,6327,a workshop on text speech and dialog tsd 98,first announcement and call for paper a worksh...,0,0,0,0,0,1,0.3125,0.220383
1,9,1265,re toc,dear colleague i would like to send -PRON- t...,0,0,0,1,0,0,0.04375,0.043588
2,41,553,free internet service unique shopping,here s a great directory for free and interest...,1,0,0,0,1,0,0.24375,0.01872
3,8,6793,summary,dear all i send -PRON- a summary of the answ...,0,0,0,1,0,0,0.0375,0.236658
4,53,5793,summary vowel deletion between two like cons...,quite some time ago i write request informat...,0,0,0,1,0,0,0.31875,0.201732


## Test using Numeric Features only 

### Train test split 

In [9]:
# Target vector: the class label of every e-mail.
y = encoded['class']

# Feature matrix for the numeric-only experiment: drop the raw text columns,
# the target itself, and the un-normalised lengths. `drop` returns a new frame,
# so `encoded` is left intact and this cell can be re-run without a KeyError.
X = encoded.drop(
    columns=['subject_lemmatized', 'text_lemmatized', 'class', 'subject_length', 'text_length']
)

In [10]:
# Preview the numeric feature matrix (topic indicators + normalised lengths).
X.head()

Unnamed: 0,topic_Academics,topic_administration,topic_language,topic_money,topic_science,subject_normalised,text_normalised
0,0,0,0,0,1,0.3125,0.220383
1,0,0,1,0,0,0.04375,0.043588
2,0,0,0,1,0,0.24375,0.01872
3,0,0,1,0,0,0.0375,0.236658
4,0,0,1,0,0,0.31875,0.201732


In [11]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=y` is passed although the classes are imbalanced — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

### Logistic Regression

In [12]:
# Logistic regression using the L-BFGS solver; no other hyper-parameter is set explicitly.
log_reg = LogisticRegression(solver='lbfgs')

In [13]:
# Fit on the training split, then predict the held-out rows.
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

In [14]:
# Compute the evaluation metrics for the logistic-regression predictions first,
# then print them in a fixed order.
accuracy = metrics.accuracy_score(y_test, y_pred)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"CLassification Report:\n{report}")

Accuracy: 0.9012455516014235
Balanced Accuracy: 0.9365541260109951
Confusion Matrix:
[[830 109]
 [  2 183]]
CLassification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94       939
           1       0.63      0.99      0.77       185

   micro avg       0.90      0.90      0.90      1124
   macro avg       0.81      0.94      0.85      1124
weighted avg       0.94      0.90      0.91      1124



As would be expected, the classifier predicts the majority class (ham) with much greater precision, making only two false negative predictions. There are, however, a large number of false positive predictions (109 ham emails flagged as spam). This suggests that there is not really a classifier bias towards the majority class, which is further supported by the 0.99 recall for the positive class.

### Gradient Boosting

In [15]:
# Small gradient-boosted ensemble (10 trees). The seed is fixed, as for the
# other seeded estimators in this notebook, so repeated runs give the same model.
grad_boost = GradientBoostingClassifier(n_estimators=10, random_state=42)

In [16]:
# Fit the boosted ensemble on the training split, then predict the held-out rows.
grad_boost.fit(X_train, y_train)
y_pred = grad_boost.predict(X_test)

# Evaluate the predictions.
accuracy = metrics.accuracy_score(y_test, y_pred)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"CLassification Report:\n{report}")

Accuracy: 0.8718861209964412
Balanced Accuracy: 0.6845983363555249
Confusion Matrix:
[[905  34]
 [110  75]]
CLassification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.93       939
           1       0.69      0.41      0.51       185

   micro avg       0.87      0.87      0.87      1124
   macro avg       0.79      0.68      0.72      1124
weighted avg       0.86      0.87      0.86      1124



The Gradient Boosting model, when compared to the Logistic Regression model, achieves a lower Accuracy and Balanced Accuracy. There appears to be a much greater classifier bias towards the majority class, as both the False Negative and True Negative predictions have increased.

On the other hand, the positive class is predicted fewer times but with greater precision.

### SVC

In [17]:
# Support-vector classifier with the library's default kernel, gamma='auto' and a fixed seed.
svm = SVC(gamma='auto',random_state=42)

In [18]:
# Fit the SVC on the training split, then predict the held-out rows.
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

# Evaluate the predictions.
accuracy = metrics.accuracy_score(y_test, y_pred)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"CLassification Report:\n{report}")

Accuracy: 0.9012455516014235
Balanced Accuracy: 0.9365541260109951
Confusion Matrix:
[[830 109]
 [  2 183]]
CLassification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94       939
           1       0.63      0.99      0.77       185

   micro avg       0.90      0.90      0.90      1124
   macro avg       0.81      0.94      0.85      1124
weighted avg       0.94      0.90      0.91      1124



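The SVM is clearly biased towards the majority class, with a huge jump in the number of False Negatives and True Negatives. The True Positives have also greatly dropped, with a recall of only 0.06. (Note: the output shown above reports a recall of 0.99 and is identical to the Logistic Regression result, so this paragraph appears to describe a different run and should be re-checked against the current output.)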

### Linear SVC

In [19]:
# Linear support-vector classifier with a fixed seed; the iteration cap is
# raised to 10000 (presumably to give the solver room to converge — confirm).
svm_lin = LinearSVC(random_state=42,max_iter=10000)

In [20]:
# Fit the linear SVC on the training split, then predict the held-out rows.
svm_lin.fit(X_train, y_train)
y_pred = svm_lin.predict(X_test)

# Evaluate the predictions.
accuracy = metrics.accuracy_score(y_test, y_pred)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"CLassification Report:\n{report}")

Accuracy: 0.9012455516014235
Balanced Accuracy: 0.9365541260109951
Confusion Matrix:
[[830 109]
 [  2 183]]
CLassification Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.94       939
           1       0.63      0.99      0.77       185

   micro avg       0.90      0.90      0.90      1124
   macro avg       0.81      0.94      0.85      1124
weighted avg       0.94      0.90      0.91      1124



The Linear SVC is much less biased toward the majority class than the SVC model is. 

## Analysis of models
Overall, the logistic regression model worked the best, even without any hyperparameter tuning. The SVC model was the least accurate at predicting spam.

## Feature Selection

### Filter-based Feature Selection using Information Gain
Information gain refers to the reduction of entropy calculated for a given dataset. By maximising the information gain it allows the data to be split into the most optimal datasets, thus reducing the entropy of the dataset, which in turn leads to a good quality classification.

The filter-based feature selection of the dataset obtains the individual information gain associated with each of the features. The information gain values will be used to rank the features based on the level of improvement to entropy. This method is useful for finding those features that will likely aid in the prediction process and those that may hinder it. The one drawback to this process is that it is classifier-independent, so it does not consider an individual classifier's biases towards certain features (Brownlee, 2019).

Brownlee, J. (2019). Information Gain and Mutual Information for Machine Learning. [online] Machine Learning Mastery. Available at: https://machinelearningmastery.com/information-gain-and-mutual-information/

In [21]:
# Score every training feature by its mutual information with the class label.
i_scores = mutual_info_classif(X_train, y_train, random_state=42)

# Feature name -> information-gain score lookup.
mi = dict(zip(X_train.columns, i_scores))

# Tabulate the scores with one row per feature, highest information gain first.
df = pd.DataFrame(
    {"I-Gain": i_scores}, index=list(X_train.columns)
).sort_values(by=["I-Gain"], ascending=False)
df.head(12)

Unnamed: 0,I-Gain
topic_money,0.274075
topic_language,0.083048
text_normalised,0.040962
topic_administration,0.038768
topic_science,0.026125
subject_normalised,0.014895
topic_Academics,0.006878


In [22]:
# Feature names ordered from highest to lowest information gain; equal scores
# are ordered by feature name, also descending.
i_gain = sorted(mi, key=lambda name: (mi[name], name), reverse=True)

In [25]:
# Carve a validation set out of the training data so that the held-out test
# set is never consulted while choosing how many features to keep.
X_train_ig, X_test_ig, y_train_ig, y_test_ig = train_test_split(
    X_train, y_train, random_state=42
)


def _seeded_mutual_info(features, target):
    """Mutual-information score function with a fixed seed.

    SelectKBest calls its score function as ``score_func(X, y)``, so the seed
    is bound here; without it the estimate (and therefore the selected
    features) can change between runs.
    """
    return mutual_info_classif(features, target, random_state=42)


# Validation accuracy of the logistic regression when it is given only the k
# highest-scoring features, for k = 1 .. number of features.
acc_scores = []
for kk in range(1, X_train.shape[1] + 1):
    FS_trans = SelectKBest(_seeded_mutual_info, k=kk).fit(X_train_ig, y_train_ig)
    X_tR_new = FS_trans.transform(X_train_ig)
    X_tS_new = FS_trans.transform(X_test_ig)
    fitted_log_reg = log_reg.fit(X_tR_new, y_train_ig)
    y_dash = fitted_log_reg.predict(X_tS_new)
    acc = metrics.accuracy_score(y_test_ig, y_dash)
    acc_scores.append(acc)

# Row i of the table receives the accuracy obtained with the top (i + 1)
# features, not the accuracy of that single feature on its own.
df["Accuracy"] = acc_scores
df.head(10)

Unnamed: 0,I-Gain,Accuracy
topic_money,0.274075,0.93128
topic_language,0.083048,0.93128
text_normalised,0.040962,0.93128
topic_administration,0.038768,0.93128
topic_science,0.026125,0.93128
subject_normalised,0.014895,0.93128
topic_Academics,0.006878,0.93128


In [28]:
# Obtain the accuracies achieved when adding each feature, one at a time, in
# descending order of information gain.
train = []        # features included so far
accuracies = []   # validation accuracy after each addition
best = 0
best_idx = 0

# The row partition produced by train_test_split depends only on the number of
# rows and the seed, not on which columns are present, so the split is done
# once with every ranked column and the columns are sliced inside the loop.
X_tr_ig = X_train[i_gain]
X_train_ig, X_test_ig, y_train_ig, y_test_ig = train_test_split(
    X_tr_ig, y_train, random_state=42
)

for i, feature in enumerate(i_gain):
    train.append(feature)

    y_pred = log_reg.fit(X_train_ig[train], y_train_ig).predict(X_test_ig[train])
    # accuracy_score expects the ground truth first, then the predictions.
    ig_acc = metrics.accuracy_score(y_test_ig, y_pred)
    accuracies.append(ig_acc)
    print(f"Features: {i+1}, {ig_acc}")

    # ">=" resolves ties in favour of the larger feature set.
    if ig_acc >= best:
        best = ig_acc
        best_idx = i + 1


# The leading `best_idx` ranked features gave the best validation accuracy.
best_features = train[:best_idx]

Features: 1, 0.9312796208530806
Features: 2, 0.9312796208530806
Features: 3, 0.9312796208530806
Features: 4, 0.9312796208530806
Features: 5, 0.9312796208530806
Features: 6, 0.9312796208530806
Features: 7, 0.9312796208530806


In [None]:
# Display the best features obtained using information gain.
best_features

## Testing using  Single Text features
### Train test split

In [None]:
# Rebuild the dummy-encoded frame from `emails` so that it again contains the
# text and class columns needed by the text-based experiments below.
encoded = pd.get_dummies(emails, columns=['topic'])
encoded.head(3)

In [None]:
# Target: the class label. Single feature: the lemmatized body text.
y = emails['class']
X = emails['text_lemmatized']

In [None]:
# Preview the raw text feature that will be vectorised.
X.head(3)

In [None]:
# Same 60/40 split and seed as the numeric experiment.
# NOTE(review): the split is not stratified although the classes are imbalanced — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

In [None]:
# Check that the training features and labels have the same number of rows.
X_train.shape,y_train.shape

### Build a Pipeline

In [None]:
# TF-IDF vectorise the e-mail body and feed the sparse matrix to a linear SVM.
# `Pipeline` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here. The classifier is seeded like the
# earlier LinearSVC so repeated runs give the same model.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC(random_state=42)),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

In [None]:
# Predict the class of each held-out e-mail from its body text.
y_pred = text_clf.predict(X_test)

In [None]:
# Compute the evaluation metrics for the text-pipeline predictions first, then
# print them in a fixed order.
accuracy = metrics.accuracy_score(y_test, y_pred)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"CLassification Report:\n{report}")

When using the text feature as a sparse matrix, the results are extremely good for both the ham and spam emails, with a total of 8 emails being incorrectly classified. The precision and recall for both the majority and minority class are very good.

## Testing using Multiple Text features

In [None]:
# Target is the class label; the feature frame is every remaining column.
y = encoded['class']
X = encoded.drop(columns='class')
X.head(3)

In [None]:
# Same 60/40 split and seed as the earlier experiments, now on the full feature frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

### Feature Union

In [None]:
def _tfidf_for_column(column):
    """Build a pipeline that extracts `column` from a DataFrame and TF-IDF vectorises it."""
    return Pipeline([
        ('extract_field', FunctionTransformer(lambda x: x[column], validate=False)),
        ('tfidf', TfidfVectorizer()),
    ])


# Vectorise the subject and the body separately and concatenate the two sparse
# TF-IDF matrices side by side. The step names are kept exactly as they were.
transformer = FeatureUnion([
    ('search_term_tfidf', _tfidf_for_column('subject_lemmatized')),
    ('product_title_tfidf', _tfidf_for_column('text_lemmatized')),
])


In [None]:
# Combined subject + body TF-IDF features feeding a linear SVM. The classifier
# is seeded like the earlier LinearSVC so repeated runs give the same model.
text_clf = Pipeline([('transformer', transformer),
                     ('clf', LinearSVC(random_state=42)),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the combined subject + body pipeline.
y_pred = text_clf.predict(X_test)

# Evaluate the predictions.
accuracy = metrics.accuracy_score(y_test, y_pred)
balanced_accuracy = metrics.balanced_accuracy_score(y_test, y_pred)
confusion = metrics.confusion_matrix(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"CLassification Report:\n{report}")

When both text and subject are vectorized there is a small reduction in the accuracy of the model. This reduction would suggest that using the text alone is preferable, as it also produces a smaller matrix, which will allow for faster predictions.