In [1]:
import pandas as pd
import re
import nltk
# Fetch the NLTK stopword corpus used later by the preprocessing cell
nltk.download('stopwords')
# Read the labelled spam/ham messages and preview the first rows
data = pd.read_csv("spamham.csv")
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\etern\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Label,Message,label_num,message,class
0,605.0,ham,Subject: enron methanol ; meter # : 988291\nth...,0.0,,
1,2349.0,ham,"Subject: hpl nom for january 9 , 2001\n( see a...",0.0,,
2,3624.0,ham,"Subject: neon retreat\nho ho ho , we ' re arou...",0.0,,
3,4685.0,spam,"Subject: photoshop , windows , office . cheap ...",1.0,,
4,2030.0,ham,Subject: re : indian springs\nthis deal is to ...,0.0,,


In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stopword lookup once. stopwords.words() returns a fresh list on
# every call, so calling it per token (as a list membership test) is very slow.
english_stopwords = set(stopwords.words('english'))

# Optional: Drop rows with missing values in 'Message'
data = data.dropna(subset=['Message'])

# Make sure all values are strings
data['Message'] = data['Message'].astype(str)

corpus = []
# Iterate over the values directly instead of data['Message'][i]: that form is
# a label lookup, which raises KeyError (or skips rows) as soon as dropna
# leaves gaps in the index.
for message in data['Message']:
    review = re.sub('[^a-zA-Z]', ' ', message)             # Remove non-letters
    review = review.lower()                                # Lowercase
    review = review.split()                                # Tokenize

    # Remove stopwords and apply stemming
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))                        # Re-join words


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the corpus, then count
# term occurrences per document and densify the result.
cv = CountVectorizer()
cv.fit(corpus)
x = cv.transform(corpus).toarray()

In [5]:
# Inspect the dense document-term count matrix produced by CountVectorizer
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# Encode the text labels as integers: ham -> 0, spam -> 1
y = data['Label'].map(dict(ham=0, spam=1))

In [7]:
# Inspect the numeric labels (0 = ham, 1 = spam)
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: Label, Length: 5171, dtype: int64

## Grid Search

- Why do we use Grid Search?

`GridSearchCV` searches a given grid of candidate hyperparameter values for the best combination, scoring each combination with cross-validation. We supply the model and the parameter grid; the best parameter values are then extracted and used to make predictions.

## Select the best model
- Here we have a shortlist of well-known text classification algorithms. We will now compare each model's score and see which one performs better than the others.

### 1. Multinomial Naive Bayes Classifier

The multinomial NB classifier has a hyperparameter called **`alpha`**. It is the **smoothing parameter** to avoid **zero counts** when calculating the frequencies. 

For example, if we are classifying a new message containing the word "ryan", which never appears in the spam emails of our training dataset, the **likelihood** of this word given the spam class will be zero. This will cause the **overall likelihood** for that class to be zero (because we take the product of all **individual likelihoods**), no matter what other words the message contains.

Therefore, we need to add **additional counts** to each word when calculating the frequencies, to avoid a zero likelihood value. **Alpha** indicates how many **additional counts** we add.

### 2. Gaussian Naive Bayes Classifier

There is one hyperparameter we need to tune: **`var_smoothing`**. This is the **portion of the largest variance** of all features that is added to variances for **calculation stability**.

### 3. SVC
SVC, or Support Vector Classifier, is a supervised machine learning algorithm typically used for classification tasks. SVC works by mapping data points to a high-dimensional space and then finding the optimal hyperplane that divides the data into two classes.

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Candidate classifiers keyed by display name, all with default hyperparameters
models = dict([
    ("Multinomial Naive Bayes", MultinomialNB()),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("SVC", SVC()),
])

- ### We will create a generic function that evaluates each model's performance so that we can compare them

In [9]:
# Create a function which can evaluate models and return a report 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
def evaluate_models(X, y, models, test_size=0.2, random_state=42):
    '''
    Fit every model on one shared train/test split and report test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded to ``train_test_split``.
    y : array-like
        Target labels aligned with the rows of ``X``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place on the training split.
    test_size : float, default 0.2
        Fraction of the samples held out for testing.
    random_state : int, default 42
        Seed for the split, so repeated calls compare models on the same rows.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the iteration order of ``models``, with the
        columns ``Model_name`` and ``Score`` (accuracy on the test split).

    The name and score of each model are also printed as they are computed.
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    results = []
    for model_name, model in models.items():
        model.fit(X_train, y_train)  # Train model

        # Make predictions on the held-out split and score them
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)

        print(f'---- score for --- {model_name} ----')
        print(f"{score}")
        results.append((model_name, score))

    print()

    # Explicit columns keep the report schema stable even when `models` is empty
    return pd.DataFrame(results, columns=['Model_name', 'Score'])

In [10]:
# Benchmark every candidate classifier on the same train/test split
report = evaluate_models(X=x, y=y, models=models)

---- score for --- Multinomial Naive Bayes ----
0.9777777777777777
---- score for --- Gaussian Naive Bayes ----
0.9526570048309179
---- score for --- SVC ----
0.9594202898550724



In [11]:
# Rank the models by accuracy in ascending order, so the best one is the last row
report.sort_values(by='Score')

Unnamed: 0,Model_name,Score
1,Gaussian Naive Bayes,0.952657
2,SVC,0.95942
0,Multinomial Naive Bayes,0.977778


- ### From the report above we can see that the Multinomial Naive Bayes model performed the best, so we will continue training our model using Multinomial Naive Bayes algorithm.

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Candidate smoothing strengths: ten evenly spaced values from 0.1 to 1.0
params = {'alpha': np.linspace(0.1, 1.0, num=10)}

# 5-fold cross-validated grid search over alpha on the training split only
mnb_model = MultinomialNB()
mnb_cv = GridSearchCV(estimator=mnb_model, param_grid=params, cv=5)
mnb_cv.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best alpha
print("tuned hpyerparameters :(best parameters) ",mnb_cv.best_params_)
print("accuracy :",mnb_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'alpha': 0.1}
accuracy : 0.9779971259835621


In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Refit Multinomial NB on the training split with the grid-search winner
spam_detect_model = MultinomialNB(**mnb_cv.best_params_).fit(X_train, y_train)

# Score the refitted model on the held-out test split
y_pred = spam_detect_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_m = confusion_matrix(y_test, y_pred)

print(f"Accuracy of the model is {accuracy}")
print(f"The confusion matrix is: \n{confusion_m}")



Accuracy of the model is 0.9768115942028985
The confusion matrix is: 
[[728  14]
 [ 10 283]]


- So we can see that the model performed well: it reaches an accuracy of about 97.7% on the test set, which is very good. In our project we will keep all of these models and select among them based on their performance.