## Model Building and Ensemble Voting Classifier

### Overview
In this file, I focused on building machine learning models for SMS spam classification. The tasks included:

1. **Pipeline Creation**
   - Created a scikit-learn pipeline for text preprocessing and model training to streamline the workflow.

2. **Model Comparison**
   - Implemented Naive Bayes (NB), Support Vector Classifier (SVC), and Logistic Regression (LR) models.
   - Used TFIDF, BOW (Bag-of-Words), and NGrams vectorizers to convert text data into numerical features.
   - Compared the performance of these models using appropriate evaluation metrics.

3. **Ensemble Voting Classifier**
   - Combined multiple models into an ensemble voting classifier to improve prediction accuracy.
   - Used the ensemble classifier to make predictions based on aggregated results from individual models.

4. **Model Export**
   - Exported the final trained model for future use or deployment.



In [1]:
import numpy as np
import pandas as pd

In [12]:
from tqdm.auto import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

import nltk
import re, string
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [87]:
df = pd.read_csv("data/spam_cleaned.csv")
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [88]:
df["target"] = df["target"].replace({
    "ham": 0,
    "spam": 1
})

  df["target"] = df["target"].replace({


In [89]:
df.sample()

Unnamed: 0,target,text
212,0,Finally the match heading towards draw as your...


## Text Preprocessor

In [90]:
def text_preprocessing(x):
    x_copy = x.copy()

    stemmer = PorterStemmer()
    stopwords = nltk.corpus.stopwords.words('english')    
    def preprocess(text):
        # Convert to lower case
        text = text.lower()        
        # keep only english chars / remove numbers from ducuments
        text = re.sub(r'[^a-zA-Z+]', ' ', text)        
        # Removing puctuations from all ducuments
        text = "".join([char for char in text if char not in string.punctuation])   
        
        # Remove stop words and stemming
        text = " ".join([stemmer.stem(word) for word in nltk.word_tokenize(text) if word not in stopwords and len(word) > 2])   
        
        # Removing repeated/leading/trailing spaces
        text = re.sub("\s[\s]+", " ",text).strip()
        return text

    x_copy = x_copy.apply(preprocess)
    return x_copy

TextPreprocess = FunctionTransformer(text_preprocessing)

In [91]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import make_scorer, precision_score, accuracy_score

## Comparing models with Different Vectorizers

In [92]:
# Vectorizers
vectorizers = {
    "tf_idf": TfidfVectorizer(max_features=3000),
    "bow": CountVectorizer(max_features=3000),
    "n_grams": CountVectorizer(max_features=3000, ngram_range=(1,2))
}
# Models
models = {
    "gnb": GaussianNB(),
    "mnb": MultinomialNB(),
    "bnb": BernoulliNB(),
    "svc": SVC(kernel='sigmoid', gamma=1.0),
    "lr": LogisticRegression(solver='liblinear', penalty='l1')
}

In [93]:
res = []
# Define scoring metrics
scoring = {'accuracy': 'accuracy', 'precision': 'precision'}

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

pbar = tqdm(total=len(vectorizers) * len(models))
for vectorizer_name, vectorizer in vectorizers.items():
    
    for model_name, model in models.items():
        pbar.set_description(f"{vectorizer_name} - {model_name}")
        pbar.update(1)
        
        pipeline = Pipeline([
            ('text_preprocessing', TextPreprocess),
            (vectorizer_name, vectorizer),
            ("dense_array", FunctionTransformer(lambda x: x.toarray() if hasattr(x, 'toarray') else x)),
            (model_name, model)
        ])
        try:
            cv_results = cross_validate(pipeline, X_train, y_train, cv=4, scoring=scoring, return_train_score=False)
            res.append([vectorizer_name, model_name, cv_results['test_accuracy'].mean(), cv_results["test_precision"].mean()])
        except ValueError as e:
            print(f"Error with vectorizer {vectorizer_name} and model {model_name} - {e}")
pbar.close()

result = pd.DataFrame(res, columns=["vectorizer", "model", "accuracy", "precision"])

  0%|          | 0/15 [00:00<?, ?it/s]

## Training Result

In [94]:
result.set_index(["model", "vectorizer"]).unstack().style.background_gradient()

Unnamed: 0_level_0,accuracy,accuracy,accuracy,precision,precision,precision
vectorizer,bow,n_grams,tf_idf,bow,n_grams,tf_idf
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bnb,0.974607,0.970253,0.974607,0.980929,0.992268,0.980929
gnb,0.846672,0.840384,0.84377,0.43741,0.42849,0.430129
lr,0.974363,0.974122,0.955499,0.960876,0.961011,0.921487
mnb,0.978961,0.980411,0.961306,0.920678,0.946054,0.986047
svc,0.926965,0.92745,0.973397,0.725954,0.725772,0.956315


In [98]:
result.sort_values("precision", ascending=False)

Unnamed: 0,vectorizer,model,accuracy,precision
12,n_grams,bnb,0.970253,0.992268
1,tf_idf,mnb,0.961306,0.986047
2,tf_idf,bnb,0.974607,0.980929
7,bow,bnb,0.974607,0.980929
14,n_grams,lr,0.974122,0.961011
9,bow,lr,0.974363,0.960876
3,tf_idf,svc,0.973397,0.956315
11,n_grams,mnb,0.980411,0.946054
4,tf_idf,lr,0.955499,0.921487
6,bow,mnb,0.978961,0.920678


## Ensembling

In [123]:
def to_array(x):
    return x.toarray() if hasattr(x, "toarray") else x

ToArray = FunctionTransformer(to_array)

def create_pipeline(vecorizer, model):
    return Pipeline([
        ('text_preprocessing', TextPreprocess),
        ("vectorizer", vectorizer),
        ("dense_array", ToArray),
        ("model", model)
    ])

1. **BNB with NGrams**
2. **MNB with TFIDF**
3. **SVC with TFIDF**

In [124]:
n_grams_bnb = create_pipeline(
    CountVectorizer(max_features=3000, ngram_range=(1,2)), BernoulliNB()
)
tfidf_mnb = create_pipeline(
    TfidfVectorizer(max_features=3000), MultinomialNB()
)
tfidf_svc = create_pipeline(
    TfidfVectorizer(max_features=3000), SVC(kernel='sigmoid', gamma=1.0,probability=True)
)

In [125]:
estimators = [("bnb", n_grams_bnb), ('svc', tfidf_svc), ('mnb', tfidf_mnb)]

## Voting Classfifier

In [126]:
from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=estimators, voting='soft')
voting.fit(X_train,y_train)

## Results on unseen data

In [127]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

Accuracy 0.9796905222437138
Precision 0.984375


## Exporting

In [128]:
import pickle
pickle.dump(voting, open('data/model.pkl','wb'))