## Import Libraries

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from string import punctuation
import numpy as np
from sklearn import preprocessing

## Load data 

In [2]:
path= r"E:\Machine Learning\data\Spam_data.csv"

spam_data=pd.read_csv(path)

In [3]:
spam_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
spam_data.count()

Category    5572
Message     5572
dtype: int64

## Separate Input Variable (also called independent variable (X)) and dependent variable (y)

In [5]:
X,y = spam_data['Message'].to_frame(),spam_data['Category'].to_frame()

In [6]:
X.head()

Unnamed: 0,Message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
X.count()

Message    5572
dtype: int64

In [8]:
y.head()

Unnamed: 0,Category
0,ham
1,ham
2,spam
3,ham
4,ham


In [9]:
y.count()

Category    5572
dtype: int64

## Text Preprocessing Pipeline

In [10]:
lemmatizer = WordNetLemmatizer()

def pre_process_message(message):
    
    # tokenize the message
    words = nltk.word_tokenize(message)
    
    # remove stopwords
    cleaned_words = [word for word in words if word not in set(stopwords.words('english'))]
    
    # remove special characters
    filtered_words = [word for word in cleaned_words if word not in set(punctuation)]
    
    # remove words with length<=3
    
    pruned_words = [word for word in filtered_words if len(words)>3]
    
    # lemmatize the tokens
    
    lemmas = [lemmatizer.lemmatize(word) for word in pruned_words]
    
    return ' '.join(lemmas)


#message='spam classifier is a huge pipeline. The pipeline identification number is 23456. We need to complete it tomorrow'

#pre_process_message(message)

X['processed_sentences']=X['Message'].map(pre_process_message)
    

In [11]:
X.count()

Message                5572
processed_sentences    5572
dtype: int64

In [12]:
y.count()

Category    5572
dtype: int64

## Split the data into training and test sets.


X_train, y_train will be used for training the model.
X_test will be used for calculating predictions on unseen data.
y_test will be used to evaluate the model.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X['processed_sentences'],y,test_size=0.3,random_state=0)

## Feature Engineering

### 1. Convert processed sentences to vectors

In [14]:


tfidf_vect = TfidfVectorizer(max_features=1500,min_df=5,max_df=0.7)
tfidf_model=tfidf_vect.fit(X['processed_sentences'])


X_train_vectors = tfidf_model.transform(X_train)
X_test_vectors = tfidf_model.transform(X_test)


In [15]:
X_train_vectors

<3900x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 28761 stored elements in Compressed Sparse Row format>

### 2. Convert class labels to indexes

In [16]:
le = preprocessing.LabelEncoder()
le.fit(y)

y_train_indexes = le.transform(y_train)
y_test_indexed = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [17]:
le.classes_

array(['ham', 'spam'], dtype=object)

In [18]:
le.inverse_transform([0,1])

array(['ham', 'spam'], dtype=object)

In [19]:
y_train_indexes

array([0, 0, 0, ..., 1, 0, 0])

## Model Building

In [20]:
# We will use RandomForrest Classifier

classifier = RandomForestClassifier(n_estimators=100, random_state=0)

classifier = classifier.fit(X_train_vectors,y_train_indexes)

In [21]:

# run predictions on unseen data

y_pred=classifier.predict(X_test_vectors)

## Evaluate the model

In [22]:
print(confusion_matrix(y_test_indexed,y_pred))  
print(classification_report(y_test_indexed,y_pred))  
print(accuracy_score(y_test_indexed, y_pred)) 

[[1449    2]
 [  29  192]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1451
           1       0.99      0.87      0.93       221

    accuracy                           0.98      1672
   macro avg       0.99      0.93      0.96      1672
weighted avg       0.98      0.98      0.98      1672

0.9814593301435407


## Save the models

1. Vectorizer

In [23]:
with open(r'E:\Personal\ML_Journey\model\vectorizer.pkl', 'wb') as model:  
    pickle.dump(tfidf_model,model)

2. Classifier

In [24]:
with open(r'E:\Personal\ML_Journey\model\spam_classifier.pkl', 'wb') as model:  
    pickle.dump(classifier,model)

## Load Models and test them.

In [25]:
# we use pickel library

import pickle

vectorizer_path=r'E:\Personal\ML_Journey\model\vectorizer.pkl'
classifier_path=r'E:\Personal\ML_Journey\model\spam_classifier.pkl'


vectorizer = pickle.load(open(vectorizer_path, 'rb'))

classifier = pickle.load(open(classifier_path, 'rb'))

In [26]:
processed_text = pre_process_message("Exciting offer!!!! Hurray you have won 10000 dollars. Click here to claim your reward.")

In [27]:
processed_text

'Exciting offer Hurray 10000 dollar Click claim reward'

In [28]:
vectorized_msg = vectorizer.transform([processed_text])

In [29]:
vectorized_msg

<1x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [30]:
prediction = classifier.predict(vectorized_msg)

In [31]:
prediction

array([1])