# Making a Spam classifier

    - Data source from UCI Datasource repository

In [1]:
import pandas as pd
import numpy as np

### Reading data:

In [None]:
messages_dataframe = pd.read_table(
    '../input/spam_collection', header=None, encoding='utf-8')


# Yeta column rename gare huncha, rename from 0, 1 to classes and messages ...
classes = messages_dataframe[0]
text_messages = messages_dataframe[1]
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

### Preprocessing the data:

    - Encoding it to classes. (0 -> Ham, 1 -> Spam)
    - Removing things like email, phone numbers, links, punctuations etc etc
    - Making everything to lower case.

In [22]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoded_classes = encoder.fit_transform(classes)

processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                      'emailaddress')
processed = processed.str.replace(
    r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')
processed = processed.str.replace(r'£|\$', 'moneysymb')
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')
processed = processed.str.replace(r'[^\w\d\s]', ' ')
processed = processed.str.replace(r'\s+', ' ')
processed = processed.str.replace(r'^\s+|\s+?$', '')
processed = processed.str.lower()

### Removing Stopwords:

    

In [4]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
processed = processed.apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))

### Stemming:


In [5]:
from nltk import PorterStemmer

stemmer = PorterStemmer()
processed = processed.apply(
    lambda x: ' '.join(stemmer.stem(word) for word in x.split()))

### Feature Engineering:

> Features haru nikalney messages bata, Hamro case ma euta word euta feature huncha ... Also , called **Feature Extraction** I think.

In [6]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

all_words = []

for message in processed:
    words = word_tokenize(message)
    for word in words:
        all_words.append(word)

all_words = FreqDist(all_words)

word_features = list(all_words.keys())[:1500]

In [7]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features


messages = list(zip(processed, classes))
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

feature_sets = [(find_features(text), label) for (text, label) in messages]

### Dividing the feature set into training and testing  set:


In [8]:
from sklearn.model_selection import train_test_split

features_train, features_test = train_test_split(
    feature_sets, test_size=0.3, random_state=seed)


### Classifiers part:

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [10]:
classifier_names = [
    'Multinomial Naive Bayes', 'SVM Linear', 'Decision Trees',
    'K Nearest Neighbours', 'Random Forest', 'AdaBoost', 'Logistic Regressor',
    'SGD Classifier'
]

classifiers = [
    MultinomialNB(),
    SVC(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100)
]

all_models = zip(classifier_names, classifiers)

#### Wrapping all models into SKLearn Classifier:


In [11]:
import nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in all_models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(features_train)
    accuracy = nltk.classify.accuracy(nltk_model, features_test) * 100
    print("{} Accuracy: {}".format(name, accuracy))

Multinomial Naive Bayes Accuracy: 97.66746411483254




SVM Linear Accuracy: 86.96172248803828
Decision Trees Accuracy: 95.75358851674642
K Nearest Neighbours Accuracy: 95.51435406698565




Random Forest Accuracy: 97.36842105263158
AdaBoost Accuracy: 98.02631578947368




Logistic Regressor Accuracy: 98.20574162679426




SGD Classifier Accuracy: 97.96650717703349


#### Using Voting Classifier:

> Sappai mathi ko model haru lai run garrcha, sappile vote garcha spam ho ki hoina vanera ani last ma jati dherai vote aayo tyai dincha 

In [14]:
from sklearn.ensemble import VotingClassifier

nltk_classifier_names = [
    'Multinomial Naive Bayes', 'SVM Linear', 'Decision Trees',
    'K Nearest Neighbours', 'Random Forest', 'AdaBoost', 'Logistic Regressor',
    'SGD Classifier'
]

nltk_classifiers = [
    MultinomialNB(),
    SVC(kernel='linear'),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100)
]

nltk_all_models = list(zip(nltk_classifier_names, nltk_classifiers))

vote_holder = SklearnClassifier(
    VotingClassifier(estimators=nltk_all_models, voting='hard', n_jobs=-1))
vote_holder.train(features_train)

accuracy = nltk.classify.accuracy(nltk_model, features_test) * 100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 97.96650717703349


### Playing Around

In [15]:
test_features, test_labels = zip(*features_test)

prediction = vote_holder.classify_many(test_features)

print(classification_report(test_labels, prediction))

pd.DataFrame(
    confusion_matrix(test_labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])



              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1451
        spam       0.98      0.89      0.93       221

   micro avg       0.98      0.98      0.98      1672
   macro avg       0.98      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1448,3
actual,spam,25,196
