In [1]:
import pandas as pd
import numpy as np

In [20]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
from sklearn.pipeline import Pipeline
import joblib

In [3]:
df = pd.read_csv('spam.csv', encoding="latin-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df['SMS'] = df['v2']
df['label'] = df['v1'].map({'ham': 0, 'spam': 1})
df.drop(['v1', 'v2'], axis=1, inplace=True)
train_data = df[:4400]
test_data = df[4400:]
df.head()


Unnamed: 0,SMS,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
def perform(classifiers, vectorizers, train_data, test_data):
    max_score = 0
    max_name = 0
    for classifier in classifiers:
        for vectorizer in vectorizers:
        
            # train
            vectorize_text = vectorizer.fit_transform(train_data.SMS)
            classifier.fit(vectorize_text, train_data.label)

            # score
            vectorize_text = vectorizer.transform(test_data.SMS)
            score = classifier.score(vectorize_text, test_data.label)
            name = classifier.__class__.__name__ + ' with ' + vectorizer.__class__.__name__ 
            print(name, score)
        if score > max_score:
            max_score = score
            max_name = name
    print ('===========================================')
    print ('===========================================')
    print (max_name, max_score)
    print ('===========================================')
    print ('===========================================')

In [6]:
classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ]

In [7]:
vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ]

In [8]:
perform(
    classifiers,
    vectorizers,
    train_data,
    test_data
)

BernoulliNB with CountVectorizer 0.9778156996587031
BernoulliNB with TfidfVectorizer 0.9778156996587031
BernoulliNB with HashingVectorizer 0.8728668941979523
RandomForestClassifier with CountVectorizer 0.9761092150170648
RandomForestClassifier with TfidfVectorizer 0.9761092150170648
RandomForestClassifier with HashingVectorizer 0.9607508532423208
AdaBoostClassifier with CountVectorizer 0.9718430034129693
AdaBoostClassifier with TfidfVectorizer 0.9692832764505119
AdaBoostClassifier with HashingVectorizer 0.9735494880546075
BaggingClassifier with CountVectorizer 0.9641638225255973
BaggingClassifier with TfidfVectorizer 0.9633105802047781
BaggingClassifier with HashingVectorizer 0.9692832764505119
ExtraTreesClassifier with CountVectorizer 0.9803754266211604
ExtraTreesClassifier with TfidfVectorizer 0.976962457337884
ExtraTreesClassifier with HashingVectorizer 0.9701365187713311
GradientBoostingClassifier with CountVectorizer 0.9709897610921502
GradientBoostingClassifier with TfidfVectoriz

KeyboardInterrupt: 

In [9]:
Classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
Vectorizer = TfidfVectorizer()
vectorize_text = Vectorizer.fit_transform(train_data.SMS)
Classifier.fit(vectorize_text, train_data.label)

OneVsRestClassifier(estimator=SVC(kernel='linear', probability=True))

In [11]:

SMS = ' won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: C'
vectorize_message = Vectorizer.transform([SMS])
predict = Classifier.predict(vectorize_message)[0]
predict

1

In [16]:
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,
                                                      max_features=1000,
                                                      stop_words= ENGLISH_STOP_WORDS)),
                            ('model', OneVsRestClassifier(SVC(kernel='linear', probability=True)))])

# fit the pipeline model with the training data                            
pipeline.fit(train_data.SMS, train_data.label)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           

In [18]:
text = ["won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: C"]

# predict the label using the pipeline
pipeline.predict(text)[0]

1

In [21]:
joblib.dump(pipeline, 'spam_detection_pipeline.pkl')

['spam_detection_pipeline.pkl']

In [25]:
text = ["You have won a 1 month FREE membership for our $80,000 Prize Jackpot! Reply with : Received"]

In [26]:
pipeline1 = joblib.load('spam_detection_pipeline.pkl')
pipeline1.predict(text)[0]

1

In [31]:
from flask import Flask,render_template,url_for,request, jsonify
from sklearn.pipeline import Pipeline
import joblib
from flask import jsonify

app = Flask(__name__)
global pipeline

pipeline = joblib.load('spam_detection_pipeline.pkl')
print('Pipeline loaded')

@app.route('/predict',methods=['POST'])
def predict():
    if request.method == 'POST':
        message = request.form['message']
        try:
            result = None
            if len(message) > 0:
                result = pipelineline.predict([message])[0]
            data = {'spam': result}
        except Exception as e:
            print(str(e))
    return jsonify(data)

if __name__ == '__main__':
    app.run(use_reloader=True)

Pipeline loaded
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Restarting with stat


SystemExit: 1