In [1]:
import pandas as pd
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.pipeline import Pipeline, FeatureUnion, make_union

In [2]:
import os

# Raw string literal: in a normal literal "\a" (as in "\app") is the BEL escape
# and "\U" starts a unicode escape (a SyntaxError on Python 3), so the
# backslashes of this Windows path must not be interpreted.
os.chdir(r'C:\Users\Sarick\Documents\Python Scripts\SpamHam\app')

# Tab-separated file without a header row; the two columns are named here.
df = pd.read_csv("SMSSpamCollection", sep="\t",
                 header=None,
                 names=["target", "text"])

In [3]:
# Message text is the model input, the "target" column is the label.
X, y = df["text"], df["target"]
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: target, dtype: object

In [4]:
# Hold out 30% of the messages for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class CapitalDocTransformer(BaseEstimator, TransformerMixin):
    """Flag documents that are written entirely in capital letters.

    Every input document is turned into one binary feature: 1 when the
    document contains at least one cased character and all of its cased
    characters are upper case, otherwise 0.

    Parameters
    ----------
    columns : object, optional
        Stored on the instance; not used by ``fit`` or ``transform``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Map each document in ``X`` to 1 (all capitals) or 0.

        Parameters
        ----------
        X : iterable of str
            Documents to inspect.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(n_documents, 1)``.
        """
        # str.isupper() is False for text without any cased character
        # (digits, punctuation, empty string), whereas comparing against
        # line.upper() would report such documents as "all capital".
        flags = np.array([line.isupper() for line in X], dtype=bool).astype(int)
        return flags.reshape(-1, 1)

In [6]:
# Sanity check: run the custom transformer on the whole corpus.
cp_transformer = CapitalDocTransformer()
cp_transformer.fit(X).transform(X)

array([[0],
       [0],
       [0],
       ..., 
       [0],
       [0],
       [0]])

In [7]:
# Build the model: the all-capitals flag and the bag-of-words counts are
# concatenated into one feature matrix, which feeds a logistic regression.
log_reg_model = Pipeline(
    steps=[
        ("features", make_union(CapitalDocTransformer(), CountVectorizer())),
        ("model", LogisticRegression()),
    ]
)

In [8]:
# Fit the feature union and the classifier on the training split.
log_reg_model.fit(X_train, y=y_train)


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('capitaldoctransformer', CapitalDocTransformer(columns=None)), ('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
   ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
import pickle

# Persist the fitted pipeline. The context manager closes the file handle
# even if pickling raises, instead of leaving it open.
with open("spamham_model.pkl", "wb") as model_file:
    pickle.dump(log_reg_model, model_file)

# To restore it later (only unpickle files you trust: loading can execute code):
# with open("spamham_model.pkl", "rb") as model_file:
#     log_reg_model = pickle.load(model_file)

In [10]:
# sklearn.externals.joblib is deprecated and absent from newer scikit-learn
# releases; prefer the standalone package and fall back to the vendored copy
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(log_reg_model, 'spam_ham.pkl')

['spam_ham.pkl']

In [11]:
# Reload the pipeline from disk to check that the saved file is usable.
new_model = joblib.load(filename='spam_ham.pkl')

In [12]:
# Smoke test: classify one message with the reloaded pipeline.
sample_messages = ["can't wait for this electrion to be over"]
new_model.predict(sample_messages)

array(['ham'], dtype=object)

In [None]:
# TODO: move this process into a Python script so the model can be built by running `python build_model.py` from the app folder.