In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import string
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Data Loading

In [2]:
df_yelp = pd.read_table('Data/yelp_labelled.txt')
df_imdb = pd.read_table('Data/imdb_labelled.txt')
df_amz = pd.read_table('Data/amazon_cells_labelled.txt')
# Concatenate our Datasets
frames = [df_yelp,df_imdb,df_amz]

In [3]:
for column in frames: 
    column.columns = ["Message","Target"]

In [4]:
# Assign a Key to Make it Easier
keys = ['Yelp','IMDB','Amazon']
# Merge or Concat our Datasets
df = pd.concat(frames,keys=keys)
df.head()

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0


# Data Cleaning

In [5]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
#parser = English()

In [6]:
def lemmatize(doc):
    return [token.lemma_ for token in doc if not
           (token.is_punct or token.is_space or token.lower_ in STOP_WORDS)]

In [7]:
# Creating our tokenizer function. This function will lemmatize, remove stop words, remove punctuations and remove noun? proper noun?
def spacy_tokenizer(doc):
    # Creating our token object, which is used to create documents with linguistic annotations.
    #mytokens = parser(sentence)
    tokens = nlp(doc)
    #print(tokens)

    # Lemmatizing each token and converting each token into lowercase
    #tokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    tokens = [word.lemma_.lower() for word in tokens]
    #print(tokens)
    
    # Removing stop words and punctuations
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return tokens

In [None]:
sent = 'Hey my name is Roberto and I am the best!'
doc = nlp(sent)
lemmatize(doc)

In [None]:
spacy_tokenizer(sent)

# Modeling

In [8]:
from sklearn.base import TransformerMixin 

# This function will clean the text
def clean_text(text):     
    return text.strip().lower()
    
#Custom transformer using Python standard library (you could use spacy as well)
class predictors(TransformerMixin):

    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# We create our bag of words (bow) using our tokenizer and defining an ngram range
bow = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [10]:
# Features and Labels
X = df['Message']
ylabels = df['Target']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)

In [None]:
tf = tfvectorizer.fit_transform(X_train)

In [None]:
tf.toarray()

In [None]:
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer)], verbose=True)

In [None]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!",
 "This class sucks"]

In [None]:
res = pipe.fit_transform(example)

In [None]:
res.toarray()

In [12]:
#Function to plot confusion matrix.
def cmplot(cm):
    ax= plt.subplot()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax);  #annot=True to annotate cells, ftm='g' to disable scientific notation

    # labels, title and ticks
    ax.set_xlabel('Predicted labels');
    ax.set_ylabel('True labels'); 
    ax.set_title('Confusion Matrix'); 
    ax.xaxis.set_ticklabels(['0', '1']); 
    ax.yaxis.set_ticklabels(['0', '1']);

## Suporting Vector Machines

In [None]:
# SVC classifier
from sklearn.svm import LinearSVC

classifier_SVC = LinearSVC(verbose=True)

In [None]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe1_svc = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow),
                 ('classifier', classifier_SVC)], verbose=True)

pipe2_svc = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier_SVC)], verbose=True)

In [None]:
pipe1_svc.fit(X_train,y_train)
pipe2_svc.fit(X_train,y_train)

In [None]:
svc1_prediction = pipe1_svc.predict(X_test)
svc2_prediction = pipe2_svc.predict(X_test)

In [None]:
metrics.confusion_matrix(y_test, svc1_prediction)
#TN FP
#FN TP

In [None]:
cm=metrics.confusion_matrix(y_test, svc1_prediction)
cmplot(cm)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, svc1_prediction))
print("Precision:",metrics.precision_score(y_test, svc1_prediction))#Positive predictive value
print("Recall:",metrics.recall_score(y_test, svc1_prediction))#Sensitivity, hit rate, true positive rate
print("Specificity:", metrics.recall_score(y_test, svc1_prediction,pos_label=0))#Specificity, true negative rate
print("F1 Score: ",metrics.f1_score(y_test,svc1_prediction))#measure of preciseness and robustness of model. 2TP/(2TP+FP+FN)

In [None]:
cm=metrics.confusion_matrix(y_test, svc2_prediction)
cmplot(cm)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, svc2_prediction))
print("Precision:",metrics.precision_score(y_test, svc2_prediction))
print("Recall:",metrics.recall_score(y_test, svc2_prediction))
print("Specificity:", metrics.recall_score(y_test, svc2_prediction,pos_label=0))
print("F1 Score: ",metrics.f1_score(y_test,svc2_prediction))

## Logistic Regression

In [13]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier_LG = LogisticRegression(verbose=True)

In [14]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe1_LG = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow),
                 ('classifier', classifier_LG)], verbose=True)

pipe2_LG = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier_LG)], verbose=True)

In [15]:
pipe1_LG.fit(X_train,y_train)
pipe2_LG.fit(X_train,y_train)

[Pipeline] ........... (step 1 of 3) Processing cleaner, total=   0.0s
[Pipeline] ........ (step 2 of 3) Processing vectorizer, total=   8.7s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3271     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.33154D+03    |proj g|=  6.15000D+01

At iterate   50    f=  6.80088D+02    |proj g|=  9.01651D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3271     58     61      1     0     0   6.101D-03   6.801D+02
  F =   680.08805755596006     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


[Pipeline] ........ (step 2 of 3) Processing vectorizer, total=  10.7s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3271     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.33154D+03    |proj g|=  2.13949D+01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3271     20     24      1     0     0   1.211D-03   9.672D+02
  F =   967.22004087261587     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f202f4da1d0>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f2030c72a70>)),
                ('classifier', LogisticRegression(verbose=True))],
         verbose=True)

In [16]:
plg1_prediction = pipe1_LG.predict(X_test)
plg2_prediction = pipe2_LG.predict(X_test)

In [None]:
metrics.confusion_matrix(y_test, plg1_prediction)

In [None]:
cm=metrics.confusion_matrix(y_test, plg1_prediction)
cmplot(cm)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, plg1_prediction))
print("Precision:",metrics.precision_score(y_test, plg1_prediction))
print("Recall:",metrics.recall_score(y_test, plg1_prediction))
print("Specificity:", metrics.recall_score(y_test, plg1_prediction,pos_label=0))
print("F1 Score: ",metrics.f1_score(y_test,plg1_prediction))

In [None]:
metrics.roc_auc_score(y_test, plg1_prediction)

In [None]:
metrics.confusion_matrix(y_test, plg2_prediction)

In [None]:
cm=metrics.confusion_matrix(y_test, plg2_prediction)
cmplot(cm)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, plg2_prediction))
print("Precision:",metrics.precision_score(y_test, plg2_prediction))
print("Recall:",metrics.recall_score(y_test, plg2_prediction))
print("Specificity:", metrics.recall_score(y_test, plg2_prediction,pos_label=0))
print("F1 Score: ",metrics.f1_score(y_test,plg2_prediction))

In [None]:
metrics.roc_curve(y_test, plg2_prediction)

In [None]:
metrics.roc_auc_score(y_test, plg2_prediction)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
scores = cross_val_score(pipe2_LG, X, ylabels,cv=2)

In [None]:
scores

## MultiLayer Perceptron 

In [None]:
# Multi layer perceptron
from sklearn.neural_network import MLPClassifier

classifier_MLP =  MLPClassifier(max_iter=400, hidden_layer_sizes=(100,2), verbose=True)

In [None]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe1_mlp = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow),
                 ('classifier', classifier_MLP)], verbose=True)

pipe2_mlp = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier_MLP)], verbose=True)

In [None]:
pipe1_mlp.fit(X_train,y_train)
pipe2_mlp.fit(X_train,y_train)

In [None]:
mlp1_prediction = pipe1_mlp.predict(X_test)
mlp2_prediction = pipe2_mlp.predict(X_test)

In [None]:
metrics.confusion_matrix(y_test, mlp1_prediction)

In [None]:
cm=metrics.confusion_matrix(y_test, mlp1_prediction)
cmplot(cm)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, mlp1_prediction))
print("Precision:",metrics.precision_score(y_test, mlp1_prediction))
print("Recall:",metrics.recall_score(y_test, mlp1_prediction))
print("Specificity:", metrics.recall_score(y_test, mlp1_prediction,pos_label=0))
print("F1 Score: ",metrics.f1_score(y_test,mlp1_prediction))

In [None]:
metrics.confusion_matrix(y_test, mlp2_prediction)

In [None]:
cm=metrics.confusion_matrix(y_test, mlp2_prediction)
cmplot(cm)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, mlp2_prediction))
print("Precision:",metrics.precision_score(y_test, mlp2_prediction))
print("Recall:",metrics.recall_score(y_test, mlp2_prediction))
print("Specificity:", metrics.recall_score(y_test, mlp2_prediction,pos_label=0))
print("F1 Score: ",metrics.f1_score(y_test,mlp2_prediction))

In [None]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!",
 "This class sucks"]

#pipe2_mlp.predict(example)

# Save Model

In [18]:
import joblib

In [19]:
joblib.dump(pipe2_LG, "model/logreg_tfidf2.pkl")

['model/logreg_tfidf2.pkl']

In [None]:
logreg_loaded = joblib.load("model/logreg_tfidf.pkl")

In [None]:
logreg_loaded.predict(example)

In [None]:
joblib.dump(pipe2, "mlp_tfidf.pkl")

In [None]:
mlp_loaded = joblib.load("mlp_tfidf.pkl")

In [None]:
mlp_loaded.predict(example)