Sentiment Analysis Model

# Data Fetching

In [None]:
import os
data_path = 'data'
df_yelp = pd.read_table(os.path.join(data_path,'yelp_labelled.txt'))
df_imdb = pd.read_table(os.path.join(data_path,'imdb_labelled.txt'))
df_amz = pd.read_table(os.path.join(data_path,'amazon_cells_labelled.txt'))

In [6]:
import pandas as pd

# Load our dataset
df_yelp = pd.read_table('yelp_labelled.txt')
df_imdb = pd.read_table('imdb_labelled.txt')
df_amz = pd.read_table('amazon_cells_labelled.txt')

In [7]:
# Concatenate our Datasets
frames = [df_yelp,df_imdb,df_amz]
frames

[                              Wow... Loved this place.  1
 0                                   Crust is not good.  0
 1            Not tasty and the texture was just nasty.  0
 2    Stopped by during the late May bank holiday of...  1
 3    The selection on the menu was great and so wer...  1
 4       Now I am getting angry and I want my damn pho.  0
 ..                                                 ... ..
 994  I think food should have flavor and texture an...  0
 995                           Appetite instantly gone.  0
 996  Overall I was not impressed and would not go b...  0
 997  The whole experience was underwhelming, and I ...  0
 998  Then, as if I hadn't wasted enough of my life ...  0
 
 [999 rows x 2 columns],
     A very, very, very slow-moving, aimless movie about a distressed, drifting young man.    \
 0    Not sure who was more lost - the flat characte...                                        
 1    Attempting artiness with black & white and cle...                  

In [None]:
# Renaming Column Headers
for colname in frames:
    colname.columns = ["Message","Target"]
frames

In [9]:
# Assign a Key to Make it Easier
keys = ['Yelp','IMDB','Amazon']

In [10]:
# Merge or Concat our Datasets
df = pd.concat(frames,keys=keys)
df

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0
...,...,...,...
Amazon,994,The screen does get smudged easily because it ...,0
Amazon,995,What a piece of junk.. I lose more calls on th...,0
Amazon,996,Item Does Not Match Picture.,0
Amazon,997,The only thing that disappoint me is the infra...,0


In [11]:
df.to_csv("sentimentdataset.csv")

# Cleaning Dataset

In [12]:
# Checking for Missing Values
df.isnull().sum()

Message    0
Target     0
dtype: int64

In [13]:
# Checking for the balance of our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2745 entries, ('Yelp', 0) to ('Amazon', 998)
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Message  2745 non-null   object
 1   Target   2745 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 91.2+ KB


In [14]:
# Checking for the balance of our dataset
df.Target.value_counts()

1    1385
0    1360
Name: Target, dtype: int64

In [78]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    #mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [73]:
from sklearn.base import TransformerMixin

# This function will clean the text
def clean_text(text):
    return text.strip().lower()

#Custom transformer using Python standard library (you could use spacy as well)
class predictors(TransformerMixin):

    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Feature Extraction

Create the TFIDF vectorizer

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [89]:
# Splitting Data Set
from sklearn.model_selection import train_test_split

# Features and Labels
X = df['Message']
ylabels = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)

In [90]:
# Tf-Idf transformation
xtrain_tfidf = tfvectorizer.fit_transform(X_train)
xtest_tfidf = tfvectorizer.transform(X_test)



# Train Model

In [96]:
perform_list = []

In [97]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score


def run_model(model_name):
    mdl = ""
    if model_name == 'Logistic Regression':
        mdl = LogisticRegression(verbose=True)
    elif model_name == 'Linear SVC':
        mdl = LinearSVC(verbose=True)

    mdl.fit(xtrain_tfidf, y_train)
    y_pred = mdl.predict(xtest_tfidf.toarray())
    # Performance metrics
    accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
    # Get precision, recall, f1 scores
    precision, recall, f1score, support = score(y_test, y_pred, average='micro')
    print(f'Test Accuracy Score of Basic {model_name}: % {accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print(f'F1-score : {f1score}')
    # Add performance parameters to list
    perform_list.append(dict([('Model', model_name),('Test Accuracy', round(accuracy, 2)),('Precision', round(precision, 2)),('Recall', round(recall, 2)),('F1', round(f1score, 2))]))

In [98]:
run_model('Linear SVC')

[LibLinear]Test Accuracy Score of Basic Linear SVC: % 79.85
Precision : 0.7985436893203883
Recall : 0.7985436893203883
F1-score : 0.7985436893203883


In [99]:
run_model('Logistic Regression')

Test Accuracy Score of Basic Logistic Regression: % 81.67
Precision : 0.816747572815534
Recall : 0.816747572815534
F1-score : 0.816747572815534


In [100]:
model_performance = pd.DataFrame(data=perform_list)
model_performance = model_performance[['Model', 'Test Accuracy', 'Precision', 'Recall', 'F1']]
model_performance

Unnamed: 0,Model,Test Accuracy,Precision,Recall,F1
0,Linear SVC,79.85,0.8,0.8,0.8
1,Logistic Regression,81.67,0.82,0.82,0.82


# Pipeline Creation

Create a pipeline that:

*   Clean and process the text using our predictors class
*   Vectorize the words with TF-IDF to create word matrixes.
*   Load the classifier



In [101]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [105]:
# Create the  pipeline to clean, tokenize, vectorize, and classify
text_clf = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', LogisticRegression())], verbose=True)

In [106]:
# Fit our data
text_clf.fit(X_train,y_train)

[Pipeline] ........... (step 1 of 3) Processing cleaner, total=   0.0s
[Pipeline] ........ (step 2 of 3) Processing vectorizer, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.1s


In [107]:
from pickle import dump, load
model_file = "logreg_tfidf.pkl"
dump(text_clf, open(model_file, 'wb'))

# Load and Predict

In [108]:
import joblib

model = joblib.load("logreg_tfidf.pkl")

In [109]:
example = ["I do enjoy my job",
 "What a poor product!,I will have to get a new one",
 "I feel amazing!",
 "This class sucks"]

model.predict(example)

array([1, 0, 1, 0])