In [1]:
# Imports
import pandas as pd
import numpy as np
import nltk
import string
import re
from pickle import dump, load

## Preprocessing

In [None]:
url = "https://uom-twitter-sentiment-analysis.s3.us-east-2.amazonaws.com/Twitter_sentiment_clean.csv"
data = pd.read_csv(url)

In [None]:
data.head(50)

In [28]:
# Remove punctuation
def remove_punct(text):
    new_text = []
    for t in text:
        if t not in string.punctuation:
            new_text.append(t)
    return ''.join(new_text)

In [29]:
# Tokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [30]:
# Removing stop words
from nltk.corpus import stopwords

def remove_sw(text):
    new_text = []
    for t in text:
        if t not in stopwords.words('english'):
            new_text.append(t)
    return new_text

In [31]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    new_text = []
    for t in text:
        lem_text = lemmatizer.lemmatize(t)
        new_text.append(lem_text)
    return new_text

In [None]:
df_1 = data.iloc[:10000,:]
df_1.head(50)

In [None]:
df_1['text'] = df_1['text'].apply(lambda t: remove_punct(t))
df_1['text'] = df_1['text'].apply(lambda t: tokenizer.tokenize(t.lower()))
df_1['text'] = df_1['text'].apply(lambda t: remove_sw(t))

In [None]:
data['text'] = data['text'].apply(lambda t: remove_punct(t))
data['text'] = data['text'].apply(lambda t: tokenizer.tokenize(t.lower()))
data['text'] = data['text'].apply(lambda t: remove_sw(t))

In [None]:
data['text'] = data['text'].apply(lambda t: word_lemmatizer(t))

In [None]:
df_1['text']

## Split Cleaned Data into Training Set and Testing Set

In [2]:
# Read preprocessed csv
url = "https://uom-twitter-sentiment-analysis.s3.us-east-2.amazonaws.com/Lemmatize.csv"
lemmatized_df = pd.read_csv(url)
lemmatized_df.head()

Unnamed: 0,target,text
0,0,"['ahhh', 'hope', 'ok']"
1,0,"['cool', 'tweet', 'apps', 'razr', '2']"
2,0,"['know', 'family', 'drama', 'lamehey', 'next',..."
3,0,"['school', 'email', 'wont', 'open', 'geography..."
4,0,"['upper', 'airway', 'problem']"


In [3]:
X = lemmatized_df['text']
y = lemmatized_df['target']

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

TfidV = TfidfVectorizer()
X = TfidV.fit_transform(X)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

## Multinomial Naive Bayes Model

In [6]:
# Multinomial Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB

nbc = MultinomialNB()
nbc.fit(X_train, y_train)

MultinomialNB()

In [7]:
y_predict_nbc = nbc.predict(X_test)

In [8]:
from sklearn.metrics import confusion_matrix, f1_score

cm_nbc = confusion_matrix(y_test, y_predict_nbc)

f1_nbc = f1_score(y_test, y_predict_nbc)
print(f'F1 Score: {f1_nbc}')

training_score = nbc.score(X_train, y_train)
print(f'Training Score: {training_score}')

testing_score = nbc.score(X_test, y_test)
print(f'Testing Score: {testing_score}')

F1 Score: 0.760176194374941
Training Score: 0.8103716666666667
Testing Score: 0.7649325


In [9]:
# Print confusion matrix
cm_nbc

array([[156953,  43020],
       [ 51007, 149020]])

In [None]:
# Testing nbc model for best parameters
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': [1, 0.1, 0.001, 0.0001, 0.00001, 0.000001], 'fit_prior' :[True, False]}
gs_nbc = GridSearchCV(nbc, parameters, n_jobs = -1)
gs_nbc = gs_nbc.fit(X_train, y_train)

In [None]:
gs_nbc.best_score_

In [None]:
gs_nbc.best_params_

## SGD Model

In [10]:
# SGD Model
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(alpha = 1e-06, penalty = 'l2', loss = 'log')
clf.fit(X_train, y_train)

SGDClassifier(alpha=1e-06, loss='log')

In [11]:
y_predict_clf = clf.predict(X_test)

In [12]:
from sklearn.metrics import confusion_matrix, f1_score

cm_clf = confusion_matrix(y_test, y_predict_clf)

f1_clf = f1_score(y_test, y_predict_clf)
print(f'F1 Score: {f1_clf}')

training_score = clf.score(X_train, y_train)
print(f'Training Score: {training_score}')

testing_score = clf.score(X_test, y_test)
print(f'Testing Score: {testing_score}')

F1 Score: 0.7878897885397937
Training Score: 0.80639
Testing Score: 0.7838125


In [None]:
# Testing clf model for best parameters
from sklearn.model_selection import GridSearchCV
parameters = {'alpha': [1, 0.1, 0.001, 0.0001, 0.00001, 0.000001], 'fit_prior' :[True, False], 'class_prior': ['optimal'], 'loss': ['log_loss', 'hinge', 'perceptron', 'modified_huber']}
gs_clf = GridSearchCV(nbc, parameters, n_jobs = -1)
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
gs_clf.best_score_

In [None]:
gs_clf.best_params_

## Logistic Model

In [13]:
# Logistic Model
from sklearn.linear_model import LogisticRegression

lrc = LogisticRegression()
lrc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [14]:
y_predict_lrc = lrc.predict(X_test)

In [15]:
from sklearn.metrics import confusion_matrix, f1_score

cm_lrc = confusion_matrix(y_test, y_predict_lrc)

f1_lrc = f1_score(y_test, y_predict_lrc)
print(f'F1 Score: {f1_lrc}')

training_score = lrc.score(X_train, y_train)
print(f'Training Score: {training_score}')

testing_score = lrc.score(X_test, y_test)
print(f'Testing Score: {testing_score}')

F1 Score: 0.7883377878540611
Training Score: 0.7958616666666667
Testing Score: 0.783405


In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {"penalty":["l1","l2"], 'C': np.logspace(-3,3,7), 'solver'  : ['newton-cg', 'lbfgs', 'liblinear']}
gs_lrc = GridSearchCV(lrc, parameters)
gs_lrc.fit(X_train, y_train)

In [None]:
gs_lrc.best_score_

In [None]:
gs_lrc.best_params_

## LGB Model

In [16]:
# LGB Model
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(scale_pos_weight = 3)
lgb.fit(X_train, y_train)

LGBMClassifier(scale_pos_weight=3)

In [17]:
y_predict_lgb = lgb.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix, f1_score

cm_lgb = confusion_matrix(y_test, y_predict_lgb)

f1_lgb = f1_score(y_test, y_predict_lgb)
print(f'F1 Score: {f1_lgb}')

training_score = lgb.score(X_train, y_train)
print(f'Training Score: {training_score}')

testing_score = lgb.score(X_test, y_test)
print(f'Testing Score: {testing_score}')

F1 Score: 0.7380586789454163
Training Score: 0.6591325
Testing Score: 0.6592425


In [None]:
# Testing lgb model for best parameters
from sklearn.model_selection import GridSearchCV
parameters = {'num_leaves': [32, 31, 30, 29, 28], 'max_depth' :[-1, -2, -3, -4, -5, -6], 'n_estimators': [100, 200, 500, 1000], 'learning_rate': [.1, .25, .5, .01, .001]}
gs_lgb = GridSearchCV(lgb, parameters, n_jobs=-1, verbose=2)
gs_lgb = gs_lgb.fit(X_train, y_train)

## Saving the Vectorizer and Models

In [19]:
# Save the scaling function to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
print('Saving the vectorizer')
dump(TfidV, open('TfidV.pkl', 'wb'))

Saving the vectorizer


In [20]:
# Save the model to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
print('Saving the model')
dump(clf, open('clf.pkl', 'wb'))

Saving the model


In [21]:
# Save the model to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
print('Saving the model')
dump(lrc, open('lrc.pkl', 'wb'))

Saving the model


In [39]:
# Save the model to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
print('Saving the model')
dump(nbc, open('nbc.pkl', 'wb'))

Saving the model


In [40]:
# Save the model to a pickle file (i.e., "pickle it")
# so we can use it from the Flask server. 
print('Saving the model')
dump(lgb, open('lgb.pkl', 'wb'))

Saving the model


## Predictions

In [22]:
# Define prediction labels.
predict_labels = ['Negative', 'Positive']

In [23]:
# Load the SGD model.
clf1 = load(open('clf.pkl', 'rb'))

In [24]:
# Load the logistic model.
lrc1 = load(open('lrc.pkl', 'rb'))

In [25]:
# Load the vectorizer.
TfidV1 = load(open('TfidV.pkl', 'rb'))

In [26]:
# 1. Create three diffferent sets of inputs (i.e., three
# different irises). Note that each set is constructed 
# as a list inside of another list (or an array inside of
# another array). This is how scikit-learn needs it. 
input_row = ["We have two working models. YAY!!"]

In [32]:
#2. Tokenizing etc
input_row = remove_punct(input_row)
input_row = tokenizer.tokenize(input_row.lower())
input_row = remove_sw(input_row)
input_row = word_lemmatizer(input_row)

In [33]:
# 3. Transform each input using the scaler function.
try:
    input_row_vectorized = TfidV1.transform(input_row)
except:
    print("An error has occurred. Please input another tweet.")

In [34]:
# 4a. Make a prediction with SGD Model
print('Making predictions: ')
predict = clf1.predict(input_row_vectorized)
print(f'> Prediction: {predict_labels[predict[0]]}')

Making predictions: 
> Prediction: Positive


In [35]:
# 4a. Make a prediction with Logistic Model
print('Making predictions: ')
predict = lrc1.predict(input_row_vectorized)
print(f'> Prediction: {predict_labels[predict[0]]}')

Making predictions: 
> Prediction: Positive


In [36]:
#5a. Calculate individual probabilities for the SGD Model prediction
prob = (clf1.predict_proba(input_row_vectorized)[0] *100)

probability_clf = []

for element in prob:
    prob_0 = str(element)
    prob_0 = prob_0[0:5]
    probability_clf.append(prob_0)

In [37]:
#5b. Calculate individual probabilities for the Logistic Model prediction
prob = (lrc1.predict_proba(input_row_vectorized)[0] *100)

probability_lrc = []

for element in prob:
    prob_0 = str(element)
    prob_0 = prob_0[0:5]
    probability_lrc.append(prob_0)

In [38]:
print(f'SGD Model Prediction      : {probability_clf[0]}% negative / {probability_clf[1]}% positive')
print(f'Logistic Model Prediction : {probability_lrc[0]}% negative / {probability_lrc[1]}% positive')

SGD Model Prediction      : 35.19% negative / 64.80% positive
Logistic Model Prediction : 35.26% negative / 64.73% positive
