In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import bert
from tqdm import tqdm
from collections import namedtuple

In [2]:
# Read the sentiment dataset
df = pd.read_csv("D:\\Study\\DataScience\\Projects\\AI-Advertising\\SentimentAnalysis\\sentiments.csv")
df.isnull().values.any()
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
data = df[:15000]

###  Data Preprocessing

In [13]:
# The first step in any NLP task is to clean or pre-process the text data

# Preprocess the data
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [14]:
X = []
sentences = list(data['review'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [15]:
y = data['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

### Bag of Words method

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer(min_df=3)
X_bow = CV.fit_transform(X).toarray()

In [17]:
from sklearn.model_selection import train_test_split
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.2, random_state=0)

In [18]:
from sklearn.naive_bayes import GaussianNB
model_bow = GaussianNB()
model_bow.fit(X_train_bow, y_train_bow)

GaussianNB()

In [19]:
y_predict_bow = model_bow.predict(X_test_bow)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

print(confusion_matrix(y_test_bow, y_predict_bow))
print(classification_report(y_test_bow, y_predict_bow))
accuracy_bow = accuracy_score(y_test_bow, y_predict_bow)
f1_bow = f1_score(y_test_bow, y_predict_bow)
print('Accuracy of BOW model is: {:.2f}'.format(accuracy_bow))
print('f1 Score of BOW model is: {:.2f}'.format(f1_bow))

[[1154  376]
 [ 642  828]]
              precision    recall  f1-score   support

           0       0.64      0.75      0.69      1530
           1       0.69      0.56      0.62      1470

    accuracy                           0.66      3000
   macro avg       0.67      0.66      0.66      3000
weighted avg       0.66      0.66      0.66      3000

Accuracy of BOW model is: 0.66
f1 Score of BOW model is: 0.62


In [None]:
import joblib
# Save the BOW model to disk
filename = 'nlp_bow.pkl'
joblib.dump(model_bow, filename)

### TF-IDF Method

In [22]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
X_tfidf = vectorizer.fit_transform(X).toarray()

In [24]:
from sklearn.model_selection import train_test_split

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=0)

In [25]:
from sklearn.ensemble import RandomForestClassifier

model_tfidf = RandomForestClassifier(n_estimators=200, random_state=0)
model_tfidf.fit(X_train_tfidf, y_train_tfidf)

RandomForestClassifier(n_estimators=200, random_state=0)

In [26]:
y_predict_tfidf = model_tfidf.predict(X_test_tfidf)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

print(confusion_matrix(y_test_tfidf, y_predict_tfidf))
print(classification_report(y_test_tfidf, y_predict_tfidf))
accuracy_tfidf = accuracy_score(y_test_tfidf, y_predict_tfidf)
f1_tfidf = f1_score(y_test_tfidf, y_predict_tfidf)
print('Accuracy of TF-IDF model is: {:.2f}'.format(accuracy_tfidf))
print('f1 Score of TF-IDF model is: {:.2f}'.format(f1_tfidf))

[[1306  224]
 [ 249 1221]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1530
           1       0.84      0.83      0.84      1470

    accuracy                           0.84      3000
   macro avg       0.84      0.84      0.84      3000
weighted avg       0.84      0.84      0.84      3000

Accuracy of TF-IDF model is: 0.84
f1 Score of TF-IDF model is: 0.84


In [29]:
# Save the TF-IDF model to disk
filename = 'nlp_tfidf.pkl'
joblib.dump(model_tfidf, filename)

['nlp_tfidf.pkl']