# Imports

In [1]:
import warnings

# NOTE(review): with no category or module given, this silences every warning
# for the rest of the session, including any raised by the libraries imported
# below - consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings(action="ignore")

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import model_selection
from sklearn import linear_model
from sentence_transformers import SentenceTransformer

# Get the Data
For this project we will use the Amazon customer review data.

This data has 20k rows with 76.2% positive and 23.8% negative reviews.

In [3]:
# load the data
DATA_URL = (
    "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv"
)
df = pd.read_csv(DATA_URL)

# preview the first rows
df.head()

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


## Explore data

In [4]:
# summarise the frame: number of rows, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [5]:
# check the count of positive and negative reviews
df["Positive"].value_counts()

Positive
1    15233
0     4767
Name: count, dtype: int64

So, we have 20,000 reviews, of which about 15.2k (76%) are positive and about 4.8k (24%) are negative.

In [6]:
# give the columns clearer names and derive a human-readable sentiment column
# from the binary label (1 -> "positive", anything else -> "negative")
df = df.rename(columns={"reviewText": "reviews", "Positive": "label"}).assign(
    sentiment=lambda frame: np.where(frame["label"] == 1, "positive", "negative")
)

# display five randomly chosen rows
df.sample(5)

Unnamed: 0,reviews,label,sentiment
688,Never got the rules or any idea how to play it...,0,negative
19046,I am not sure why this app does not work on my...,0,negative
5517,I have this game on my phone and my kindle fir...,1,positive
11601,seems to play more youtube videos than silk is...,1,positive
19460,I didn't like how this game was set up and was...,0,negative


# Develop Model

## Create samples

In [7]:
# build the feature (review text) and target (label) arrays for the full data;
# the train/test split happens in the next cell.
# to_numpy() is used instead of .values because it always returns a NumPy
# array, whatever dtype backs the column.
X = df["reviews"].to_numpy()
y = df["label"].to_numpy()

In [8]:
# split the dataset into train and test samples
# 20% of the rows are held out for testing; stratify=y keeps the label
# proportions the same in both splits, and random_state pins the shuffle so
# the split is reproducible
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [9]:
# check the shape of full data (X and y should have the same length)
X.shape, y.shape

((20000,), (20000,))

In [10]:
# check the shape of train set (80% of the rows, given test_size=0.2)
X_train.shape, y_train.shape

((16000,), (16000,))

In [11]:
# check the shape of test set (the held-out 20% of the rows)
X_test.shape, y_test.shape

((4000,), (4000,))

## Extract features

In [12]:
# load sentence transformer model
# the model is looked up by name; the repr displayed under this cell lists the
# pipeline stages and their settings (e.g. max_seq_length, embedding dimension)
embedding_model = "all-MiniLM-L6-v2"
text_model = SentenceTransformer(embedding_model)
text_model  # last expression, so the notebook displays the model's repr

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [13]:
# define a function to get document vectors
def get_document_vector(
    model: SentenceTransformer, data: list, show_progress_bar: bool = True
) -> np.ndarray:
    """Encode a collection of texts into sentence embeddings.

    Parameters
    ----------
    model : SentenceTransformer
        Loaded sentence-transformers model used to encode the texts.
    data : list
        Texts to embed, one string per document. The notebook also passes
        1-D NumPy arrays of strings here.
    show_progress_bar : bool, default True
        Forwarded to ``model.encode``; pass False to silence the per-batch
        progress bar.

    Returns
    -------
    np.ndarray
        The result of ``model.encode`` with ``convert_to_numpy=True``. For the
        model loaded in this notebook that is a 2-D array with one row per
        input text.
    """
    return model.encode(
        sentences=data,
        convert_to_numpy=True,
        show_progress_bar=show_progress_bar,
    )

In [14]:
# get the embeddings for the train data
# X_train is reused for the embeddings, so stash the raw text while X_train is
# still the 1-D text array produced by the split. Re-running this cell then
# re-encodes the stashed text instead of feeding embeddings back to the encoder.
if X_train.ndim == 1:
    X_train_text = X_train
X_train = get_document_vector(text_model, X_train_text)
X_train.shape

Batches:   0%|          | 0/500 [00:00<?, ?it/s]

(16000, 384)

In [15]:
# get the embeddings for the test data
# X_test is reused for the embeddings, so stash the raw text while X_test is
# still the 1-D text array produced by the split. Re-running this cell then
# re-encodes the stashed text instead of feeding embeddings back to the encoder.
if X_test.ndim == 1:
    X_test_text = X_test
X_test = get_document_vector(text_model, X_test_text)
X_test.shape

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

(4000, 384)

## Train model

In [16]:
# train model
# LogisticRegression is created with all of its default hyperparameters and
# fitted on the training embeddings.
# NOTE(review): warnings are globally ignored at the top of this notebook, so a
# solver convergence warning (if one were raised) would not be shown here -
# TODO confirm the fit converges with the default settings.
clf_model = linear_model.LogisticRegression()
clf_model.fit(X_train, y_train)

In [17]:
# accuracy of the fitted classifier on the training split
clf_model.score(X=X_train, y=y_train)

0.9008125

In [18]:
# accuracy of the fitted classifier on the held-out test split
clf_model.score(X=X_test, y=y_test)

0.89575

## Evaluate model performance

In [19]:
# model inference: predicted labels for both splits (train first, then test)
y_train_pred, y_test_pred = (
    clf_model.predict(features) for features in (X_train, X_test)
)

In [20]:
# per-class precision / recall / F1 on the training split
print(metrics.classification_report(y_true=y_train, y_pred=y_train_pred))

              precision    recall  f1-score   support

           0       0.83      0.74      0.78      3814
           1       0.92      0.95      0.94     12186

    accuracy                           0.90     16000
   macro avg       0.87      0.84      0.86     16000
weighted avg       0.90      0.90      0.90     16000



In [21]:
# per-class precision / recall / F1 on the held-out test split
print(metrics.classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77       953
           1       0.92      0.95      0.93      3047

    accuracy                           0.90      4000
   macro avg       0.87      0.84      0.85      4000
weighted avg       0.89      0.90      0.89      4000

