In [1]:
## for data
import json
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## for explainer
# from lime import lime_text
## for word embedding
import gensim
import gensim.downloader as gensim_api
## for deep learning
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
## for bert language model
import transformers



In [46]:
df = pd.read_csv('../data/amazon_reviews_small.csv')

In [47]:
df['star_rating'].value_counts()

5    6825
4    2164
3     659
2     231
1     121
Name: star_rating, dtype: int64

In [48]:
df['review'] = df['review_headline'] + '. ' + df['review_body']

In [49]:
df['review_sentiments'] = 0

In [50]:
df['review_sentiments'][df['star_rating'].isin([4, 5])] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [51]:
df

Unnamed: 0.1,Unnamed: 0,review_headline,review_body,star_rating,review,review_sentiments
0,0,Good book,This is a very good book. I recommend it for f...,5,Good book. This is a very good book. I recomme...,1
1,1,the Marenon Chronically Series,Loved all three novels! Morrow is a good stor...,5,the Marenon Chronically Series. Loved all thre...,1
2,2,GOOD READ,Made me think about the fine line between life...,5,GOOD READ. Made me think about the fine line b...,1
3,3,excellant family devotion,I had been looking for a family devotional gea...,5,excellant family devotion. I had been looking ...,1
4,4,Great read,"So entertaining, not a dull moment in the enti...",5,"Great read. So entertaining, not a dull moment...",1
...,...,...,...,...,...,...
9995,9995,Awesome,I haven't read a book like this in a while.lov...,5,Awesome. I haven't read a book like this in a ...,1
9996,9996,Absolutely GREAT,I started reading it because it was on my sons...,5,Absolutely GREAT. I started reading it because...,1
9997,9997,Sicko,These are two writers with some serious macabr...,1,Sicko. These are two writers with some serious...,0
9998,9998,Just too small,It was just too small for the kindle. I could ...,2,Just too small. It was just too small for the ...,0


In [52]:
df = df.drop(columns=['Unnamed: 0', 'review_headline', 'review_body', 'star_rating'])

In [53]:
df

Unnamed: 0,review,review_sentiments
0,Good book. This is a very good book. I recomme...,1
1,the Marenon Chronically Series. Loved all thre...,1
2,GOOD READ. Made me think about the fine line b...,1
3,excellant family devotion. I had been looking ...,1
4,"Great read. So entertaining, not a dull moment...",1
...,...,...
9995,Awesome. I haven't read a book like this in a ...,1
9996,Absolutely GREAT. I started reading it because...,1
9997,Sicko. These are two writers with some serious...,0
9998,Just too small. It was just too small for the ...,0


In [54]:
def stratified_sample_df(df, col, n_samples):
    n = min(n_samples, df[col].value_counts().min())
    df_ = df.groupby(col).apply(lambda x: x.sample(n))
    df_.index = df_.index.droplevel(0)
    return df_

In [59]:
df_small = stratified_sample_df(df, 'review_sentiments', 1000)

In [60]:
df_small['review_sentiments'].value_counts()

0    1000
1    1000
Name: review_sentiments, dtype: int64

In [61]:
from transformers import pipeline
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [62]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [63]:
df_small_lst = df_small['review'].to_list()

In [65]:
tokenized_datasets = tokenizer(df_small_lst, padding="max_length", truncation=True)

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [66]:
from sklearn.model_selection import train_test_split

In [68]:
X_temp, X_test, y_temp, y_test = train_test_split(df_small['review'], 
                                                  df_small['review_sentiments'], 
                                                  stratify=df_small['review_sentiments'], 
                                                  test_size=0.25, random_state=42)

In [69]:
X_train, X_eval, y_train, y_eval = train_test_split(X_temp, y_temp, stratify=y_temp, 
                                                    test_size=500, random_state=42)

In [80]:
tf_train_dataset = tokenizer(X_train.to_list(), padding="max_length", truncation=True)
tf_eval_dataset = tokenizer(X_eval.to_list(), padding="max_length", truncation=True)
tf_test_dataset = tokenizer(X_test.to_list(), padding="max_length", truncation=True)

In [81]:
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased",
                                                             num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:
train_features = {x: tf_train_dataset[x].to_tensor() for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

eval_features = {x: tf_eval_dataset[x].to_tensor() for x in tokenizer.model_input_names}
eval_tf_dataset = tf.data.Dataset.from_tensor_slices((eval_features, y_eval))
eval_tf_dataset = eval_tf_dataset.batch(8)

AttributeError: 'list' object has no attribute 'to_tensor'

In [83]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

In [87]:
model.fit(tf_train_dataset, y_test.with_format("tensorflow"), 
          validation_data=(tf_eval_dataset, y_eval.with_format("tensorflow")),
          batch_size=8, epochs=3)

AttributeError: 'Series' object has no attribute 'with_format'

In [88]:
from datasets import load_metric

metric = load_metric("accuracy")

ModuleNotFoundError: No module named 'datasets'

In [None]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
tf_batch_short = tokenizer(
    df_lst_short, 
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf"
)

In [None]:
tf_outputs_short = tf_model(tf_batch_short)

In [None]:
tf_outputs_short

In [None]:
results = tf.nn.softmax(tf_outputs_short.logits, axis=-1)

In [None]:
df[:50]

In [None]:
results[0][1]

In [None]:
res_rev = []
for i, val in enumerate(results):
    res_rev.append(int(val[1] >= 0.5))

In [None]:
res_rev

In [None]:
(res_rev - df['review_sentiments'][:200])[150:200]