In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../data/raw/iphonese_reviews.csv')
data = data.rename(columns=str.lower)
data.head()

Unnamed: 0,ratings,comment,reviews
0,5,Super!,Great camera for pics and videos Battery life ...
1,5,Must buy!,Great device. Let me tell the Pros..1. Superb ...
2,5,Great product,"Who all loves older size i.e., 4.7 inch type s..."
3,5,Simply awesome,This iPhone SE is the best phone ever you get....
4,5,Classy product,This is my second iphone after iphone 4s. I’ve...


In [3]:
data.isnull().sum()

ratings    0
comment    0
reviews    0
dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9713 entries, 0 to 9712
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ratings  9713 non-null   int64 
 1   comment  9713 non-null   object
 2   reviews  9713 non-null   object
dtypes: int64(1), object(2)
memory usage: 227.8+ KB


In [5]:
display(data['ratings'].unique())

array([5, 4, 3, 1, 2])

## Using a pre-trained NLP Model
In this case I am going to use the cardiffnlp/twitter-roberta-base-sentiment-latest NLP Model from Hugging Face.
More info about this model can be found here: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

This is a text classification model that it's going to help to determine the sentiment of the reviews we have.

In [6]:
from transformers import AutoTokenizer ## tokenizer class 
from transformers import AutoModelForSequenceClassification ## generic model class
## from_pretrained()
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest" # model selected from huggingface. website for more info was provided before
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
data['id'] = data.index
data

Unnamed: 0,ratings,comment,reviews,id
0,5,Super!,Great camera for pics and videos Battery life ...,0
1,5,Must buy!,Great device. Let me tell the Pros..1. Superb ...,1
2,5,Great product,"Who all loves older size i.e., 4.7 inch type s...",2
3,5,Simply awesome,This iPhone SE is the best phone ever you get....,3
4,5,Classy product,This is my second iphone after iphone 4s. I’ve...,4
...,...,...,...,...
9708,5,Terrific purchase,Absolutely brilliantREAD MORE,9708
9709,5,Classy product,"Superb phone. This is my 4th iPhone, I feel SE...",9709
9710,5,Awesome,very niceREAD MORE,9710
9711,5,Super!,Loving it as of now. Good Product .READ MORE,9711


In [9]:
## need to get rid off READ MORE!!!
data['reviews'] = data['reviews'].str.replace('READ MORE', '') 

### Example test
I am going to take the first review in the data, tokenize it and check what the model returns as a sentiment

In [10]:
## encoding text
example = data['reviews'][0]
print(example)
encoded_example = tokenizer(example, return_tensors='pt') # it indicates that the output should be in PyTorch format
print(encoded_example)

Great camera for pics and videos Battery life is good so far with some setting turn of which i never use and when i use i turn those on and i use it in power saving mode all the time so a full day with light gaming of 1hr or more using camera for 1hr or more listening music in my car on youtube and Bluetooth on for couple of hours it gives me full day of battery varing from 4/5 to 6/7 hours sot per dayPerformance is top notch plays every game and every task with easeVery premium phone look...
{'input_ids': tensor([[    0, 19065,  2280,    13, 18803,     8,  3424, 21924,   301,    16,
           205,    98,   444,    19,   103,  2749,  1004,     9,    61,   939,
           393,   304,     8,    77,   939,   304,   939,  1004,   167,    15,
             8,   939,   304,    24,    11,   476,  6549,  5745,    70,     5,
            86,    98,    10,   455,   183,    19,  1109,  6548,     9,   112,
          5039,    50,    55,   634,  2280,    13,   112,  5039,    50,    55,
          6288

In [11]:
output = model(**encoded_example)
print(output) # raw, unnormalized predictions produced by the sequence classifier model
scores = output[0][0].detach().numpy()
## detach() creates a new tensor that shares the same data as the output.logits tensor but is not part of the computation graph
## need to check further on this :D
## numpy() converts the detached tensor into a NumPy array
print(scores)
scores = softmax(scores)
## using softmax to convert a vector of real numbers into a probability distribution
print('____________')
print(example)
print(scores)
## order for scores: negative, neutral, positive
print('____________')
## storing the results in a dictionary to make it easier to visualize
scores_dict = {
    'negative' : scores[0],
    'neutral' : scores[1],
    'positive' : scores[2]
}
print(scores_dict)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4763, -0.1896,  2.7227]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
[-2.476266   -0.18955944  2.7227342 ]
____________
Great camera for pics and videos Battery life is good so far with some setting turn of which i never use and when i use i turn those on and i use it in power saving mode all the time so a full day with light gaming of 1hr or more using camera for 1hr or more listening music in my car on youtube and Bluetooth on for couple of hours it gives me full day of battery varing from 4/5 to 6/7 hours sot per dayPerformance is top notch plays every game and every task with easeVery premium phone look...
[0.00521013 0.0512806  0.9435093 ]
____________
{'negative': 0.0052101347, 'neutral': 0.051280603, 'positive': 0.9435093}


### Running the Model in all the data
In order to run the model for all the reviews I am going to create a function out of the previus example code. This function takes a review text, tokenizes it and uses the model to return sentiment scores.

After it, I am going to create a for loop that will loop through the entired iPhone SE reviews data set and stores the sentiment analysis for each review in a dictionary that later we will convert into a pandas dataframe.

In [12]:
def polarity_scores_roberta(text):
    '''
    Calculate polarity scores for sentiment analysis using a pre-trained RoBERTa model.
    
    This function takes in a piece of text (review) and returns a dictionary of polarity scores for three sentiment categories: negative, neutral, and positive. It utilizes a pre-trained RoBERTa model and a tokenizer for text encoding.

    Parameters:
        text (str): The input text for sentiment analysis. (Review)
    
    Returns:
        dict: A dictionary containing the polarity scores for negative, neutral, and positive sentiments.
    
    Example:
        text = "The phone camera is amazing!"
        polarity_scores = polarity_scores_roberta(text)
        print(polarity_scores)
        # Output: {'negative': 0.05, 'neutral': 0.1, 'positive': 0.85}
    
    '''
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
    'negative' : scores[0],
    'neutral' : scores[1],
    'positive' : scores[2]
    }
    return scores_dict

In [13]:
from tqdm import tqdm

results = {}
for i, row in tqdm(data.iterrows(), total=len(data)):
    try:
        text = row['reviews']
        myid = row['id']
        model_result = polarity_scores_roberta(text)
        model_result_rename = {}
        for key, value in model_result.items():
            model_result_rename[f'roberta_{key}'] = value
            results[myid] = model_result
    except RuntimeError:
        print(f'Broke for id {myid}')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 9713/9713 [10:23<00:00, 15.57it/s]


Once the Model has utilized all the data to create a sentiment analysis score for each review I am going to store the results into a pandas dataframe. 

In [17]:
## storing the results dict into a dataframe
results_df = pd.DataFrame(results).transpose()
## concatenating the original data frame with the new one
concat_df = pd.concat([results_df, data], axis=1)
## changing the order of the cols
cols_order = ['id', 'comment', 'reviews', 'ratings', 'negative', 'neutral', 'positive']
results_df = concat_df[cols_order]
## final df
results_df.head()

Unnamed: 0,id,comment,reviews,ratings,negative,neutral,positive
0,0,Super!,Great camera for pics and videos Battery life ...,5,0.00521,0.051281,0.943509
1,1,Must buy!,Great device. Let me tell the Pros..1. Superb ...,5,0.004157,0.017652,0.97819
2,2,Great product,"Who all loves older size i.e., 4.7 inch type s...",5,0.006962,0.043591,0.949447
3,3,Simply awesome,This iPhone SE is the best phone ever you get....,5,0.005833,0.020936,0.973231
4,4,Classy product,This is my second iphone after iphone 4s. I’ve...,5,0.013027,0.303008,0.683965


In [18]:
results_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,9713.0,4856.0,2804.045916,0.0,2428.0,4856.0,7284.0,9712.0
ratings,9713.0,4.456399,1.032911,1.0,4.0,5.0,5.0,5.0
negative,9713.0,0.124351,0.255358,0.00134,0.005352,0.011942,0.052735,0.961831
neutral,9713.0,0.139024,0.172841,0.005174,0.023033,0.06436,0.189758,0.931637
positive,9713.0,0.736625,0.336205,0.005376,0.651218,0.919075,0.971244,0.992542


In [19]:
## Creating a new column that specifies if the review is positive or negative
results_df['review_type'] = 0

results_df.loc[results_df['positive'] > 0.6, 'review_type'] = 1
results_df.loc[results_df['negative'] >= 0.5, 'review_type'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['review_type'] = 0


In [20]:
results_df['review_type'].value_counts()

review_type
1    7408
0    2305
Name: count, dtype: int64

In [21]:
results_df.to_csv("results_df.csv")