In [1]:
import re
import string
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet



from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report





  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_csv('app_reviews.csv',delimiter=';')
print("shape = ", df.shape)
print(df.columns)
df = df.dropna(axis=0)
print("shape = ", df.shape)

shape =  (132, 5)
Index(['title', 'rating', 'snippet', 'likes', 'date'], dtype='object')
shape =  (130, 5)


In [3]:
list_to_drop = ['title', 'likes', 'date']
df.drop(list_to_drop,axis=1,inplace=True)

In [4]:
print(df.head)

<bound method NDFrame.head of      rating                                            snippet
0       3.0  Very slow, it will take weeks sometimes to get...
1       1.0  I have never been able to even reach the level...
2       1.0  I used to like this app, and it was great for ...
3       1.0  A poor application, it requires a Facebook acc...
4       2.0  The app crashes every minute, it's annoying. T...
..      ...                                                ...
127     1.0                                  they are spammers
128     1.0                                     bad experience
129     1.0                                very bad experience
130     1.0                                    horrible !!!!!!
131     5.0                                            amazing

[130 rows x 2 columns]>


### Labeling

In [5]:
def label_reviews(rating):
    if rating in [1, 2]:
        return 'Negative'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Positive'

df['rating'] = df['rating'].apply(label_reviews)
#print(df['rating'])
print(df.head)

<bound method NDFrame.head of        rating                                            snippet
0     Neutral  Very slow, it will take weeks sometimes to get...
1    Negative  I have never been able to even reach the level...
2    Negative  I used to like this app, and it was great for ...
3    Negative  A poor application, it requires a Facebook acc...
4    Negative  The app crashes every minute, it's annoying. T...
..        ...                                                ...
127  Negative                                  they are spammers
128  Negative                                     bad experience
129  Negative                                very bad experience
130  Negative                                    horrible !!!!!!
131  Positive                                            amazing

[130 rows x 2 columns]>


In [6]:
df['rating'].value_counts()

rating
Negative    84
Positive    39
Neutral      7
Name: count, dtype: int64

In [7]:
# some text cleaning functions
def convert_to_lower(text):
    return text.lower()

def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))



", ".join(stopwords.words('english'))
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [8]:
df['snippet'] = df['snippet'].apply(lambda x: convert_to_lower(x))
df['snippet'] = df['snippet'].apply(lambda x: remove_numbers(x))
df['snippet'] = df['snippet'].apply(lambda x: remove_punctuation(x))
df['snippet'] = df['snippet'].apply(lambda x: remove_stopwords(x))
df['snippet'] = df['snippet'].apply(lambda x: remove_extra_white_spaces(x))
#df['snippet'] = df['snippet'].apply(lambda x: lemmatize_words(x))
     

In [9]:
# converting string labels to int labels

rating_map = {
    'Negative' : 0,
    'Neutral' : 1,
    'Positive' : 2
}

df['rating'] = df['rating'].map(rating_map)


In [10]:
df.head()

Unnamed: 0,rating,snippet
0,1,slow take weeks sometimes get order looks good...
1,0,never able even reach level ordering anything ...
2,0,used like app great first couple months whats ...
3,0,poor application requires facebook account sig...
4,0,app crashes every minute annoying customer ser...


In [11]:
X = df['snippet']
X.head()

0    slow take weeks sometimes get order looks good...
1    never able even reach level ordering anything ...
2    used like app great first couple months whats ...
3    poor application requires facebook account sig...
4    app crashes every minute annoying customer ser...
Name: snippet, dtype: object

In [12]:
y = df['rating']
y.head()

0    1
1    0
2    0
3    0
4    0
Name: rating, dtype: int64

In [14]:
#installing libraries
!pip install transformers
!pip install nlpaug



In [None]:
#demo of how the augumenter works
import nlpaug.augmenter.word.context_word_embs as aug
sample_text = df['snippet'].iloc[100]
sample_text

'new update amazing'

In [17]:
augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

In [18]:
import torch
augmented_sample_text = augmenter.augment(sample_text)

In [22]:
df['rating'].value_counts()

rating
0    84
2    39
1     7
Name: count, dtype: int64

In [28]:
df.head()

Unnamed: 0,rating,snippet
0,1,slow take weeks sometimes get order looks good...
1,0,never able even reach level ordering anything ...
2,0,used like app great first couple months whats ...
3,0,poor application requires facebook account sig...
4,0,app crashes every minute annoying customer ser...


In [None]:
import pandas as pd
from sklearn.utils import shuffle

def augmentMyData(df, augmenter, rating, repetitions=1, samples=200):
    augmented_texts = []
    # Select only the minority class samples
    minority_df = df[df['rating'] == rating].reset_index(drop=True) 
    for i in tqdm(np.random.randint(0, len(minority_df), samples)):
        for _ in range(repetitions):
            augmented_text = augmenter.augment(minority_df['snippet'].iloc[i])
            augmented_texts.append(augmented_text)
        return df  # Return original dataframe if no augmentation occurred
    data = {
        'rating': [rating] * len(augmented_texts),
        'snippet': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(pd.concat([df, aug_df], ignore_index=True))
    return df

In [45]:
aug_df = augmentMyData(df, augmenter, 2,samples=40)

100%|██████████| 40/40 [00:04<00:00,  8.21it/s]


In [47]:
aug_df['rating'].value_counts()

rating
0    84
2    79
1     7
Name: count, dtype: int64

In [51]:
aug_df = augmentMyData(aug_df, augmenter, 1, samples=73)

100%|██████████| 73/73 [00:44<00:00,  1.62it/s]


In [55]:
print(aug_df['rating'].value_counts())
print("Original: ", df.shape)
print("Augmented: ", aug_df.shape)

rating
0    84
1    80
2    79
Name: count, dtype: int64
Original:  (130, 2)
Augmented:  (243, 2)


In [None]:
# split the dataset before augmenting to avoid augmented data in valid set
X_train, _, y_train, _ = train_test_split(aug_df['snippet'], aug_df['rating'].values, test_size=0.1)
_, X_test, _, y_test = train_test_split(df['snippet'], df['rating'].values, test_size=0.5)