# Importing the libraries

In [20]:
import string
import re
import pandas as pd
import nltk

from nltk.corpus import stopwords

# Dataset

In [3]:
df = pd.read_csv("spotify_reviews.csv")
df.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",POSITIVE
1,Please ignore previous negative rating. This a...,POSITIVE
2,"This pop-up ""Get the best Spotify experience o...",NEGATIVE
3,Really buggy and terrible to use as of recently,NEGATIVE
4,Dear Spotify why do I get songs that I didn't ...,NEGATIVE


In [4]:
df.shape

(52702, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52702 entries, 0 to 52701
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  52686 non-null  object
 1   label   52702 non-null  object
dtypes: object(2)
memory usage: 823.6+ KB


In [6]:
df['label'].value_counts()

label
NEGATIVE    29423
POSITIVE    23279
Name: count, dtype: int64

In [7]:
reviews = df['Review']

# print(type(df['Review'][0])) -> string
# print(df['Review'][0]) -> first review

# there are some null values in df (16 values)
# for rev in df['Review']:
#     if(type(rev) == float):
#         print(rev)

# remove the empty rows from the df
df = df.dropna(how='any')
df.shape

(52686, 2)

# PreProcessing of text

Lowering of the text
1. python is a case sensitive language so it treats two same words differently when in lower or upper cases
2. lower casing reduces the vocab size and reduces complexity
3. for eg - Basic and basic will be treated differently

In [8]:
def lowering(text):
    return text.lower()

# for datasets -> df['Review'].str.lower()

text = 'I have a Good Sense of Humour'
print(lowering(text))

print("--------")

# print(df['Review'][0].lower())
df['Review'] = df['Review'].apply(lowering)
df['Review']

i have a good sense of humour
--------


0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
52697                                         yes the best
52698    spotify won my heart in feb 2024 you won my he...
52699    i tried to open the app and it wont open i res...
52700                                                good 
52701    nice app to play music and at very affordable ...
Name: Review, Length: 52686, dtype: object

Remove HTML Tags
1. Decrease complexity
2. There is no semantic meaning in the tags

In [9]:
regex = '<.*?>'

def removeHTMLTags(text):
    pattern = re.compile(regex)
    return pattern.sub(r'', text)

text = '<html><body><p>this is a paragraph<p/>hello<br>break text<br/></body>'
print(removeHTMLTags(text))
print('--------')

df['Review'] = df['Review'].apply(removeHTMLTags)
df['Review']

this is a paragraphhellobreak text
--------


0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
52697                                         yes the best
52698    spotify won my heart in feb 2024 you won my he...
52699    i tried to open the app and it wont open i res...
52700                                                good 
52701    nice app to play music and at very affordable ...
Name: Review, Length: 52686, dtype: object

Remove URLs

In [10]:
regex = 'https?://\S+|www\.\S+'

def removeURLs(text):
    pattern = re.compile(regex)
    return pattern.sub(r'', text)

text = 'www.google.com and https://youtube.com are url of google and youtube'
print(removeURLs(text))

df['Review'] = df['Review'].apply(removeURLs)
df['Review']

 and  are url of google and youtube


0        great music service, the audio is high quality...
1        please ignore previous negative rating. this a...
2        this pop-up "get the best spotify experience o...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didn't ...
                               ...                        
52697                                         yes the best
52698    spotify won my heart in feb 2024 you won my he...
52699    i tried to open the app and it wont open i res...
52700                                                good 
52701    nice app to play music and at very affordable ...
Name: Review, Length: 52686, dtype: object

Remove Punctuation
1. decreases the vocab size

In [11]:
punctuation = string.punctuation

print(punctuation)

# slow method
def removePuncSlow(text):
    for p in punctuation:
        text = text.replace(p,"")
    return text

# fast method
def removePuncFast(text):
    return text.translate(str.maketrans('','',punctuation))

text = "Hello! How are you? I am fine.#"
print(removePuncFast(text))

df['Review'] = df['Review'].apply(removePuncFast)
df['Review']

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Hello How are you I am fine


0        great music service the audio is high quality ...
1        please ignore previous negative rating this ap...
2        this popup get the best spotify experience on ...
3          really buggy and terrible to use as of recently
4        dear spotify why do i get songs that i didnt p...
                               ...                        
52697                                         yes the best
52698    spotify won my heart in feb 2024 you won my he...
52699    i tried to open the app and it wont open i res...
52700                                                good 
52701    nice app to play music and at very affordable ...
Name: Review, Length: 52686, dtype: object

Removing the Stopwords

In [12]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pulkitsinghal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stopwords = stopwords.words('english')
size = len(stopwords)
print('Size of Stopwords in english', size)

def removeStopwords(text):
    new_text = []
    
    for word in text.split(" "):
        if word in stopwords:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

text = 'there is a good chance of india winning this match'
print(removeStopwords(text))
print('-------')

df['Review'] = df['Review'].apply(removeStopwords)
df['Review']

Size of Stopwords in english 179
   good chance  india winning  match
-------


0        great music service  audio  high quality   app...
1        please ignore previous negative rating  app  s...
2         popup get  best spotify experience  android 1...
3                   really buggy  terrible  use   recently
4        dear spotify    get songs   didnt put   playli...
                               ...                        
52697                                            yes  best
52698    spotify   heart  feb 2024    heart  music  lyr...
52699     tried  open  app   wont open  restarted  phon...
52700                                                good 
52701             nice app  play music    affordable price
Name: Review, Length: 52686, dtype: object

Removing/Handling Emojis

In [17]:
def removeEmojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

text = 'Hello! 😊It is a good day😊😊'
print(removeEmojis(text))

df['Review'] = df['Review'].apply(removeEmojis)
df['Review']

Hello! It is a good day


0        great music service  audio  high quality   app...
1        please ignore previous negative rating  app  s...
2         popup get  best spotify experience  android 1...
3                   really buggy  terrible  use   recently
4        dear spotify    get songs   didnt put   playli...
                               ...                        
52697                                            yes  best
52698    spotify   heart  feb 2024    heart  music  lyr...
52699     tried  open  app   wont open  restarted  phon...
52700                                                good 
52701             nice app  play music    affordable price
Name: Review, Length: 52686, dtype: object

In [21]:
chat_words = {"LMAO": "laughing my ass off", "AFAIK": "as far as i know"} # so on for many

def removeChatWords(text):
    words = []
    for word in text.split(" "):
        if word.upper() in chat_words:
            words.append(chat_words[word.upper()])
        else:
            words.append(word)
    new_text = " ".join(words)
    return new_text

text = 'LMAO funny'
print(removeChatWords(text))

df['Review'] = df['Review'].apply(removeChatWords)
df['Review']

laughing my ass off funny


0        great music service  audio  high quality   app...
1        please ignore previous negative rating  app  s...
2         popup get  best spotify experience  android 1...
3                   really buggy  terrible  use   recently
4        dear spotify    get songs   didnt put   playli...
                               ...                        
52697                                            yes  best
52698    spotify   heart  feb 2024    heart  music  lyr...
52699     tried  open  app   wont open  restarted  phon...
52700                                                good 
52701             nice app  play music    affordable price
Name: Review, Length: 52686, dtype: object