# Importing own libraries

In [28]:
import pandas as pd
from preprocess.clean import lowering, removeChatWords, removeEmojis, removeHTMLTags, removePuncFast, removeURLs, removeStopwords

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Dataset

In [3]:
df = pd.read_csv("spotify_reviews.csv")
df.head()

Unnamed: 0,Review,label
0,"Great music service, the audio is high quality...",POSITIVE
1,Please ignore previous negative rating. This a...,POSITIVE
2,"This pop-up ""Get the best Spotify experience o...",NEGATIVE
3,Really buggy and terrible to use as of recently,NEGATIVE
4,Dear Spotify why do I get songs that I didn't ...,NEGATIVE


In [4]:
df.shape

(52702, 2)

In [5]:
print(df.shape)
df = df.dropna(how='any')
print(df.shape)

(52702, 2)
(52686, 2)


# Cleaning

In [8]:
df['Review'] = df['Review'].apply(removeChatWords)
df['Review'] = df['Review'].apply(removeEmojis)
df['Review'] = df['Review'].apply(removeHTMLTags)
df['Review'] = df['Review'].apply(removeStopwords)
df['Review'] = df['Review'].apply(removeURLs)
df['Review'] = df['Review'].apply(removePuncFast)
df['Review'] = df['Review'].apply(lowering)

Tokenisation

In [33]:
reviews = df.head(20)['Review']
# reviews

tokens = []

for rev in reviews:
    tok = word_tokenize(rev)
    tokens.append(tok)

# tot_reviews = len(tokens)
# tot_reviews
print(tokens)

[['great', 'music', 'service', 'audio', 'high', 'quality', 'app', 'easy', 'use', 'also', 'quick', 'friendly', 'support'], ['please', 'ignore', 'previous', 'negative', 'rating', 'this', 'app', 'super', 'great', 'i', 'give', 'five', 'stars'], ['this', 'popup', 'get', 'best', 'spotify', 'experience', 'android', '12', 'annoying', 'please', 'lets', 'get', 'rid', 'this'], ['really', 'buggy', 'terrible', 'use', 'recently'], ['dear', 'spotify', 'i', 'get', 'songs', 'i', 'put', 'playlist', 'and', 'shuffle', 'play'], ['the', 'player', 'controls', 'sometimes', 'disappear', 'reason', 'app', 'restart', 'forgets', 'i', 'playing', 'fixes', 'issue'], ['i', 'love', 'selection', 'lyrics', 'provided', 'song', 'listening', 'to'], ['still', 'extremely', 'slow', 'changing', 'storage', 'external', 'sd', 'card', 'im', 'convinced', 'done', 'purpose', 'spotify', 'knows', 'issue', 'done', 'nothing', 'solve', 'it', 'over', 'time', 'i', 'changed', 'sd', 'cards', 'faster', 'read', 'write', 'speedsall', 'samsung', '

Stemming

- To convert different words into their stem/root word
- for eg -> walk/walking/walked -> walk (root word)
- stemming is applied on individual tokens and the token gets converted into stemmed(root) token
- implemented as an algorithmic based technique (thus it is fast)

- Disadvantages -> root word may be not in that particular language
- Advantages -> fast compared to lemmatisation

In [30]:
porterStemmer = PorterStemmer()

def stemming(tokenList):
    for i in range(len(tokenList)):
        stemmedToken = porterStemmer.stem(tokenList[i])
        tokenList[i] = stemmedToken

tokenStem = tokens

for tok in tokenStem:
    stemming(tok)

# now all the individual tokens are stemmed in this list
print(tokenStem)


[['great', 'music', 'servic', 'audio', 'high', 'qualiti', 'app', 'easi', 'use', 'also', 'quick', 'friendli', 'support'], ['pleas', 'ignor', 'previou', 'neg', 'rate', 'thi', 'app', 'super', 'great', 'i', 'give', 'five', 'star'], ['thi', 'popup', 'get', 'best', 'spotifi', 'experi', 'android', '12', 'annoy', 'pleas', 'let', 'get', 'rid', 'thi'], ['realli', 'buggi', 'terribl', 'use', 'recent'], ['dear', 'spotifi', 'i', 'get', 'song', 'i', 'put', 'playlist', 'and', 'shuffl', 'play'], ['the', 'player', 'control', 'sometim', 'disappear', 'reason', 'app', 'restart', 'forget', 'i', 'play', 'fix', 'issu'], ['i', 'love', 'select', 'lyric', 'provid', 'song', 'listen', 'to'], ['still', 'extrem', 'slow', 'chang', 'storag', 'extern', 'sd', 'card', 'im', 'convinc', 'done', 'purpos', 'spotifi', 'know', 'issu', 'done', 'noth', 'solv', 'it', 'over', 'time', 'i', 'chang', 'sd', 'card', 'faster', 'read', 'write', 'speedsal', 'samsung', 'brand', 'and', 'pleas', 'add', 'dont', 'like', 'song', 'never', 'appea

Lemmatisation

- same functioning as stemming
- Advantages -> root word is in the language
- Disadvantages -> slow compared to stemming
- implemented as a searching technique from python disctionary

In [36]:
lemma = WordNetLemmatizer()

def lemmatisation(tokenList):
    for i in range(len(tokenList)):
        lemmaToken = lemma.lemmatize(tokenList[i],pos='v')
        tokenList[i] = lemmaToken

tokenLemma = tokens

for tok in tokenLemma:
    stemming(tok)

# now all the individual tokens are stemmed in this list
print(tokenLemma)

[['great', 'music', 'servic', 'audio', 'high', 'qualiti', 'app', 'easi', 'use', 'also', 'quick', 'friendli', 'support'], ['plea', 'ignor', 'previou', 'neg', 'rate', 'thi', 'app', 'super', 'great', 'i', 'give', 'five', 'star'], ['thi', 'popup', 'get', 'best', 'spotifi', 'experi', 'android', '12', 'annoy', 'plea', 'let', 'get', 'rid', 'thi'], ['realli', 'buggi', 'terribl', 'use', 'recent'], ['dear', 'spotifi', 'i', 'get', 'song', 'i', 'put', 'playlist', 'and', 'shuffl', 'play'], ['the', 'player', 'control', 'sometim', 'disappear', 'reason', 'app', 'restart', 'forget', 'i', 'play', 'fix', 'issu'], ['i', 'love', 'select', 'lyric', 'provid', 'song', 'listen', 'to'], ['still', 'extrem', 'slow', 'chang', 'storag', 'extern', 'sd', 'card', 'im', 'convinc', 'done', 'purpo', 'spotifi', 'know', 'issu', 'done', 'noth', 'solv', 'it', 'over', 'time', 'i', 'chang', 'sd', 'card', 'faster', 'read', 'write', 'speedsal', 'samsung', 'brand', 'and', 'plea', 'add', 'dont', 'like', 'song', 'never', 'appear', 