In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw review export from the working directory.
# NOTE(review): ISO-8859-1 decodes any byte sequence without raising, so a
# UTF-8 file read this way would yield mojibake instead of an error —
# confirm the file's real encoding.
df = pd.read_csv('reviews.csv', encoding='ISO-8859-1')

In [3]:
# Preview the loaded frame (pandas elides the middle rows when rendering).
df

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,
...,...,...,...,...,...
61589,2022-01-01 03:01:29,Even though it was communicated that lyrics fe...,1,6,
61590,2022-01-01 02:13:40,"Use to be sooo good back when I had it, and wh...",1,0,
61591,2022-01-01 01:02:29,This app would be good if not for it taking ov...,2,10,
61592,2022-01-01 00:49:23,The app is good hard to navigate and won't jus...,2,1,


In [4]:
# Keep only the text and its label; every other column is dropped from `df`.
df = df.loc[:, ['Review', 'Rating']]

In [5]:
# Confirm that only the Review and Rating columns remain.
df

Unnamed: 0,Review,Rating
0,"Great music service, the audio is high quality...",5
1,Please ignore previous negative rating. This a...,5
2,"This pop-up ""Get the best Spotify experience o...",4
3,Really buggy and terrible to use as of recently,1
4,Dear Spotify why do I get songs that I didn't ...,1
...,...,...
61589,Even though it was communicated that lyrics fe...,1
61590,"Use to be sooo good back when I had it, and wh...",1
61591,This app would be good if not for it taking ov...,2
61592,The app is good hard to navigate and won't jus...,2


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [8]:
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Lower-case each document and re-join its NLTK tokens with single spaces.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    maps every document ``d`` to
    ``' '.join(t.lower() for t in word_tokenize(d))``.
    Example: ``'An apple a day, keeps the doctor away!'`` becomes
    ``'an apple a day , keeps the doctor away !'``.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` so the class works as a pipeline step."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return normalized copies of the documents in ``X``; ``X`` is not modified.

        Parameters
        ----------
        X : iterable of str
            Documents to normalize (list, 1-D numpy array or pandas Series).
        y : ignored
        **fit_params : ignored

        Returns
        -------
        list, numpy.ndarray or pandas.Series
            Same container kind as ``X`` for arrays and Series (arrays come
            back with ``dtype=object``); a list for any other iterable.
        """
        # Iterate over the documents themselves rather than writing back via
        # X_copy[i]: positional assignment truncates results in fixed-width
        # ('<U..') numpy arrays and is label-based on a pandas Series.
        normalized = [
            ' '.join(token.lower() for token in word_tokenize(document))
            for document in X
        ]
        if isinstance(X, np.ndarray):
            # dtype=object so strings longer than the inputs are kept intact.
            return np.array(normalized, dtype=object)
        if isinstance(X, pd.Series):
            return pd.Series(normalized, index=X.index, name=X.name, dtype=object)
        return normalized

In [9]:
# Stand-alone instance used for the quick sanity check in the next cell.
norm = TextNormalizer()

In [10]:
# Sanity check: tokens are lower-cased and punctuation becomes separate tokens.
norm.fit_transform(['An apple a day, keeps the doctor away!'])

['an apple a day , keeps the doctor away !']

In [11]:
class WordExtractor(BaseEstimator, TransformerMixin):
    """Drop stop words and corpus-level hapaxes from tokenized documents.

    ``fit`` counts every token of the training corpus and records the
    hapaxes (tokens seen exactly once overall). ``transform`` removes those
    hapaxes and the configured stop words from each document. Matching is
    case-sensitive: ``'Cherry'`` and ``'cherry'`` are counted separately.

    Parameters
    ----------
    stop_words : iterable of str
        Tokens that are always removed.

    Attributes
    ----------
    general_freq : nltk.probability.FreqDist
        Token counts accumulated over the corpus passed to ``fit``.
    hapaxes : list of str
        Tokens that occurred exactly once in that corpus.
    """

    def __init__(self, stop_words):
        # Stored untouched so scikit-learn's get_params/clone keep working.
        self.stop_words = stop_words

    def fit(self, X, y=None, **fit_params):
        """Count tokens over all documents in ``X`` and record the hapaxes."""
        self.general_freq = FreqDist()
        for document in X:
            # Updating with the token list adds one count per occurrence, so
            # no intermediate per-document FreqDist is needed.
            self.general_freq.update(word_tokenize(document))
        self.hapaxes = self.general_freq.hapaxes()
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the documents of ``X`` with hapaxes and stop words removed.

        Must be called after ``fit``. ``X`` is not modified.

        Returns
        -------
        list, numpy.ndarray or pandas.Series
            Same container kind as ``X`` for arrays and Series (arrays come
            back with ``dtype=object``); a list for any other iterable.
            A document whose tokens are all removed becomes ``''``.
        """
        # Build one set up front: testing each token against two Python lists
        # is linear in the vocabulary size per token and dominates the runtime
        # on a large corpus.
        excluded = set(self.hapaxes).union(self.stop_words or ())
        filtered = [
            ' '.join(token for token in word_tokenize(document)
                     if token not in excluded)
            for document in X
        ]
        # Rebuild the container instead of assigning through X_copy[i], which
        # truncates in fixed-width numpy arrays and is label-based on Series.
        if isinstance(X, np.ndarray):
            return np.array(filtered, dtype=object)
        if isinstance(X, pd.Series):
            return pd.Series(filtered, index=X.index, name=X.name, dtype=object)
        return filtered

In [12]:
# English stop-word list from NLTK's `stopwords` corpus.
# NOTE(review): presumably needs the corpus to be available locally
# (nltk.download('stopwords')) — no download step is visible here; confirm.
stop_words = stopwords.words('english')

In [13]:
# Stand-alone instance used for the toy-corpus check below.
word_extractor = WordExtractor(stop_words)

In [14]:
# Toy corpus for exercising WordExtractor: 'John', 'Ann' and 'likes' each occur
# twice; 'Cherry' and 'cherry' differ only by case.
corpus = [
    'John is a pretty boy',
    'Ann likes John',
    'Ann likes cherry',
    'Cherry is red'
]

In [15]:
# Fit on the toy corpus and filter it in one step to inspect what survives.
word_extractor.fit_transform(corpus)

['John', 'Ann likes John', 'Ann likes', '']

In [16]:
class ApplyStemmer(BaseEstimator, TransformerMixin):
    """Replace every token of every document with its stem.

    Example with a ``PorterStemmer``:
    ``'An apple a day, keeps the doctor away!'`` becomes
    ``'an appl a day , keep the doctor away !'``.

    Parameters
    ----------
    stemmer : object
        Any object exposing ``stem(token) -> str``.
    """

    def __init__(self, stemmer):
        # Stored untouched so scikit-learn's get_params/clone keep working.
        self.stemmer = stemmer

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` so the class works as a pipeline step."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return stemmed copies of the documents in ``X``; ``X`` is not modified.

        Returns
        -------
        list, numpy.ndarray or pandas.Series
            Same container kind as ``X`` for arrays and Series (arrays come
            back with ``dtype=object``); a list for any other iterable.
        """
        stem = self.stemmer.stem  # bound once; called for every token
        stemmed = [
            ' '.join(stem(token) for token in word_tokenize(document))
            for document in X
        ]
        # Rebuild the container instead of assigning through X_copy[i], which
        # truncates in fixed-width numpy arrays and is label-based on Series.
        if isinstance(X, np.ndarray):
            return np.array(stemmed, dtype=object)
        if isinstance(X, pd.Series):
            return pd.Series(stemmed, index=X.index, name=X.name, dtype=object)
        return stemmed

In [17]:
# Porter stemmer instance handed to ApplyStemmer in the next cell.
porter_stemmer = PorterStemmer()

In [18]:
# Stand-alone instance used for the quick sanity check in the next cell.
apply_stemmer = ApplyStemmer(porter_stemmer)

In [19]:
# Sanity check: each token is replaced by its Porter stem.
apply_stemmer.fit_transform(['An apple a day, keeps the doctor away!'])

['an appl a day , keep the doctor away !']

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# End-to-end rating classifier: three custom text-cleaning steps feed a
# bag-of-words vectorizer and a logistic-regression model.
pipe = Pipeline([
    # Lower-case and re-tokenize so later steps see uniform tokens.
    ("norm", TextNormalizer()),
    # Remove stop words and tokens that occur only once in the fitted corpus.
    ("extractor", WordExtractor(stop_words)),
    # Collapse inflected forms onto their Porter stem.
    ("stemmer", ApplyStemmer(PorterStemmer())),
    # Turn the cleaned text into token-count features.
    ("vectorizer", CountVectorizer()),
    # Classifier with library-default settings.
    ("logic", LogisticRegression())
])

In [39]:
# Plain numpy arrays: the custom transformers then see positional data
# rather than a label-indexed pandas Series.
X = df['Review'].to_numpy()   # review texts (features)
y = df['Rating'].to_numpy()   # ratings (target)

In [40]:
# Hold out a test set with scikit-learn's default split size. The fixed
# random_state makes the shuffle, and therefore every downstream metric,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [41]:
# Fit all preprocessing steps and the classifier on the training split.
# Every custom transformer tokenizes each training review, so this cell may
# take a while on the full dataset.
pipe.fit(X_train, y_train)

KeyboardInterrupt: 