In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string



In [8]:
df = pd.read_csv("chatgpt_reviews.csv")

df.head()

Unnamed: 0,date,title,review,rating
0,2023-05-21 16:42:24,Much more accessible for blind users than the ...,Up to this point I’ve mostly been using ChatGP...,4
1,2023-07-11 12:24:19,"Much anticipated, wasn’t let down.",I’ve been a user since it’s initial roll out a...,4
2,2023-05-19 10:16:22,"Almost 5 stars, but… no search function",This app would almost be perfect if it wasn’t ...,4
3,2023-05-27 21:57:27,"4.5 stars, here’s why","I recently downloaded the app and overall, it'...",4
4,2023-06-09 07:49:36,"Good, but Siri support would take it to the ne...",I appreciate the devs implementing Siri suppor...,4


In [9]:
df.rating.value_counts()

rating
5    1134
1     495
4     304
3     220
2     139
Name: count, dtype: int64

In [12]:
df.drop(columns=["date", "title"], inplace=True)

df.head()


Unnamed: 0,review,rating
0,Up to this point I’ve mostly been using ChatGP...,4
1,I’ve been a user since it’s initial roll out a...,4
2,This app would almost be perfect if it wasn’t ...,4
3,"I recently downloaded the app and overall, it'...",4
4,I appreciate the devs implementing Siri suppor...,4


In [13]:
df[df.duplicated()]

Unnamed: 0,review,rating
1351,Thanks,5
1508,Please,5
1783,Love it,5
1800,First,5
1801,First,5
1802,First,5
1827,Why?,1
1853,Very good,5
1866,Pretty good,5
1869,Great,5


In [14]:
df.drop_duplicates(inplace=True)

df[df.duplicated()]

Unnamed: 0,review,rating


In [15]:
df.isna().sum()

review    0
rating    0
dtype: int64

In [16]:
df.isnull().sum()

review    0
rating    0
dtype: int64

In [17]:
df.rating.unique()

array([4, 1, 3, 5, 2], dtype=int64)

In [19]:
def case_folding(text):
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"\n+", "", text)
    text = re.sub(r"RT[\s]+", "", text)
    text = re.sub(r"https?://\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U00010000-\U0010ffff"
                               "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r"", text)

    return text.lower()

df.review = df.review.apply(case_folding)

df.review


0       up to this point i’ve mostly been using chatgp...
1       i’ve been a user since it’s initial roll out a...
2       this app would almost be perfect if it wasn’t ...
3       i recently downloaded the app and overall its ...
4       i appreciate the devs implementing siri suppor...
                              ...                        
2287    this has to be a beginning to something crazy ...
2288    i’ve been using chat and have been a proud pre...
2289    the chatgpt ios app is an outstanding product ...
2290    sam altman’s blatant attempt at regulatory cap...
2291                                                     
Name: review, Length: 2262, dtype: object

In [20]:
def tokenize(text):
    return word_tokenize(text)

df.review = df.review.apply(tokenize)

df.review

0       [up, to, this, point, i, ’, ve, mostly, been, ...
1       [i, ’, ve, been, a, user, since, it, ’, s, ini...
2       [this, app, would, almost, be, perfect, if, it...
3       [i, recently, downloaded, the, app, and, overa...
4       [i, appreciate, the, devs, implementing, siri,...
                              ...                        
2287    [this, has, to, be, a, beginning, to, somethin...
2288    [i, ’, ve, been, using, chat, and, have, been,...
2289    [the, chatgpt, ios, app, is, an, outstanding, ...
2290    [sam, altman, ’, s, blatant, attempt, at, regu...
2291                                                   []
Name: review, Length: 2262, dtype: object

In [22]:
norm_word = {"iam":"i", "nt":"not", "t":"not"}

def normalize(doc):
    return [norm_word[word] if word in norm_word else word for word in doc]

df.review = df.review.apply(normalize)
df.review

0       [up, to, this, point, i, ’, ve, mostly, been, ...
1       [i, ’, ve, been, a, user, since, it, ’, s, ini...
2       [this, app, would, almost, be, perfect, if, it...
3       [i, recently, downloaded, the, app, and, overa...
4       [i, appreciate, the, devs, implementing, siri,...
                              ...                        
2287    [this, has, to, be, a, beginning, to, somethin...
2288    [i, ’, ve, been, using, chat, and, have, been,...
2289    [the, chatgpt, ios, app, is, an, outstanding, ...
2290    [sam, altman, ’, s, blatant, attempt, at, regu...
2291                                                   []
Name: review, Length: 2262, dtype: object

In [23]:
stopw = set(stopwords.words())

def stopword(doc):
    return [word for word in doc if word not in stopw]

df.review = df.review.apply(stopword)
df.review

0       [point, ’, chatgpt, windows, desktop, google, ...
1       [’, user, ’, initial, roll, waiting, mobile, a...
2       [app, perfect, ’, “, search, ”, function, imag...
3       [recently, downloaded, app, great, platform, e...
4       [appreciate, devs, implementing, siri, support...
                              ...                        
2287    [beginning, crazy, letting, ’, ’, day, ima, fi...
2288    [’, chat, proud, premium, subscriber, awhile, ...
2289    [chatgpt, ios, app, outstanding, product, seam...
2290    [sam, altman, ’, blatant, attempt, regulatory,...
2291                                                   []
Name: review, Length: 2262, dtype: object

In [24]:
stemmer = PorterStemmer()
def stemming(doc):
    return [stemmer.stem(word) for word in doc]

df.review = df.review.apply(stemming)
df.review

0       [point, ’, chatgpt, window, desktop, googl, ch...
1       [’, user, ’, initi, roll, wait, mobil, applic,...
2       [app, perfect, ’, “, search, ”, function, imag...
3       [recent, download, app, great, platform, excel...
4       [appreci, dev, implement, siri, support—it, en...
                              ...                        
2287    [begin, crazi, let, ’, ’, day, ima, find, comm...
2288    [’, chat, proud, premium, subscrib, awhil, hel...
2289    [chatgpt, io, app, outstand, product, seamless...
2290    [sam, altman, ’, blatant, attempt, regulatori,...
2291                                                   []
Name: review, Length: 2262, dtype: object