#ChatGPT Reviews Analysis with Python

In [1]:
#importing necessory Libraries to Perform analysis
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import plotly.express as px
import plotly.io as pio
pio.templates.default="plotly_white"

In [8]:
#Loading the Dataset
df = pd.read_csv("/content/chatgpt_reviews.csv")

In [9]:
df.head(10)

Unnamed: 0,Review Id,Review,Ratings,Review Date
0,6fb93778-651a-4ad1-b5ed-67dd0bd35aac,good,5.0,2024-08-23 19:30:05
1,81caeefd-3a28-4601-a898-72897ac906f5,good,5.0,2024-08-23 19:28:18
2,452af49e-1d8b-4b68-b1ac-a94c64cb1dd5,nice app,5.0,2024-08-23 19:22:59
3,372a4096-ee6a-4b94-b046-cef0b646c965,"nice, ig",5.0,2024-08-23 19:20:50
4,b0d66a4b-9bde-4b7c-8b11-66ed6ccdd7da,"this is a great app, the bot is so accurate to...",5.0,2024-08-23 19:20:39
5,9cdc3f78-15cd-4e9d-9287-31bc5af496c4,so nice app,5.0,2024-08-23 19:18:35
6,3c26ee92-4d72-45ee-9242-caab930903ca,"just love this app, or guy",4.0,2024-08-23 19:14:40
7,f293ed19-688a-42e2-8282-ddd077771a94,great app,1.0,2024-08-23 19:12:00
8,99af016c-6db2-4f24-9952-cb3bece40be8,Very impressive,5.0,2024-08-23 19:11:46
9,27dd6ea5-ada3-4795-a297-6faa7d4b6ba2,helps with coding a lot this ai is great for e...,5.0,2024-08-23 19:11:09


In [10]:
#Counting Null values
df.isnull().sum()

Unnamed: 0,0
Review Id,0
Review,3
Ratings,1
Review Date,1


In [11]:
df.describe()

Unnamed: 0,Ratings
count,147166.0
mean,4.45344
std,1.129632
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [12]:
#Replacing all the null values of Review column with Null string for Smooth Analysis
df['Review'] = df['Review'].astype(str).fillna('')

In [14]:
# importing textblob library for text analysis
from textblob import TextBlob

In [15]:
# extracting the Sentiment from the reviews and Categorizing into Three Categories
def get_sentiment(review):
    sentiment=TextBlob(review).sentiment.polarity
    if sentiment>0:
        return "positive"
    elif sentiment<0:
        return "negative"
    else:
        return "neutral"




In [16]:
df['sentiment']=df['Review'].apply(get_sentiment)


In [18]:
#Counting the sentiment of different categories
sentiment_distribution=df['sentiment'].value_counts()

In [19]:
sentiment_distribution

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,114587
neutral,25352
negative,7228


In [20]:
# importing Plotly library to perform the graph visualization task
import plotly.graph_objects  as go

fig = go.Figure(data=[go.Bar(
    x=sentiment_distribution.index,
    y=sentiment_distribution.values,
    marker_color=['green', 'gray', 'red'],  # Customize colors if desired
)])

fig.update_layout(
    title='Sentiment Distribution of ChatGPT Reviews',
    xaxis_title='Sentiment',
    yaxis_title='Number of Reviews',
    width=800,
    height=600
)

fig.show()

Analyzing What Users Like About ChatGPT


In [24]:
# filter reviews with positive sentiment
positive_reviews = df[df['sentiment'] == 'positive']['Review']

# use CountVectorizer to extract common phrases (n-grams)
vectorizer = CountVectorizer(ngram_range=(2, 3), stop_words='english', max_features=100)
X = vectorizer.fit_transform(positive_reviews)

# sum the counts of each phrase
phrase_counts = X.sum(axis=0)
phrases = vectorizer.get_feature_names_out()
phrase_freq = [(phrases[i], phrase_counts[0, i]) for i in range(len(phrases))]

# sort phrases by frequency
phrase_freq = sorted(phrase_freq, key=lambda x: x[1], reverse=True)

phrase_df = pd.DataFrame(phrase_freq, columns=['Phrase', 'Frequency'])

fig = px.bar(phrase_df,
             x='Frequency',
             y='Phrase',
             orientation='h',
             title='Top Common Phrases in Positive Reviews',
             labels={'Phrase': 'Phrase', 'Frequency': 'Frequency'},
             width=1000,
             height=600)

fig.update_layout(
    xaxis_title='Frequency',
    yaxis_title='Phrase',
    yaxis={'categoryorder':'total ascending'}
)

fig.show()

Analyzing What Users Don’t Like About ChatGPT

In [26]:
# filter reviews with negative sentiment
negative_reviews = df[df['sentiment'] == 'negative']['Review']

# use CountVectorizer to extract common phrases (n-grams) for negative reviews
X_neg = vectorizer.fit_transform(negative_reviews)

# sum the counts of each phrase in negative reviews
phrase_counts_neg = X_neg.sum(axis=0)
phrases_neg = vectorizer.get_feature_names_out()
phrase_freq_neg = [(phrases_neg[i], phrase_counts_neg[0, i]) for i in range(len(phrases_neg))]

# sort phrases by frequency
phrase_freq_neg = sorted(phrase_freq_neg, key=lambda x: x[1], reverse=True)

phrase_neg_df = pd.DataFrame(phrase_freq_neg, columns=['Phrase', 'Frequency'])

fig = px.bar(phrase_neg_df,
             x='Frequency',
             y='Phrase',
             orientation='h',
             title='Top Common Phrases in Negative Reviews',
             labels={'Phrase': 'Phrase', 'Frequency': 'Frequency'},
             width=1000,
             height=600)

fig.update_layout(
    xaxis_title='Frequency',
    yaxis_title='Phrase',
    yaxis={'categoryorder':'total ascending'}
)

fig.show()

Common Problems Faced by Users in ChatGPT

In [27]:
# grouping similar phrases into broader problem categories
problem_keywords = {
    'Incorrect Answers': ['wrong answer', 'gives wrong', 'incorrect', 'inaccurate', 'wrong'],
    'App Performance': ['slow', 'lag', 'crash', 'bug', 'freeze', 'loading', 'glitch', 'worst app', 'bad app', 'horrible', 'terrible'],
    'User Interface': ['interface', 'UI', 'difficult to use', 'confusing', 'layout'],
    'Features Missing/Not Working': ['feature missing', 'not working', 'missing', 'broken', 'not available'],
    'Quality of Responses': ['bad response', 'useless', 'poor quality', 'irrelevant', 'nonsense']
}

# initialize a dictionary to count problems
problem_counts = {key: 0 for key in problem_keywords.keys()}

# count occurrences of problem-related phrases in negative reviews
for phrase, count in phrase_freq_neg:
    for problem, keywords in problem_keywords.items():
        if any(keyword in phrase for keyword in keywords):
            problem_counts[problem] += count
            break

problem_df = pd.DataFrame(list(problem_counts.items()), columns=['Problem', 'Frequency'])

fig = px.bar(problem_df,
             x='Frequency',
             y='Problem',
             orientation='h',
             title='Common Problems Faced by Users in ChatGPT',
             labels={'Problem': 'Problem', 'Frequency': 'Frequency'},
             width=1000,
             height=600)

fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis_title='Frequency',
    yaxis_title='Problem',
    yaxis={'categoryorder':'total ascending'}
)

fig.show()

Analyzing How Reviews Changed Over Time

In [32]:
# convert 'Review Date' to datetime format
df['Review Date'] = pd.to_datetime(df['Review Date'])

# aggregate sentiment counts by date
sentiment_over_time = df.groupby([df['Review Date'].dt.to_period('M'), 'sentiment']).size().unstack(fill_value=0)

# convert the period back to datetime for plotting
sentiment_over_time.index = sentiment_over_time.index.to_timestamp()

fig = go.Figure()

fig.add_trace(go.Scatter(x=sentiment_over_time.index, y=sentiment_over_time['positive'],
                         mode='lines', name='positive', line=dict(color='green')))
fig.add_trace(go.Scatter(x=sentiment_over_time.index, y=sentiment_over_time['neutral'],
                         mode='lines', name='neutral', line=dict(color='gray')))
fig.add_trace(go.Scatter(x=sentiment_over_time.index, y=sentiment_over_time['negative'],
                         mode='lines', name='negative', line=dict(color='red')))

fig.update_layout(
    title='Sentiment Trends Over Time',
    xaxis_title='Date',
    yaxis_title='Number of Reviews',
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend_title_text='Sentiment',
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)

fig.show()

Analyzing How Often Users Promote ChatGPT

In [33]:
# define the categories based on the ratings
df['NPS Category'] = df['Ratings'].apply(lambda x: 'Promoter' if x == 5 else ('Passive' if x == 4 else 'Detractor'))

# calculate the percentage of each category
nps_counts = df['NPS Category'].value_counts(normalize=True) * 100

# calculate NPS
nps_score = nps_counts.get('Promoter', 0) - nps_counts.get('Detractor', 0)

# display the NPS Score
nps_score

60.95727982496076

The Net Promoter Score (NPS) for ChatGPT, based on the ratings provided in the dataset, is approximately 60.95. It indicates a strong likelihood that users would recommend ChatGPT to others, as a score above 50 is generally considered excellent.