In [15]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go  #Create detailed, custom graphs.
from plotly.subplots import make_subplots  # Create Subplots
import plotly.io as pio
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer  
from textblob import TextBlob
pio.templates.default = "plotly_white" # Clean chart background

In [16]:
import plotly.io as pio
# Renderer used by every fig.show() call in this notebook.
# The earlier 'notebook_connected' assignment was removed: it was overwritten
# by this line before any figure was rendered, so it never took effect.
pio.renderers.default = 'iframe_connected'

In [17]:
# Location of the reviews CSV. Set the CHATGPT_REVIEWS_CSV environment variable to
# run the notebook on another machine; when it is unset the original path is used.
import os
DATA_PATH=os.environ.get('CHATGPT_REVIEWS_CSV',r'C:\Users\Satheesh.M\Downloads\archive (5)\chatgpt_reviews.csv')
df=pd.read_csv(DATA_PATH)

In [18]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,32506090-1b40-4d11-8f97-9dae275f0a3e,Owami Ngalwana,I can't wait to see my pictures,5,0,1.2025.084,2025-04-07 14:04:04,1.2025.084
1,1c0db8e7-9f79-4422-8d67-1f03e67c341a,Ranjeet Singh,goo ha eh,1,0,1.2025.084,2025-04-07 14:04:04,1.2025.084
2,f6e50146-850f-4980-bc6e-1e6284504a70,Rhoman Sabbir,Nice,5,0,,2025-04-07 14:04:01,
3,5c27a028-3cb5-4f81-aad0-0d8016ef0e1e,Ah Sohag,wow,4,0,1.2025.084,2025-04-07 14:04:00,1.2025.084
4,46f8ec33-fc5f-4edc-a7d3-a870a1658e37,Arjun Sri,this apps feels me like a bro,5,0,1.2025.084,2025-04-07 14:03:53,1.2025.084


Data Preprocessing:

In [19]:
df.isnull().sum()

reviewId                    0
userName                    2
content                    12
score                       0
thumbsUpCount               0
reviewCreatedVersion    37906
at                          0
appVersion              37906
dtype: int64

In [20]:
# Replace missing review text with an empty string, then force everything to str.
# The order matters: astype(str) turns NaN into the literal text 'nan', after
# which fillna('') has nothing left to fill.
df['content']=df['content'].fillna('').astype(str)

In [21]:
df.isnull().sum()

reviewId                    0
userName                    2
content                     0
score                       0
thumbsUpCount               0
reviewCreatedVersion    37906
at                          0
appVersion              37906
dtype: int64

In [22]:
# Rows whose review text contains the substring 'gibli' (case-sensitive match).
# NOTE(review): `gibli` is assigned again near the end of the notebook before it is
# read anywhere in between, so this early result looks unused — confirm before removing.
gibli=df[df['content'].str.contains('gibli',na=False)]

In [23]:
scores=df['score'].value_counts().reset_index()
scores

Unnamed: 0,score,count
0,5,363942
1,4,51897
2,1,28010
3,3,19350
4,2,8186


Score Distribution Visualization:

In [24]:
# Green palette handed to the pie chart as its colour sequence.
custom_colors=['#204D00','#3D7317','#63993D','#87BB62','#AFDC8F']

# Pie chart of the review count for each star rating.
fig=px.pie(scores,names='score',values='count',title='Score Distribution ',color_discrete_sequence=custom_colors)
fig.update_traces(textinfo='percent+label')  # show the rating and its share on each slice
fig.update_layout(width=500,height=500)
fig.show()

# Checking Polarity: Positive / Negative / Neutral

Sentiment Analysis:

Method 1

In [25]:
df.isnull().sum()

reviewId                    0
userName                    2
content                     0
score                       0
thumbsUpCount               0
reviewCreatedVersion    37906
at                          0
appVersion              37906
dtype: int64

In [26]:
def get_sentiment(review):
    """Label a review by the sign of its TextBlob polarity.

    Returns 'Positive' when the polarity is greater than zero, "Negative" when
    it is less than zero, and "Neutral" otherwise.
    """
    polarity_score=TextBlob(review).polarity
    if polarity_score>0:
        return 'Positive'
    return "Negative" if polarity_score<0 else "Neutral"

In [None]:
df['Sentiment']=df['content'].apply(get_sentiment)

In [None]:
df.head()

Method 2

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer=SentimentIntensityAnalyzer()  # created once and reused for every review

def get_sentiment_analyze(review):
    """Label a review from the analyzer's 'compound' polarity score.

    Returns 'Positive' for a compound score of 0.05 or more, 'Negative' for
    -0.05 or less, and "Neutral" for anything in between.
    """
    compound=analyzer.polarity_scores(review)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return "Neutral"


In [None]:
df['Sentiment']=df['content'].apply(get_sentiment_analyze)

In [None]:
df.head()

In [None]:
ex1=get_sentiment_analyze("This is a phone. It works.")
ex2=get_sentiment_analyze("I don't like this")
ex3=get_sentiment_analyze("I am extremely happy with my purchase!")
ex1,ex2,ex3

In [None]:
sentiment_distribution=df['Sentiment'].value_counts().reset_index()

In [None]:
# Bar chart of the number of reviews per sentiment label.
# Colours are looked up by label instead of by position: value_counts() orders the
# rows by frequency, so a fixed ['green','gold','red'] list can colour the wrong bar.
sentiment_colors={'Positive':'green','Neutral':'gold','Negative':'red'}
fig=go.Figure(
    data=[
        go.Bar(
            x=sentiment_distribution['Sentiment'],
            y=sentiment_distribution['count'],
            # any unexpected label falls back to gray
            marker_color=[sentiment_colors.get(label,'gray') for label in sentiment_distribution['Sentiment']],
            width=0.5
        )
    ]
)
fig.update_layout(
    title='Sentiment Distribution of ChatGPT Reviews',
    xaxis_title='Sentiment',
    yaxis_title='Number Of Reviews',
    width=700,
    height=550
)
fig.show()

# Filter reviews with Positive sentiment

In [None]:
positive_reviews=df[df['Sentiment']=="Positive"]['content']
positive_reviews

In [None]:
# use CountVectorizer to extract common phrases (n-grams)
vectorizer=CountVectorizer(ngram_range=(2,3),stop_words='english',max_features=700)
postive_packed=vectorizer.fit_transform(positive_reviews)

In [None]:
postive_packed

In [None]:
vectorizer.get_feature_names_out()

In [None]:
# Wrap the sparse document-term matrix without densifying it: .toarray() would
# allocate a full (positive reviews x 700 phrases) array just to sum its columns.
positive_unpacked=pd.DataFrame.sparse.from_spmatrix(postive_packed,columns=vectorizer.get_feature_names_out())

In [None]:
positive_distribution=positive_unpacked.sum().sort_values(ascending=False).reset_index()
positive_distribution.columns=['review','counts']
positive_distribution

# Top 15 Most Common Phrases

In [None]:
# Horizontal bar chart of the 15 most frequent phrases in positive reviews.
ps_top=positive_distribution.head(15)
fig=go.Figure(
    data=[
        go.Bar(
            y=ps_top['review'],
            x=ps_top['counts'],
            orientation='h',
        )
    ]
)
fig.update_layout(
    title='Top Common Phrases in Positive Reviews',
    xaxis_title='Frequency',
    yaxis_title='Phrases',  # axis label previously misspelled as 'Phases'
    width=700,
    height=700,
    yaxis={'categoryorder': 'total ascending'}  # order the bars by their value
)
fig.show()

# All Common Phrases in Positive Reviews

In [None]:
# Horizontal bar chart of every extracted phrase from the positive reviews
# (tall figure so that each bar keeps a readable label).
fig=go.Figure(
    data=[
        go.Bar(
            y=positive_distribution['review'],
            x=positive_distribution['counts'],
            orientation='h',
            marker=dict(color='green')
        )
    ]
)
fig.update_layout(
    title='Top Common Phrases in Positive Reviews',
    xaxis_title='Frequency',
    yaxis_title='Phrases',  # axis label previously misspelled as 'Phases'
    width=1000,
    height=5000,
    yaxis= {'categoryorder': 'total ascending'}  # order the bars by their value
)
fig.show()

# Negative Reviews

# Summarize to Three- and Four-Word Phrases:

In [None]:
negative_reviews=df[df['Sentiment']=="Negative"]['content']
vectorizer_2=CountVectorizer(ngram_range=(3,4),stop_words='english',max_features=200)
neg_packed=vectorizer_2.fit_transform(negative_reviews)
vectorizer_2.get_feature_names_out()

In [None]:
# Count how often each phrase occurs across the negative reviews.
# The matrix is wrapped as a sparse frame; .toarray() would first materialise a
# dense (reviews x phrases) array only for its columns to be summed.
neg_unpack=pd.DataFrame.sparse.from_spmatrix(neg_packed,columns=vectorizer_2.get_feature_names_out())
neg_distribution=neg_unpack.sum().sort_values(ascending=False).reset_index()
neg_distribution.columns=['review','counts']
neg_distribution

In [None]:
# Horizontal bar chart of phrase frequency in negative reviews.
fig=go.Figure(
    data=[
        go.Bar(
            x=neg_distribution['counts'],
            y=neg_distribution['review'],
            orientation='h',
            marker=dict(color='firebrick')
        )
    ]
)
fig.update_layout(
    title='Top Common Phrases in Negative Reviews',
    # Bars are horizontal: counts run along x and phrases along y.
    # The two axis titles were previously swapped.
    xaxis_title='Frequency',
    yaxis_title='Phrase',
    width=1000,
    height=2000,
    yaxis={'categoryorder':'total ascending'}
)
fig.show()

# Summarize to Two- and Three-Word Phrases:

In [None]:
# Repeat the negative-phrase count with 2- to 3-word phrases.
# NOTE: this reuses the names neg_packed / neg_unpack / neg_distribution, so the
# cells below see these results rather than the 3- to 4-word ones.
negative_reviews_two=df[df['Sentiment']=="Negative"]['content']
vectorizer_two=CountVectorizer(ngram_range=(2,3),stop_words='english',max_features=200)
neg_packed=vectorizer_two.fit_transform(negative_reviews_two)
# Keep the matrix sparse; .toarray() would build a dense (reviews x phrases) array.
neg_unpack=pd.DataFrame.sparse.from_spmatrix(neg_packed,columns=vectorizer_two.get_feature_names_out())
neg_distribution=neg_unpack.sum().sort_values(ascending=False).reset_index()
neg_distribution.columns=['review','counts']
neg_distribution

In [None]:
# Horizontal bar chart of phrase frequency in negative reviews.
fig=go.Figure(
    data=[
        go.Bar(
            x=neg_distribution['counts'],
            y=neg_distribution['review'],
            orientation='h',
            marker=dict(color='crimson')
        )
    ]
)
fig.update_layout(
    title='Top Common Phrases in Negative Reviews',
    # Bars are horizontal: counts run along x and phrases along y.
    # The two axis titles were previously swapped.
    xaxis_title='Frequency',
    yaxis_title='Phrase',
    width=1000,
    height=2000,
    yaxis={'categoryorder':'total ascending'}
)
fig.show()

# Common Problems Faced By Users

In [None]:
# Negative phrases hinting at wrong or inaccurate answers (case-insensitive regex).
# Raw string: '\w' inside a normal string literal is an invalid escape sequence.
incorrect_ans=neg_distribution[neg_distribution['review'].str.contains(r'wro\w*|wrong|incorrect|inaccurate|not|false|answer|error|time|ans\w*',case=False,regex=True)]

In [None]:
# Negative phrases hinting at performance or stability problems (case-insensitive regex).
# Raw string avoids the invalid '\w' escape. The stray trailing space after
# 'performance' was dropped: it stopped the keyword matching at the end of a phrase.
app_performence=neg_distribution[neg_distribution['review'].str.contains(r'wor\w*|bug|lag|crash|fix|load|freeze|bad|performance|horr\w*',case=False,regex=True)]

In [None]:
# Negative phrases that mention any interface-related keyword (case-insensitive).
# The keywords are joined with '|' into one regex alternation.
user_inter=neg_distribution[
    neg_distribution['review'].str.contains(
        '|'.join([
            'button','interface','ui','layout','screen','button','design','confusing',
            'use','difficult','look','view','like','scroll','display',
        ]),
        case=False,
        regex=True
    )
]

In [None]:
# Negative phrases hinting at missing or broken features (case-insensitive regex).
# Raw string: '\w' inside a normal string literal is an invalid escape sequence.
features=neg_distribution[neg_distribution['review'].str.contains(r'feature|avail\w*|mis\w*|working|broken|not|stop|pro\w*',case=False,regex=True)]

In [None]:
# Negative phrases hinting at poor response quality (case-insensitive regex).
# Raw string: '\w' inside a normal string literal is an invalid escape sequence.
quality=neg_distribution[neg_distribution['review'].str.contains(r'res\w*|\w*less|less\w*|qual\w*|non\w*|response|ir\w*|info\w*|mis\w*',case=False,regex=True)]

In [None]:
common_problems={
    'Incorrect Answers' : incorrect_ans['counts'].sum(),
    'App Performance':app_performence['counts'].sum(),
    'User Interface':user_inter['counts'].sum(),
    'Features Missing/Not Working':features['counts'].sum(),
    'Quality of Responses':quality['counts'].sum()
}
common_problems

In [None]:
probelm_freq=pd.DataFrame(list(common_problems.items()),columns=['Problem','Frequency'])
probelm_freq

In [None]:
custom_colors=['#7f0000', '#b30000', '#fc8d59','#d7301f', '#fdbb84']
fig=px.bar(
    probelm_freq,
    x='Problem',
    y='Frequency',
    color='Problem',
    color_discrete_sequence=custom_colors
)
fig.update_layout(
    title='Common Problems Faced by Users in ChatGPT',
    xaxis={'categoryorder':'total descending'},
    width=1100,
    height=700
)
fig.show()

# Sentiment Over Months

In [None]:
df['at']=pd.to_datetime(df['at'])

In [None]:
sentiment_over_time=df.groupby([df['at'].dt.to_period('M'),'Sentiment']).size().unstack(fill_value=0)
sentiment_over_time

In [None]:
sentiment_over_time.index=sentiment_over_time.index.to_timestamp()

In [None]:
# Line chart with one trace per sentiment label, added in the order
# Positive, Negative, Neutral.
fig=go.Figure()

for label,line_color in (('Positive','green'),('Negative','red'),('Neutral','gold')):
    fig.add_trace(
        go.Scatter(
            x=sentiment_over_time.index,
            y=sentiment_over_time[label],
            line=dict(color=line_color),
            name=label
        )
    )

fig.update_layout(
    title='Sentiment Trends Over Time',
    xaxis_title='Date',
    yaxis_title='Reviews',
    legend_title_text='Sentiment',
    xaxis=dict(showgrid=True, gridcolor='lightgray'),
    yaxis=dict(showgrid=True, gridcolor='lightgray'),
    width=1100,
    height=650
)
fig.show()

In [None]:
df.head()

In [None]:
app_ver=df['appVersion'].value_counts()
app_ver

In [None]:
# Bar chart of how many reviews were written against each app version.
fig=px.bar(
    app_ver,
    x=app_ver.index,
    y=app_ver.values,
    title='Distribution Of Versions'
)
fig.update_layout(
    width=1100,
    # x carries the version labels and y the review counts.
    # The two axis titles were previously swapped.
    xaxis_title='Version',
    yaxis_title='Reviews'
)
fig.show()

In [None]:
version_sentiment=df.groupby([df['appVersion'],'Sentiment']).size().unstack(fill_value=0)
version_sentiment=version_sentiment.reset_index()
version_sentiment.head()

In [None]:
version_sentiment = df.groupby(['appVersion', 'Sentiment']).size().unstack(fill_value=0).reset_index()
version_sentiment

In [None]:
from packaging import version

# Order the rows newest version first, using the parsed version only as a sort key.
# The column keeps its original strings: converting it with version.parse() and
# back with str() rewrites the labels (for example '1.2025.084' becomes '1.2025.84').
# Sorting with a key also keeps this cell safe to re-run.
version_sentiment = version_sentiment.sort_values(
    by='appVersion',
    key=lambda versions: versions.map(version.parse),
    ascending=False
)
version_sentiment

In [None]:
version_melted = version_sentiment.melt(
    id_vars='appVersion',
    value_vars=['Positive', 'Negative', 'Neutral'],
    var_name='Sentiment',
    value_name='Count'
)

version_melted.head()

In [None]:
# Sort by parsed version rather than by raw string: a plain string sort is
# lexicographic and does not follow numeric version order.
# mergesort is stable, so rows of one version keep their existing relative order.
version_melted = version_melted.sort_values(
    'appVersion',
    key=lambda versions: versions.map(version.parse),
    ascending=False,
    kind='mergesort'
)

In [None]:
color_map = {
    'Positive': 'darkgreen',
    'Negative': 'crimson',
    'Neutral': 'gold'
}
fig=px.bar(
    version_melted,
    x='appVersion',
    y='Count',
    color='Sentiment',
    title='Sentiment Distribution Across App Versions',
    color_discrete_map=color_map,
    height=700,
    width=1000
)
fig.update_layout(barmode='stack')
fig.show()

# NPS Calculation

In [None]:
# NPS grouping by star rating: 5 -> Promoter, 4 -> Passive, every other value -> Detractor.
# Ratings missing from the lookup come back as NaN and are filled in as Detractor.
df['nps category']=df['score'].map({5:"Promoter",4:"Passive"}).fillna("Detractor")

In [None]:
nps=df['nps category'].value_counts(normalize=True)*100
nps=nps.reset_index()
nps.columns=['category','percentage']
nps['percentage']=nps['percentage'].round(2)
nps

In [None]:
# Green / amber / red, intended for Promoter / Passive / Detractor respectively.
colors=['#4CAF50', '#FFC107', '#F44336']
# Bind each colour to its category by name. A positional colour sequence follows
# the row order of `nps`, which comes from value_counts() (most frequent first),
# so Passive and Detractor could end up with each other's colour.
nps_colors=dict(zip(['Promoter','Passive','Detractor'],colors))
fig=px.pie(
    nps,
    names='category',
    values='percentage',
    color='category',
    color_discrete_map=nps_colors,
    title='NPS Category Distribution (%)'
)
fig.update_layout(
    width=500,
    height=500
)
fig.show()

In [None]:
nps.loc[nps['category']=="Promoter",'percentage'].values[0]-nps.loc[nps['category']=='Detractor','percentage'].values[0]

The Net Promoter Score (NPS) for ChatGPT, based on the ratings provided in the dataset, is approximately 65.42

In [None]:
#pd.set_option('display.max_colwidth',None)
top_reviews=df.sort_values('thumbsUpCount' , ascending=False)[['userName','content','thumbsUpCount']]
top_reviews

In [None]:
def categorized_review(x):
    """Bucket a thumbs-up count into a helpfulness label.

    Upper bounds are inclusive: up to 10 is "Low", up to 50 "Medium", up to 200
    "High", up to 500 "Very High"; anything that fits none of them is "Viral".
    """
    buckets=((10,"Low"),(50,"Medium"),(200,"High"),(500,"Very High"))
    for upper_bound,label in buckets:
        if x<=upper_bound:
            return label
    return "Viral"

In [None]:
top_reviews['helpfull']=top_reviews['thumbsUpCount'].apply(categorized_review)
top_reviews

In [None]:
help_category=top_reviews['helpfull'].value_counts()
help_category

In [None]:
without_low=help_category.drop('Low')
# Create subplot layout
fig=make_subplots(rows=1,cols=2,subplot_titles=("All Levels", "Excluding 'Low'"))

# Add both charts to subplots
fig.add_trace(
    go.Bar(
        x=help_category.index,
        y=help_category.values,
        name='All',
        marker_color='teal'
    ),
    row=1,col=1
)

fig.add_trace(
    go.Bar (
        x=without_low.index,
        y=without_low.values,
        name="Filtered  Low ",
        marker_color='orange',
        width=0.4
    ),
    row=1,col=2
)

fig.update_layout(
    height=500,
    width=1100,
    title_text="Helpfulness Distribution - Full vs Filtered",
    
    xaxis_title="Helpfulness Level (All)",
    yaxis_title="Review Count",

    xaxis2_title="Helpfulness Level (Filtered)",
    yaxis2_title="Review Count"
)
fig.show()

In [None]:
# Most-liked reviews: take the first 20 rows of top_reviews, order them by
# thumbs-up count (highest first) and keep one row per user name — the first,
# i.e. that user's most-liked review. The name `top5` is kept because later
# cells read it; the frame can hold up to 20 rows.
top5=(
    top_reviews
    .head(20)
    .sort_values(by='thumbsUpCount',ascending=False)
    .drop_duplicates(subset=['userName'],keep='first')  # returns a new frame instead of mutating in place
)
# Bar per user, labelled with the thumbs-up count; the review text appears on hover.
fig=px.bar(
    top5,
    x='userName',
    y='thumbsUpCount',
    hover_data={'content': True},
    text='thumbsUpCount',
    title='Top Most Liked Reviews',
    labels={'thumbsUpCount': 'Thumbs Up', 'userName': 'User'}
)
fig.show()

In [None]:
pd.set_option('display.max_colwidth',None)
top5[['content','thumbsUpCount']].head()

# Trending Ghibli Image Reviews

In [None]:
# Reviews that mention "Ghibli" under several spellings, ignoring case:
#   ghibli / gibli / ghibly / gibly, plus any word fragment starting with "gibh".
# The previous pattern 'gibli|gibh\*w' was case-sensitive, never matched the correct
# spelling "Ghibli", and its second branch matched only the literal text "gibh*w".
# Everything the old pattern matched is still matched by this one.
gibli=df[df['content'].str.contains(r'gh?ibl[iy]|gibh\w*',case=False,regex=True,na=False)]
gibli.head()

Positive Reviews

In [None]:
gibli_positive=gibli[gibli['Sentiment']=='Positive'][['userName','content']]
gibli_positive

Negative Reviews

In [None]:
gibli_negative=gibli[gibli['Sentiment']=='Negative'][['userName','content']]
gibli_negative

In [None]:
!jupyter nbconvert --to html chatGPT_review_2025.ipynb

In [None]:
!jupyter nbconvert chatGPT_review_2025.ipynb --to html --embed-images


In [None]:
top5.head()