In [1]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import plotly.express as px

In [2]:
# Load the raw export into a DataFrame.
df_reviews = pd.read_csv(filepath_or_buffer='olist_reviews.csv')

# Print the column list with non-null counts and dtypes.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7271 entries, 0 to 7270
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   merchant_id                             7271 non-null   object 
 1   lead_id                                 7271 non-null   object 
 2   sdr_id                                  7271 non-null   object 
 3   sr_id                                   7271 non-null   object 
 4   business_segment                        7270 non-null   object 
 5   lead_behavior                           4785 non-null   object 
 6   has_company                             100 non-null    object 
 7   has_gtin                                101 non-null    object 
 8   average_stock                           103 non-null    object 
 9   business_type                           7259 non-null   object 
 10  declared_product_catalog_size           69 non-null     floa

In [4]:
# Keep only the columns needed for the review-text analysis.
reviews = df_reviews[['review_score','order_id','review_comment_title','review_comment_message']]

# Non-null counts before empty strings are normalised to NaN.
reviews.info()

# Replace empty strings with NaN (null values).
# The `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0,
# so use the numpy alias (`np`) imported in the first cell instead.
reviews = reviews.replace('', np.nan)

reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7271 entries, 0 to 7270
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   review_score            6780 non-null   float64
 1   order_id                6809 non-null   object 
 2   review_comment_title    2316 non-null   object 
 3   review_comment_message  2805 non-null   object 
dtypes: float64(1), object(3)
memory usage: 227.3+ KB


  reviews = reviews.replace('', pd.np.nan)


Unnamed: 0,review_score,order_id,review_comment_title,review_comment_message
0,,,,
1,5.0,6826d5740591909e368433e2a5ec75fb,,"Otima, chegou até antes do prazo, e conforme o..."
2,5.0,d3582fd5ccccd9cb229a63dfb417c86f,,
3,5.0,ed76528f7ed5ae1f2a0dd070a7426d44,,
4,,,,


In [5]:
# Rows without a review_score are discarded; every other column may still be null.
reviews = reviews.dropna(axis='index', how='any', subset=['review_score'])

reviews.head()


Unnamed: 0,review_score,order_id,review_comment_title,review_comment_message
1,5.0,6826d5740591909e368433e2a5ec75fb,,"Otima, chegou até antes do prazo, e conforme o..."
2,5.0,d3582fd5ccccd9cb229a63dfb417c86f,,
3,5.0,ed76528f7ed5ae1f2a0dd070a7426d44,,
5,4.0,0c89556cf6859f7e4b104f2883aa1b97,Recomendo,Deveria ser mais barato!
6,4.0,0c89556cf6859f7e4b104f2883aa1b97,Recomendo,Deveria ser mais barato!


In [6]:
# Flag column: 1 when the review has a title or a message (or both),
# 0 when both text fields are null.
reviews['text_general'] = (
    reviews[['review_comment_title', 'review_comment_message']]
    .notna()          # True where a text field is present
    .any(axis=1)      # at least one of the two fields is present
    .astype('int64')  # same integer dtype as assigning the literals 0 / 1
)

reviews

Unnamed: 0,review_score,order_id,review_comment_title,review_comment_message,text_general
1,5.0,6826d5740591909e368433e2a5ec75fb,,"Otima, chegou até antes do prazo, e conforme o...",1
2,5.0,d3582fd5ccccd9cb229a63dfb417c86f,,,0
3,5.0,ed76528f7ed5ae1f2a0dd070a7426d44,,,0
5,4.0,0c89556cf6859f7e4b104f2883aa1b97,Recomendo,Deveria ser mais barato!,1
6,4.0,0c89556cf6859f7e4b104f2883aa1b97,Recomendo,Deveria ser mais barato!,1
...,...,...,...,...,...
7266,5.0,e88f34a80788f768c175e94c622d7559,,,0
7267,5.0,e88f34a80788f768c175e94c622d7559,,,0
7268,4.0,ecb8849b9673c0bc2a510456938b508b,,,0
7269,5.0,f7975233cce5d29114cc23919f1cc8d4,,,0


In [10]:
# Count rows per (review_score, text_general, order_id) combination.
# NOTE: groupby skips rows whose key is null by default, so a review without
# an order_id would not be counted here.
grouped = reviews.groupby(['review_score', 'text_general', 'order_id']).size().reset_index(name='count')

# Collapse the order_id level: total number of reviews per (review_score, text_general).
# The 'review_score' mean is kept so the output columns stay the same; because
# review_score is itself a grouping key, that mean always equals the key.
stats = grouped.groupby(['review_score', 'text_general']).agg({'count': 'sum', 'review_score': 'mean'})


stats

Unnamed: 0_level_0,Unnamed: 1_level_0,count,review_score
review_score,text_general,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0,86,1.0
1.0,1,756,1.0
2.0,0,45,2.0
2.0,1,266,2.0
3.0,0,216,3.0
3.0,1,230,3.0
4.0,0,770,4.0
4.0,1,363,4.0
5.0,0,2520,5.0
5.0,1,1528,5.0


In [19]:
# Summary statistics for reviews that have neither a title nor a message.
filtered_reviews = reviews[reviews['text_general'] == 0]

filtered_reviews.describe()

Unnamed: 0,review_score,text_general
count,3637.0,3637.0
mean,4.537806,0.0
std,0.851784,0.0
min,1.0,0.0
25%,4.0,0.0
50%,5.0,0.0
75%,5.0,0.0
max,5.0,0.0


In [20]:
# Summary statistics for reviews that have a title and/or a message.
filtered_reviews = reviews[reviews['text_general'] == 1]

filtered_reviews.describe()

Unnamed: 0,review_score,text_general
count,3143.0,3143.0
mean,3.522113,1.0
std,1.683807,0.0
min,1.0,1.0
25%,2.0,1.0
50%,4.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [23]:
# Re-print the column overview of the raw frame.
df_reviews.info()

# Keep only the seller identifier and the score for the per-seller analysis.
seller_cluster = df_reviews[
    ['seller_id', 'review_score']
]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7271 entries, 0 to 7270
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   merchant_id                             7271 non-null   object 
 1   lead_id                                 7271 non-null   object 
 2   sdr_id                                  7271 non-null   object 
 3   sr_id                                   7271 non-null   object 
 4   business_segment                        7270 non-null   object 
 5   lead_behavior                           4785 non-null   object 
 6   has_company                             100 non-null    object 
 7   has_gtin                                101 non-null    object 
 8   average_stock                           103 non-null    object 
 9   business_type                           7259 non-null   object 
 10  declared_product_catalog_size           69 non-null     floa

In [27]:
# Discard rows that cannot be attributed to a seller.
seller_cluster = seller_cluster.dropna(axis='index', how='any', subset=['seller_id'])

# Number of rows per (seller_id, review_score) pair, flattened into a regular
# frame with a 'count' column.
grouped_reviews = (
    seller_cluster
    .groupby(['seller_id', 'review_score'])
    .size()
    .rename('count')
    .reset_index()
)

grouped_reviews

Unnamed: 0,seller_id,review_score,count
0,01266d4c46afa519678d16a8b683d325,5.0,3
1,01fd077212124329bac32490e8ef80d9,1.0,4
2,01fd077212124329bac32490e8ef80d9,2.0,3
3,01fd077212124329bac32490e8ef80d9,3.0,1
4,01fd077212124329bac32490e8ef80d9,4.0,9
...,...,...,...
909,ffad1e7127fb622cb64a900751590acd,5.0,29
910,ffc470761de7d0232558ba5e786e57b7,1.0,4
911,ffc470761de7d0232558ba5e786e57b7,3.0,7
912,ffc470761de7d0232558ba5e786e57b7,4.0,3


In [28]:
# Write the per-seller score counts to disk, without the positional index column.
grouped_reviews.to_csv(path_or_buf='grouped_reviews.csv', index=False)