## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## Import Data

In [3]:
# path to main project folder
# The original local folder stays the default; set the OLIST_PROJECT_PATH
# environment variable to run the notebook from another machine or location
# without editing this cell.
path = os.environ.get('OLIST_PROJECT_PATH', r'C:\Users\steve\Documents\Olist Marketplace Analysis')

# import order reviews file
# index_col = False keeps pandas from using the first column as the index
reviews_df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_order_reviews_dataset.csv'), index_col = False)

## Analysis

#### 01. Content + Shape

In [8]:
# preview the first five rows of the reviews data
reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [10]:
# number of rows and columns in the reviews data
reviews_df.shape

(99224, 7)

In [12]:
# check how the review creation date is stored (object means it was read as text, not parsed as datetime)
reviews_df['review_creation_date'].dtype

dtype('O')

In [14]:
# column names, non-null counts and dtypes for every column
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99224 entries, 0 to 99223
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                99224 non-null  object
 1   order_id                 99224 non-null  object
 2   review_score             99224 non-null  int64 
 3   review_comment_title     11568 non-null  object
 4   review_comment_message   40977 non-null  object
 5   review_creation_date     99224 non-null  object
 6   review_answer_timestamp  99224 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB


In [16]:
# summary statistics for the numeric columns (review_score is the only one)
reviews_df.describe()

Unnamed: 0,review_score
count,99224.0
mean,4.086421
std,1.347579
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


#### 02. Value Counts

Order id

In [20]:
# get counts of order ids (dropna = False also counts missing ids, if any, as their own entry)
reviews_df['order_id'].value_counts(dropna = False)

order_id
c88b1d1b157a9999ce368f218a407141    3
8e17072ec97ce29f0e1f111e598b0c85    3
df56136b8031ecd28e200bb18e6ddb2e    3
03c939fd7fd3b38f8485a0f95798f1f6    3
5cb890a68b91b6158d69257e4e2bc359    2
                                   ..
5b4e9a12d219f34f5c2de9f8d620b19d    1
a6da096d974acc000962856d7386448a    1
75e0647c26de647eca3421e9cc66c9da    1
bad0467c52f23cdc71e9fa139d4a8afd    1
90531360ecb1eec2a1fbb265a0db0508    1
Name: count, Length: 98673, dtype: int64

**Unique Order IDs** = 98,673 (fewer than the 99,224 rows, so some orders have more than one review row)

In [23]:
# get counts of review ids
reviews_df['review_id'].value_counts(dropna = False)

review_id
7b606b0d57b078384f0b58eac1d41d78    3
dbdf1ea31790c8ecfcc6750525661a9b    3
32415bbf6e341d5d517080a796f79b5c    3
0c76e7a547a531e7bf9f0b99cba071c1    3
4219a80ab469e3fc9901437b73da3f75    3
                                   ..
95e01591b0e69a2fab382b0c562d4e20    1
93611e0327d6a1769d1e68cf3caa242d    1
983c47de74278257f99c4b918fd380f1    1
ca475b77fcc618551ef9d516c3f61b88    1
efe49f1d6f951dd88b51e6ccd4cc548f    1
Name: count, Length: 98410, dtype: int64

**Unique Review IDs** = 98,410 (fewer than the 99,224 rows, so some review IDs appear on more than one row)

Review Score

In [27]:
# get counts of review scores
reviews_df['review_score'].value_counts(dropna = False)

review_score
5    57328
4    19142
1    11424
3     8179
2     3151
Name: count, dtype: int64

**Unique Review Scores** = 5

Comment Title

In [31]:
# get counts of review comment titles (dropna = False keeps the missing titles as a NaN entry)
reviews_df['review_comment_title'].value_counts(dropna = False)

review_comment_title
NaN                          87656
Recomendo                      423
recomendo                      345
Bom                            293
super recomendo                270
                             ...  
Luminaria sobrepor led 18        1
Não recebi.                      1
produto com lacre violado        1
Atraso Entrega                   1
Foto enganosa                    1
Name: count, Length: 4528, dtype: int64

Comment Message

In [34]:
# get counts of review comment messages (dropna = False keeps the missing messages as a NaN entry)
reviews_df['review_comment_message'].value_counts(dropna = False)

review_comment_message
NaN                                                                                                                                                                                                    58247
Muito bom                                                                                                                                                                                                230
Bom                                                                                                                                                                                                      189
muito bom                                                                                                                                                                                                122
bom                                                                                                                                                          

### CONSISTENCY CHECKS

#### 01. Mixed-Type Data

In [38]:
# check if there are any mixed-type columns:
# a column is printed when at least one of its values has a different
# Python type than the value in the column's first row
for col in reviews_df.columns:
    value_types = reviews_df[col].map(type)
    weird = value_types != value_types.iloc[0]
    if weird.any():
        print(col)

review_comment_title
review_comment_message


In [46]:
# cast both free-text comment columns to the pandas "string" dtype
for text_col in ('review_comment_title', 'review_comment_message'):
    reviews_df[text_col] = reviews_df[text_col].astype("string")

**2 Mixed-Type Columns:** Review Comment Title & Review Comment Message converted from mixed type to the pandas `string` dtype

#### 02. Missing Values

In [50]:
# returns number of missing data by column (isnull flags each missing cell, sum counts the flags per column)
reviews_df.isnull().sum()

review_id                      0
order_id                       0
review_score                   0
review_comment_title       87656
review_comment_message     58247
review_creation_date           0
review_answer_timestamp        0
dtype: int64

**Missing values** will be left as they are due to customers choosing only to leave a score and not any title or message.

#### 03. Duplicates

In [33]:
# build a subset containing only the full duplicates
# (rows where every column matches an earlier row; the first occurrence is not flagged)
df_dups = reviews_df.loc[reviews_df.duplicated(keep = 'first')]
df_dups

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp


**No full duplicates**

### Export Data

In [36]:
# save the checked reviews data to the Prepared Data folder
# NOTE(review): index is left at its default, so the row index is also written
# as an unnamed first column — confirm downstream notebooks expect that column
reviews_df.to_csv(os.path.join(path, '02 Data', 'Prepared Data','order_reviews_checked.csv'))