## Data Mining Project - Olist Sentiment Analysis

### Loading Libraries and Datasets

In [62]:
# %pip install tensorflow
# %pip install -U scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import datetime as dt

In [63]:
data_cust = pd.read_csv("datasets/olist_customers_dataset.csv")
data_order_item = pd.read_csv("datasets/olist_order_items_dataset.csv")
data_order_paym = pd.read_csv("datasets/olist_order_payments_dataset.csv")
data_order_review = pd.read_csv("datasets/olist_order_reviews_dataset.csv")
data_order_main = pd.read_csv("datasets/olist_orders_dataset.csv")
data_product = pd.read_csv("datasets/olist_products_dataset.csv")
data_sellers = pd.read_csv("datasets/olist_sellers_dataset.csv")
data_product_cate_name = pd.read_csv("datasets/product_category_name_translation.csv")

File Merger

In [64]:
compiled_df = data_cust.merge(data_order_main, how='inner', on='customer_id')
compiled_df = compiled_df.merge(data_order_review, how='inner', on='order_id')
compiled_df = compiled_df.merge(data_order_paym, how='inner', on='order_id')
compiled_df = compiled_df.merge(data_order_item, how='inner', on='order_id')
compiled_df = compiled_df.merge(data_product, how='inner', on='product_id')
compiled_df = compiled_df.merge(data_sellers, how='inner', on='seller_id')
compiled_df = compiled_df.merge(data_product_cate_name, how='inner', on='product_category_name')

In [65]:
compiled_df.dropna(subset=['order_delivered_carrier_date', 'order_delivered_customer_date','product_weight_g', 'product_length_cm', 'product_width_cm', 'product_height_cm']).reset_index()
compiled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115609 entries, 0 to 115608
Data columns (total 40 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   customer_id                    115609 non-null  object 
 1   customer_unique_id             115609 non-null  object 
 2   customer_zip_code_prefix       115609 non-null  int64  
 3   customer_city                  115609 non-null  object 
 4   customer_state                 115609 non-null  object 
 5   order_id                       115609 non-null  object 
 6   order_status                   115609 non-null  object 
 7   order_purchase_timestamp       115609 non-null  object 
 8   order_approved_at              115595 non-null  object 
 9   order_delivered_carrier_date   114414 non-null  object 
 10  order_delivered_customer_date  113209 non-null  object 
 11  order_estimated_delivery_date  115609 non-null  object 
 12  review_id                     

Feature Selection

In [66]:
# converting to datetime
compiled_df['order_purchase_timestamp'] = pd.to_datetime(compiled_df.order_purchase_timestamp)
compiled_df['order_approved_at'] = pd.to_datetime(compiled_df.order_approved_at).dt.date
compiled_df['order_estimated_delivery_date'] = pd.to_datetime(compiled_df.order_estimated_delivery_date).dt.date  
compiled_df['order_delivered_customer_date'] = pd.to_datetime(compiled_df.order_delivered_customer_date).dt.date  

# Drop columns
compiled_df1 = compiled_df[['order_status','freight_value', 'price',
                 'order_approved_at', 'order_estimated_delivery_date', 
                 'order_delivered_customer_date', 'customer_state', 
                 'product_category_name_english', 'product_name_lenght', 'product_description_lenght', 
                 'product_photos_qty', 'review_score', 'order_purchase_timestamp']]
compiled_df1


Unnamed: 0,order_status,freight_value,price,order_approved_at,order_estimated_delivery_date,order_delivered_customer_date,customer_state,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,review_score,order_purchase_timestamp
0,delivered,21.88,124.99,2017-05-16,2017-06-05,2017-05-25,SP,office_furniture,41.0,1141.0,1.0,4,2017-05-16 15:05:35
1,delivered,24.90,112.99,2017-11-10,2017-12-19,2017-11-28,PA,office_furniture,41.0,1141.0,1.0,1,2017-11-09 00:50:13
2,delivered,24.90,112.99,2017-11-10,2017-12-19,2017-11-28,PA,office_furniture,41.0,1141.0,1.0,1,2017-11-09 00:50:13
3,delivered,15.62,124.99,2017-05-08,2017-06-12,2017-05-26,RS,office_furniture,41.0,1141.0,1.0,3,2017-05-07 20:11:26
4,delivered,30.59,106.99,2018-02-04,2018-03-22,2018-02-28,RJ,office_furniture,41.0,1141.0,1.0,4,2018-02-03 19:45:40
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115604,delivered,37.70,389.00,2018-03-31,2018-04-18,2018-04-06,SC,la_cuisine,59.0,284.0,2.0,4,2018-03-31 19:17:43
115605,delivered,10.81,24.00,2017-04-11,2017-05-04,2017-04-18,SP,la_cuisine,41.0,194.0,1.0,1,2017-04-09 20:04:37
115606,delivered,21.46,139.00,2017-08-28,2017-09-19,2017-09-05,SP,la_cuisine,59.0,731.0,1.0,5,2017-08-28 09:48:01
115607,delivered,26.18,129.00,2017-07-31,2017-08-28,2017-08-09,MG,la_cuisine,57.0,429.0,1.0,5,2017-07-31 18:10:29


Split of dataset

In [67]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(compiled_df1, compiled_df1['review_score']):
    strat_train_set = compiled_df1.loc[train_index]
    strat_test_set = compiled_df1.loc[test_index]

## Ordinal regression
### Feature Engineering
Aim: Predict rating score based on delivery performance

In [68]:
# %pip install statsmodels
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer to conduct feature engineering for both new/unseen data
class AttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass    
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        df = X.copy()
        
        # Calculate the estimated delivery time and actual delivery time in working days. 
        # If the order_delivered_customer_date is null, it returns 0.
        df['wd_estimated_delivery_time'] = df['order_estimated_delivery_date'] - df['order_approved_at']
        df['wd_actual_delivery_time'] = df['order_delivered_customer_date'] - df['order_approved_at']
        
        # Calculate the time between the actual and estimated delivery date. If negative was delivered early, if positive was delivered late.
        df['wd_delivery_time_delta'] = df.order_delivered_customer_date - df.order_estimated_delivery_date
        
        # Calculate the average product value.
        df['average_product_value'] = df.price / len(df)

        # Calculate the time between the actual and estimated delivery date. If negative was delivered early, if positive was delivered late.
        df['is_late'] = df.order_delivered_customer_date > df.order_estimated_delivery_date
        
        # Calculate the order freight ratio.
        df['purchase_dayofweek'] = df.order_purchase_timestamp.dt.dayofweek
        
        # With that we can remove the timestamps from the dataset
        cols_drop = ['order_purchase_timestamp', 'order_approved_at', 'order_estimated_delivery_date', 'order_delivered_customer_date']
        df.drop(cols_drop, axis=1, inplace=True)
        
        return df

In [69]:

orders_features = strat_train_set.drop(['order_status', 'customer_state', 'product_category_name_english','review_score'], axis=1)
orders_labels = strat_train_set['review_score'].copy()

attr_adder = AttributesAdder()
feat_eng = attr_adder.transform(orders_features)


Ordinal Regression Function ('is_late' vs. 'review_score')

In [71]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Ordered logit model
mod_log = OrderedModel(orders_labels,
                        feat_eng['is_late'],
                        distr='logit')

res_prob = mod_log.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 1.181392
         Iterations: 19
         Function evaluations: 22
         Gradient evaluations: 22


0,1,2,3
Dep. Variable:,review_score,Log-Likelihood:,-109260.0
Model:,OrderedModel,AIC:,218500.0
Method:,Maximum Likelihood,BIC:,218600.0
Date:,"Tue, 22 Nov 2022",,
Time:,18:51:33,,
No. Observations:,92487,,
Df Residuals:,92482,,
Df Model:,5,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
is_late,-2.3247,0.027,-86.629,0.000,-2.377,-2.272
1/2,-2.2019,0.011,-200.675,0.000,-2.223,-2.180
2/3,-1.1465,0.017,-66.039,0.000,-1.181,-1.113
3/4,-0.5409,0.011,-49.172,0.000,-0.562,-0.519
4/5,-0.0793,0.007,-11.273,0.000,-0.093,-0.065
