## Data Mining Project - Olist Sentiment Analysis

### Loading Libraries and Datasets

In [1]:
# %pip install tensorflow
# %pip install -U scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR

In [3]:
data_cust = pd.read_csv("datasets/olist_customers_dataset.csv")
data_order_item = pd.read_csv("datasets/olist_order_items_dataset.csv")
data_order_paym = pd.read_csv("datasets/olist_order_payments_dataset.csv")
data_order_review = pd.read_csv("datasets/olist_order_reviews_dataset.csv")
data_order_main = pd.read_csv("datasets/olist_orders_dataset.csv")
data_product = pd.read_csv("datasets/olist_products_dataset.csv")
data_sellers = pd.read_csv("datasets/olist_sellers_dataset.csv")
data_product_cate_name = pd.read_csv("datasets/product_category_name_translation.csv")

File Merger

In [4]:
compiled_df = data_cust.merge(data_order_main, how='inner', on='customer_id')
compiled_df = compiled_df.merge(data_order_review, how='inner', on='order_id')
compiled_df = compiled_df.merge(data_order_paym, how='inner', on='order_id')
compiled_df = compiled_df.merge(data_order_item, how='inner', on='order_id')
compiled_df = compiled_df.merge(data_product, how='inner', on='product_id')
compiled_df = compiled_df.merge(data_sellers, how='inner', on='seller_id')
compiled_df = compiled_df.merge(data_product_cate_name, how='inner', on='product_category_name')

In [5]:
compiled_df = compiled_df.dropna(subset=['order_delivered_carrier_date', 'order_delivered_customer_date',\
     'product_weight_g', 'product_length_cm', 'product_width_cm', 'product_height_cm']).reset_index()

Feature Selection

In [6]:
cols_to_drop = ['index', 'customer_id', 'customer_unique_id', 'customer_city', 'customer_state', 'order_id', 'review_id', 'review_comment_title', 'review_comment_message',\
     'review_creation_date', 'review_answer_timestamp', 'payment_sequential', 'payment_type', 'payment_installments', 'order_item_id', 'product_id',\
        'seller_id', 'seller_state', 'seller_city', 'product_category_name']
compiled_df.drop(cols_to_drop, axis=1, inplace=True)

In [7]:
compiled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113207 entries, 0 to 113206
Data columns (total 21 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   customer_zip_code_prefix       113207 non-null  int64  
 1   order_status                   113207 non-null  object 
 2   order_purchase_timestamp       113207 non-null  object 
 3   order_approved_at              113193 non-null  object 
 4   order_delivered_carrier_date   113207 non-null  object 
 5   order_delivered_customer_date  113207 non-null  object 
 6   order_estimated_delivery_date  113207 non-null  object 
 7   review_score                   113207 non-null  int64  
 8   payment_value                  113207 non-null  float64
 9   shipping_limit_date            113207 non-null  object 
 10  price                          113207 non-null  float64
 11  freight_value                  113207 non-null  float64
 12  product_name_lenght           

In [8]:
compiled_df['volume'] = compiled_df.apply(lambda x: x['product_length_cm'] * x['product_height_cm'] * x['product_width_cm'], axis = 1)

In [9]:
compiled_df['order_purchase_timestamp'] = pd.to_datetime(compiled_df['order_purchase_timestamp'])
compiled_df['order_approved_at'] = pd.to_datetime(compiled_df['order_approved_at'])
compiled_df['order_delivered_carrier_date'] = pd.to_datetime(compiled_df['order_delivered_carrier_date'])
compiled_df['order_delivered_customer_date'] = pd.to_datetime(compiled_df['order_delivered_customer_date'])
compiled_df['order_estimated_delivery_date'] = pd.to_datetime(compiled_df['order_estimated_delivery_date'])
compiled_df['days_to_deliver'] = compiled_df.apply(lambda x: (x['order_delivered_customer_date'] - x['order_purchase_timestamp']).days, axis = 1)


In [10]:
compiled_df.drop(['product_height_cm', 'product_length_cm', 'product_width_cm', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',\
    'order_delivered_customer_date', 'order_estimated_delivery_date', 'shipping_limit_date', 'order_status', 'product_category_name_english',\
        'customer_zip_code_prefix', 'seller_zip_code_prefix'], axis=1, inplace=True)

In [11]:
compiled_df = compiled_df.sample(10000)
sc_x = StandardScaler()
output_var = compiled_df['review_score']
input_var = compiled_df.drop('review_score', axis=1)
input_var = sc_x.fit_transform(input_var)
X_train, X_test, y_train, y_test = train_test_split(input_var, output_var, test_size=0.3, random_state=42, stratify=output_var)

In [12]:
svr = SVR(kernel='linear', verbose = True)
svr.fit(X_train,y_train)
sv_preds = svr.predict(X_test)
print("Model score on Training Data = ", svr.score(X_train,y_train))
print("Model score on Testing Data = ", svr.score(X_test,y_test))

[LibSVM]Model score on Training Data =  -0.06384995330307275
Model score on Testing Data =  -0.08198300294548422


In [13]:
# Predicting the target values of the test set
# RMSE (Root Mean Square Error)
rmse = float(format(np.sqrt(mean_squared_error(y_test, sv_preds)), '.3f'))
print("\nRMSE: ", rmse)


RMSE:  1.404


In [14]:
sv_preds

array([4.95544194, 4.95990911, 4.72109623, ..., 4.53332364, 4.34777629,
       4.65888425])