In [1]:
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
import matplotlib.pyplot as plt
import pandas as pd
from itertools import accumulate
import seaborn as sns
from prettytable import PrettyTable
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import statsmodels.api as sm
from statsmodels.formula.api import ols
import re
import nltk
from nltk.corpus import stopwords
import spacy
import matplotlib.pylab as plt
import matplotlib.patches as patches
from sklearn.metrics import roc_curve,auc
from scipy import interp
import pickle
from sklearn.metrics import f1_score

  import pandas.util.testing as tm


In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the files
# stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Colab Notebooks/Colab Notebooks/AppliedAI/Assign23_Self_Case_Study1/Data/

/content/drive/MyDrive/Colab Notebooks/Colab Notebooks/AppliedAI/Assign23_Self_Case_Study1/Data


In [4]:
# Use pickle to load in the pre-trained model.
# The context manager closes the file handle as soon as the model is read.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load 'logreg.pkl' when it comes from a trusted source.
with open('logreg.pkl', 'rb') as dbfile:
  model = pickle.load(dbfile)

In [5]:
def response_coding(df, feature):
  """Build the response-coding lookup table for one categorical column.

  For every distinct value of ``df[feature]`` the table holds the fraction of
  its rows whose ``class_label`` equals 1 and the fraction whose
  ``class_label`` equals 0.

  Args:
    df: DataFrame containing the column ``feature`` and a ``class_label``
      column. It is not modified.
    feature: Name of the categorical column to encode.

  Returns:
    DataFrame with one row per distinct category, in order of first
    appearance, and the columns ``[feature, "class_1_" + feature,
    "class_0_" + feature]``.

  Notes:
    Rows whose category is missing (NaN/None) are left out of the table;
    counting them per category would otherwise end in a 0/0 division.
  """
  pos_col = "class_1_" + feature
  neg_col = "class_0_" + feature

  labels = df['class_label']
  # One 0/1 indicator column per class: summing them inside each category
  # gives both per-class row counts in a single pass over the data instead of
  # rescanning the whole frame for every category.
  flags = pd.DataFrame({
      feature: df[feature].to_numpy(),
      pos_col: (labels == 1).to_numpy().astype('int64'),
      neg_col: (labels == 0).to_numpy().astype('int64'),
  })

  # sort=False keeps the categories in order of first appearance, the same
  # order Series.unique() reports them in.
  grouped = flags.groupby(feature, sort=False)
  class_counts = grouped[[pos_col, neg_col]].sum()
  totals = grouped.size()

  # Per-class count divided by the category's total row count.
  df_rc = class_counts.div(totals, axis=0).reset_index()
  return df_rc

def preprocess(df):
  """Turn the raw merged order table into the model's feature matrix.

  Steps, in order:
    1. Parse the five timestamp columns.
    2. Derive three day-difference features relative to the customer delivery
       date.
    3. Fill missing product attributes (mode for all but the description
       length, which uses the median).
    4. Response-code the seller/customer city, state and zip-code-prefix
       columns and inner-join the resulting tables back onto the rows.
    5. Select the feature columns in a fixed order.

  Args:
    df: DataFrame holding the raw columns referenced below, including
      ``class_label``. It is not modified; all work happens on a copy.

  Returns:
    Tuple ``(final_df, class_label_df)``: the selected feature columns and the
    ``class_label`` Series, both taken from the merged frame so their rows
    line up.

  NOTE(review): the response codes are computed from this frame's own
  ``class_label`` values, so the features depend on the labels of the rows
  being predicted - confirm this is intended for inference.
  """
  # Work on a copy so the caller's frame is left untouched and the column
  # assignments below are never chained assignments on a temporary object.
  df = df.copy()

  timestamp_format = '%Y-%m-%d %H:%M:%S'
  for col in ('order_purchase_timestamp',
              'order_delivered_carrier_date',
              'order_delivered_customer_date',
              'order_estimated_delivery_date',
              'shipping_limit_date'):
    df[col] = pd.to_datetime(df[col], format=timestamp_format)

  # Dividing a timedelta by one day expresses the difference in fractional days.
  one_day = np.timedelta64(1, 'D')
  delivered = df['order_delivered_customer_date']
  # Column names (including their spelling) are kept exactly as before.
  df['days_diff_deivery'] = (delivered - df['order_purchase_timestamp']) / one_day
  df['days_diff_est_del'] = (df['order_estimated_delivery_date'] - delivered) / one_day
  df['days_diff_shp_del'] = (df['shipping_limit_date'] - delivered) / one_day

  # Explicit assignment instead of `df[col].fillna(..., inplace=True)`: the
  # in-place call operates on an intermediate Series and does not reliably
  # write back into the frame.
  for col in ('product_category_name',
              'product_name_lenght',
              'product_photos_qty',
              'product_weight_g',
              'product_length_cm',
              'product_height_cm',
              'product_width_cm'):
    df[col] = df[col].fillna(df[col].mode()[0])
  df['product_description_lenght'] = df['product_description_lenght'].fillna(
      df['product_description_lenght'].median())

  # Every lookup table is computed from `df` (not from the partially merged
  # frame) and joined on in the same order as before.
  res_df = df
  for feature in ('seller_city',
                  'seller_state',
                  'seller_zip_code_prefix',
                  'customer_zip_code_prefix',
                  'customer_city',
                  'customer_state'):
    res_df = pd.merge(res_df, response_coding(df, feature), on=feature)

  # The column order is preserved verbatim, including 'days_diff_shp_del'
  # sitting between the two seller_state columns: the result is later
  # converted to a plain array, so position is what identifies a feature.
  feature_columns = [
      'payment_value', 'price', 'freight_value',
      'product_name_lenght', 'product_description_lenght', 'product_photos_qty',
      'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm',
      'days_diff_deivery', 'days_diff_est_del',
      'class_1_seller_city', 'class_0_seller_city',
      'class_1_seller_state', 'days_diff_shp_del', 'class_0_seller_state',
      'class_1_seller_zip_code_prefix', 'class_0_seller_zip_code_prefix',
      'class_1_customer_zip_code_prefix', 'class_0_customer_zip_code_prefix',
      'class_1_customer_city', 'class_0_customer_city',
      'class_1_customer_state', 'class_0_customer_state',
  ]
  final_df = res_df[feature_columns]
  class_label_df = res_df['class_label']
  return final_df,class_label_df

def scaler(df):
  """Standardize the columns of `df` and return the transformed values.

  A new StandardScaler is created on every call and fitted on `df` itself
  before transforming it.

  NOTE(review): the statistics therefore come from the data passed in, not
  from a previously fitted scaler - confirm this matches how the model was
  trained.
  """
  return StandardScaler().fit_transform(df)

def main_predict(df):
  """Run the whole inference pipeline on a raw DataFrame.

  The frame is passed through `preprocess`, the resulting features through
  `scaler`, and the scaled features through the module-level `model`.

  Returns:
    Tuple of (predictions from `model.predict`, labels returned by
    `preprocess`).
  """
  features, labels = preprocess(df)
  predictions = model.predict(scaler(features))
  return predictions, labels

def main_predict_metric(act,pred):
  """Return the macro-averaged F1 score of `pred` against the actual labels `act`."""
  return f1_score(y_true=act, y_pred=pred, average='macro')



In [8]:
# Evaluate the loaded model on the merged dataset.
df = pd.read_csv("merge_data.csv")
# main_predict returns (predictions, labels); despite its name, pred_df holds
# the output of model.predict rather than a DataFrame built here.
pred_df,label_df = main_predict(df)
# Macro-averaged F1 of the predictions against the labels.
# NOTE(review): the features include response codes derived from this same
# file's class_label column, and the scaler is refit on this data, so the
# score may be optimistic - confirm against a held-out evaluation.
f1_metric = main_predict_metric(label_df,pred_df)
print("F1_Score : ",f1_metric)

F1_Score :  0.8838742724469084
