In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn import neighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ConfusionMatrixDisplay


from imblearn.over_sampling import SMOTE 
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning

In [9]:
df = pd.read_csv('dataco_data_filtered.csv')
#1. remove data after 2017
df['Order date (DateOrders)'] = pd.to_datetime(df['Order date (DateOrders)'])
df = df.loc[df['Order date (DateOrders)']< pd.to_datetime('2015-03-01')]

#2.add weekday dummy
df['weekday']=np.where((df['Order date (DateOrders)'].dt.dayofweek) <5, 0,1)

#3.agreggate date 
number = df.groupby("Order date (DateOrders)")['Order Item Quantity'].count().reset_index()
number = number.rename(columns={"Order Item Quantity": "Total Volume"})
df = pd.merge(df,number,left_on='Order date (DateOrders)',right_on='Order date (DateOrders)')

#sample 10,000 from our data
# df = df.sample(n=10000)

df = df.drop(['Order date (DateOrders)'],axis =1)
def reduce_class(col,df,n):
    count_df = df.groupby(col).size().reset_index().rename(columns={0 : 'count'})
    sort_df = count_df.sort_values('count',ascending = False)
    top_n = list(sort_df[col][:n])
    reduced_col = df[col].apply(lambda x: x if x in top_n else 'Other')
    return reduced_col

df['Category Name'] = reduce_class('Category Name',df,10)
df['Store City'] = reduce_class('Store City',df,10)
df['Customer State'] = reduce_class('Customer State',df,10)
# cat_count = df.groupby("Category Name")[['Category Name']].count().rename(columns={'Category Name' : 'count'}).reset_index()
# sorted_cats = cat_count.sort_values("count", ascending = False)
# top10_cats = sorted_cats['Category Name'][:10].to_list()
# df['Category Name'] = df['Category Name'].apply(lambda x: x if x in top10_cats else 'Other')

### classification data + classification models

In [10]:
dummy_columns = ['Order State','Order City','Customer Id','Product Name','Category Name', 'Customer Segment','Customer State', 'Store Country', 'Department Name (Store)','Store City', 'Market', 'Order Region','Order Country', 'Shipping Mode']#'Order City','Order State'

le = preprocessing.LabelEncoder()
encoder_df = df[dummy_columns].apply(le.fit_transform)

data_class = df
data_class[dummy_columns]=encoder_df
#data_class = data_class.drop(['Product Name','Order State','Store City','Order City','Customer Id','Order date (DateOrders)'],axis =1)

#get class for profit margin
#profit margin categorial name: 0-loss,1-low,2-medium,3-high
data_class['profit_cat']='loss'
pos_profit = data_class.loc[data_class['Order Item Profit Ratio']>=0]['Order Item Profit Ratio']
pos_profit_class = pd.qcut(pos_profit, 3, labels=["low", "medium", "high"])
# pos_profit_class = pd.qcut(pos_profit, 2, labels=["low", "high"])
data_class.loc[data_class['Order Item Profit Ratio']>=0,'profit_cat']=pos_profit_class

#binary 
data_class['loss']=1
data_class.loc[data_class['Order Item Profit Ratio']>=0,'loss']=0

data_class['high_profit'] = 0
cut_quantile = 0.9
q_cut = float(data_class['Order Item Profit Ratio'].quantile([cut_quantile]))
data_class.loc[df['Order Item Profit Ratio']>=q_cut,'high_profit']=1

data_class['class']=1
data_class.loc[df['Order Item Profit Ratio']>=q_cut,'class']=2
data_class.loc[(df['Order Item Profit Ratio']<q_cut)&(df['Order Item Profit Ratio']>=0),'class']=0



# High profit (90 quantile) Binary

In [27]:
X = data_class.drop(['Order Item Profit Ratio','profit_cat','loss','high_profit','class'],axis =1)
y = data_class['high_profit']
X = StandardScaler().fit_transform(X)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [None]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print('RF, accuracy:\n ',accuracy_score(y_test, y_pred))
print('RF, c_m: \n',confusion_matrix(y_test,y_pred))

# High profit (90 quantile) 3 classes

In [300]:
X = data_class.drop(['Order Item Profit Ratio','profit_cat','loss','high_profit','class'],axis =1)
X_cols = list(X.columns)
scaler = StandardScaler()
scaler.fit(X)
X = StandardScaler().fit_transform(X)
y = data_class['class']
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.50)
# X_train = StandardScaler().fit_transform(X_train)
# X_test = StandardScaler().fit_transform(X_test)


clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print('RF, accuracy:\n ',accuracy_score(y_test, y_pred))
print('RF, c_m: \n',confusion_matrix(y_test,y_pred))

RF, accuracy:
  0.782422293676313
RF, c_m: 
 [[3037  224  123]
 [ 782 2290  415]
 [ 509  180 2703]]


# Discount Rate Optimization and Calculate Expected Profit

In [None]:
X = data_class.drop(['class','Order Item Profit Ratio','profit_cat','loss','high_profit'], axis=1)#.to_numpy()
X1 = X.to_numpy()
X = pd.DataFrame(StandardScaler().fit_transform(X),index =X.index, columns = X.columns)
discounts = X['Order Item Discount Rate'].unique()

best_discount_list = list()
optimal_class_list = list()
original_class_list = list()
intitial_discount_list =list()

#number of samples
sample_size = 10000
count = 0
# for i in range(len(X_test)):
for i in range(sample_size):
    count+=1
    print(count)
    sample = X.iloc[i]
    initial_discount = sample['Order Item Discount Rate']
    intitial_discount_list.append(initial_discount)
    sample = sample.values.reshape(1, -1)
#     sample = StandardScaler().fit_transform(sample)
    y_pred = clf.predict(sample)
    initial_class = int(y_pred)
    original_class_list.append(initial_class)
    y_pred_dict= dict()
    if initial_class == 2:
        best_discount_list.append(initial_discount)
        optimal_class_list.append(initial_class)
        continue
    else:
        for discount in discounts:
            sample = X.iloc[i]
            sample['Order Item Discount Rate'] = discount
            sample = sample.values.reshape(1, -1)
#             sample = StandardScaler().fit_transform(sample)
            y_pred = clf.predict(sample)
            y_pred_dict[discount]=int(y_pred) 
        best_discount = max(y_pred_dict, key=y_pred_dict.get)
        pred_class = max(y_pred_dict.values())
        if pred_class > initial_class:
            best_discount_list.append(best_discount)
            print('change',best_discount)
            optimal_class_list.append(pred_class)
        else:
            best_discount_list.append(initial_discount)
            optimal_class_list.append(initial_class)



In [None]:
optimal_df = data_class.iloc[0:sample_size]
optimal_df ['Optimal Discount'] = best_discount_list

#     optimal_df = scaler.inverse_transform(optimal_df)
#     optimal_df =  pd.DataFrame(optimal_df,columns = X_cols)


optimal_df ['Optimal Class'] = optimal_class_list
optimal_df ['Initial Class'] = original_class_list
optimal_df ['Initial Discount'] = intitial_discount_list
optimal_df[['Order Item Profit Ratio','Initial Discount','Optimal Discount','Initial Class','Optimal Class','Order Item Total']]


In [None]:
change = optimal_df.loc[optimal_df['Initial Class'] != optimal_df['Optimal Class']]
change = change[['Order Item Profit Ratio','Initial Discount','Optimal Discount','Initial Class','Optimal Class','Order Item Total']]
change

In [342]:
original_profit = sum(change['Order Item Profit Ratio']*change['Order Item Total'])

In [None]:
original_profit = sum(change['Order Item Profit Ratio']*change['Order Item Total'])

In [348]:
class_2 = len(change['Optimal Class'] ==2)
sales_2 = change.loc[change['Optimal Class'] ==2]['Order Item Total'].sum()
class_1 = len(change['Optimal Class'] ==1)
sales_1 = change.loc[change['Optimal Class'] ==1]['Order Item Total'].sum()

In [353]:
sales_2

43291.15052831

In [359]:
expected_profit = (0.876 * 0.482 + 0.035 * 0.257 + 0.088 * -0.652) * sales_2 + (0.109 * 0.482 + 0.723 * 0.257 +0.168 * -0.652)*sales_1

In [362]:
expected_profit

26634.053342411935

In [361]:
original_profit

13441.332760141302

In [360]:
expected_profit - original_profit

13192.720582270633

# Feature Selection 

In [None]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
rfe = RFECV(estimator=RandomForestClassifier())
model = RandomForestClassifier()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
n_scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
#forward selection
from sklearn.feature_selection import SelectFromModel

X = data_class.drop(['Order Item Profit Ratio','profit_cat','loss','high_profit'],axis =1)
y = data_class['high_profit']
X = StandardScaler().fit_transform(X)
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.50)
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
sel.fit(X_train, y_train)

X_important_train = sel.transform(X_train)
X_important_test = sel.transform(X_test)

clf_important = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)
y_pred = clf_important.predict(X_important_test)
print('RF, accuracy:\n ',accuracy_score(y_test, y_pred))
print('RF, c_m: \n',confusion_matrix(y_test,y_pred))