In [1]:
import pandas as pd
import numpy as np
from plotnine import *
import re
import matplotlib.pylab as plt
import matplotlib.dates as mdates
import math

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import random

In [3]:
# Load the Superstore sample and keep only orders sold without a discount.
df = pd.read_csv("../data/Sample - Superstore.csv", encoding='windows-1254')
# .copy() gives an independent frame, so adding a column below does not
# write into a filtered selection of the raw data (SettingWithCopyWarning).
df = df[df["Discount"] == 0].copy()
# Coarse location feature: the postal code with its last two digits dropped.
# NOTE(review): if "Postal Code" is parsed as a number, codes with a leading
# zero yield a shorter prefix -- confirm this is acceptable.
df["zip_sub"] = (df["Postal Code"] // 100).astype(int)


In [4]:
# How many rows fall into each postal-code prefix (most frequent first).
zip_sub_counts = df.loc[:, "zip_sub"].value_counts()
zip_sub_counts

100    650
900    466
941    336
981    293
482    109
      ... 
665      1
297      1
488      1
944      1
507      1
Name: zip_sub, Length: 248, dtype: int64

In [5]:
# Integer codes used to turn the categorical columns into numeric model inputs.
SHIP_MODE_CODES = {"Standard Class": 1,
                   "Second Class": 2,
                   "First Class": 3,
                   "Same Day": 4}
SEGMENT_CODES = {"Corporate": 1,
                 "Consumer": 2,
                 "Home Office": 3}
SUB_CATEGORY_CODES = {"Paper": 1,
                      "Furnishings": 2,
                      "Storage": 3,
                      "Art": 4,
                      "Accessories": 5,
                      "Binders": 6,
                      "Phones": 7,
                      "Appliances": 8,
                      "Labels": 9,
                      "Envelopes": 10,
                      "Chairs": 11,
                      "Fasteners": 12,
                      "Supplies": 13,
                      "Tables": 14,
                      "Bookcases": 15,
                      "Machines": 16,
                      "Copiers": 17}

# Features (X) and target (y1) are taken from the same rows of df.
X = df[["zip_sub", "Quantity", "Sales", "Sub-Category", "Ship Mode"]]
y1 = df[["Segment"]]
print(y1)

# reset_index() renumbers the rows 0..n-1 and keeps df's original row label
# in a new "index" column.
y1 = y1.reset_index()
X = X.reset_index()

X["Ship Mode"] = X["Ship Mode"].map(SHIP_MODE_CODES)
y1["Segment"] = y1["Segment"].map(SEGMENT_CODES)
X["Sub-Category"] = X["Sub-Category"].map(SUB_CATEGORY_CODES)

# Series.map() turns every label that is missing from the dict into NaN
# without complaining; fail here instead of during model fitting.
assert X["Ship Mode"].notna().all(), "unmapped Ship Mode label"
assert X["Sub-Category"].notna().all(), "unmapped Sub-Category label"
assert y1["Segment"].notna().all(), "unmapped Segment label"

print(X)

# Keep the original row label on X under a name that marks it as a non-feature.
X = X.rename(columns={"index": "metadata_rowid"})
y1 = y1.rename(columns={"index": "metadata_rowid"})

assert X.shape[0] == y1.shape[0]
# The target frame only needs the encoded Segment column.
y1 = y1.drop(columns="metadata_rowid")


### using built-in function


        Segment
0      Consumer
1      Consumer
2     Corporate
5      Consumer
6      Consumer
...         ...
9987  Corporate
9988  Corporate
9990   Consumer
9992   Consumer
9993   Consumer

[4798 rows x 1 columns]
      index  zip_sub  Quantity   Sales  Sub-Category  Ship Mode
0         0      424         2  261.96            15          2
1         1      424         3  731.94            11          2
2         2      900         2   14.62             9          2
3         5      900         7   48.86             2          1
4         6      900         4    7.28             4          1
...     ...      ...       ...     ...           ...        ...
4793   9987      306         1   79.99             5          1
4794   9988      306         5  206.10             7          1
4795   9990      926         2   91.96             2          1
4796   9992      926         4   29.60             1          1
4797   9993      926         2  243.16             8          2

[4798 rows x 6

In [14]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y1_train, y1_test = train_test_split(X, y1,
                                                   test_size = 0.25,
                                                   random_state = 221)
# Both frames are split with the same row selection, so their indexes must
# already agree; check it rather than re-filtering X_train.
assert X_train.index.equals(y1_train.index)

# Columns carried along for bookkeeping that must not be used as predictors.
non_feat = ['metadata_rowid']
feature_cols = [col for col in X_train.columns if col not in non_feat]

dt = DecisionTreeClassifier(random_state=0, max_depth = 10)
dt.fit(X_train[feature_cols], y1_train)

y1_pred = dt.predict(X_test[feature_cols])
y1_predprob = dt.predict_proba(X_test[feature_cols])

# One row per predictor with the importance reported by the fitted tree.
feat_imp = pd.DataFrame({'feature_imp': dt.feature_importances_,
                         'feature_name': feature_cols})

# Predicted vs. true segment codes, indexed like the test rows.
y1_pred_df = pd.DataFrame({'y_pred': y1_pred,
                           'y_true': y1_test["Segment"]})

In [15]:
# Class balance of the prediction target. A cell only displays its last
# expression, so this is the single value shown.
df["Segment"].value_counts()

Consumer       2461
Corporate      1427
Home Office     910
Name: Segment, dtype: int64

In [16]:
# Sort the importances once and reuse the result for printing and for top_feat.
feat_imp_sorted = feat_imp.sort_values(by = 'feature_imp', ascending = False)
print(feat_imp_sorted)
print(y1_pred_df)

# Number of test rows whose predicted segment equals the true segment.
n_correct = int((y1_pred_df["y_pred"] == y1_pred_df["y_true"]).sum())
print(n_correct)
print("^^\n")

# Up to ten most important features, most important first.
top_feat = feat_imp_sorted.head(10)
top_feat_list = top_feat.feature_name.to_list()

   feature_imp  feature_name
0     0.329691       zip_sub
2     0.270707         Sales
3     0.154600  Sub-Category
4     0.123355     Ship Mode
1     0.121647      Quantity
      y_pred  y_true
1352       3       3
1563       2       2
2515       2       3
1726       2       2
1047       2       2
...      ...     ...
2173       1       2
2060       2       3
3679       1       2
4232       2       1
2863       2       2

[1200 rows x 2 columns]
618
^^



In [18]:
# Translate the integer segment codes in both columns back to their names.
segment_names = {
    1: "Corporate",
    2: "Consumer",
    3: "Home Office",
}
y_pred_df_str = y1_pred_df.replace(segment_names)
y_pred_df_str

Unnamed: 0,y_pred,y_true
1352,Home Office,Home Office
1563,Consumer,Consumer
2515,Consumer,Home Office
1726,Consumer,Consumer
1047,Consumer,Consumer
...,...,...
2173,Corporate,Consumer
2060,Consumer,Home Office
3679,Corporate,Consumer
4232,Consumer,Corporate


In [19]:
pred_seg = y_pred_df_str["y_pred"]
true_seg = y_pred_df_str["y_true"]

# (predicted segment, true segment, label) for each kind of misclassification.
confusions = [
    ("Consumer", "Corporate", "Cons/Corp"),
    ("Consumer", "Home Office", "Cons/HO"),
    ("Home Office", "Corporate", "HO/Corp"),
    ("Home Office", "Consumer", "HO/Cons"),
    ("Corporate", "Consumer", "Corp/Cons"),
    ("Corporate", "Home Office", "Corp/HO"),
]

# Boolean mask per misclassification kind, followed by the "prediction was
# right" mask; choices holds the label for the mask at the same position.
conds = [(pred_seg == p) & (true_seg == t) for p, t, _ in confusions]
conds.append(pred_seg == true_seg)

choices = [label for _, _, label in confusions]
choices.append("Correct")

In [20]:
# Label every test row with the first matching condition. np.select's default
# is the integer 0, which does not mix with string choices: rows matching no
# condition would get a stray "0" label, and newer NumPy releases may refuse
# the int/str combination outright. Use an explicit string fallback instead.
y_pred_df_str["pred_true"] = np.select(condlist=conds, choicelist=choices,
                                       default="Unmatched")

In [22]:
# Frequency of each prediction outcome label (correct and each confusion kind).
y_pred_df_str["pred_true"].value_counts()

Correct      618
Cons/Corp    258
Cons/HO      192
Corp/Cons     71
Corp/HO       25
HO/Cons       21
HO/Corp       15
Name: pred_true, dtype: int64