In [1]:
# import dependencies
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, balanced_accuracy_score
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# read in our cleaned, merged data
ML_df = pd.read_csv('resources/superstore.csv', index_col=[0])
ML_df

Unnamed: 0_level_0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Days to Ship,Returned,Profit Margin Percentage,Returned $ Amount
Order ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,221.98,2,0.0,62.15,40.770,High,2,0.0,27.998018,0.0
IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,3709.40,9,0.1,-288.77,923.630,Critical,2,0.0,-7.784817,0.0
IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,5175.17,9,0.1,919.97,915.490,Medium,1,0.0,17.776614,0.0
ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,2892.51,5,0.1,-96.54,910.160,Medium,2,0.0,-3.337586,0.0
SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,Technology,...,2832.96,8,0.0,311.52,903.040,Critical,1,0.0,10.996272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IN-2015-KE1642066-42174,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,Office Supplies,...,65.10,5,0.0,4.50,1.010,Medium,0,0.0,6.912442,0.0
US-2014-ZD21925140-41765,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,16.72,5,0.2,3.34,1.930,High,4,0.0,19.976077,0.0
CA-2012-ZD21925140-41147,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,8.56,2,0.0,2.48,1.580,High,5,0.0,28.971963,0.0
MX-2013-RB1979518-41322,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,13.44,2,0.0,2.40,1.003,Medium,4,0.0,17.857143,0.0


# Preprocessing Data

In [3]:
# Order & Ship Date to datetime
ML_df['Order Date'] = pd.to_datetime(ML_df['Order Date'])
ML_df['Ship Date'] = pd.to_datetime(ML_df['Ship Date'])

# reset index & drop Order ID
ML_df = ML_df.reset_index()
ML_df = ML_df.drop(columns='Order ID')
ML_df

Unnamed: 0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Days to Ship,Returned,Profit Margin Percentage,Returned $ Amount
0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,221.98,2,0.0,62.15,40.770,High,2,0.0,27.998018,0.0
1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,3709.40,9,0.1,-288.77,923.630,Critical,2,0.0,-7.784817,0.0
2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,5175.17,9,0.1,919.97,915.490,Medium,1,0.0,17.776614,0.0
3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,2892.51,5,0.1,-96.54,910.160,Medium,2,0.0,-3.337586,0.0
4,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,Technology,...,2832.96,8,0.0,311.52,903.040,Critical,1,0.0,10.996272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,Office Supplies,...,65.10,5,0.0,4.50,1.010,Medium,0,0.0,6.912442,0.0
51286,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,16.72,5,0.2,3.34,1.930,High,4,0.0,19.976077,0.0
51287,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,8.56,2,0.0,2.48,1.580,High,5,0.0,28.971963,0.0
51288,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,13.44,2,0.0,2.40,1.003,Medium,4,0.0,17.857143,0.0


### Day of Week

In [4]:
# get day of week for order date
# runs 0 - 6; 0=Monday, 6=Sunday
ML_df['Order Day'] = ML_df['Order Date'].dt.dayofweek
ML_df['Order Day'].head()

0    1
1    2
2    4
3    1
4    2
Name: Order Day, dtype: int64

In [5]:
# move Order Day to separate df, will encode
day_dummies = pd.get_dummies(ML_df['Order Day']).reset_index()
day_dummies

Unnamed: 0,index,0,1,2,3,4,5,6
0,0,0,1,0,0,0,0,0
1,1,0,0,1,0,0,0,0
2,2,0,0,0,0,1,0,0
3,3,0,1,0,0,0,0,0
4,4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
51285,51285,0,0,0,0,1,0,0
51286,51286,0,1,0,0,0,0,0
51287,51287,0,0,0,0,0,0,1
51288,51288,0,0,0,0,0,0,1


In [6]:
# rename 0-6 index to days of week
day_dummies = day_dummies.drop(columns='index')
day_dummies.columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_dummies

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
51285,0,0,0,0,1,0,0
51286,0,1,0,0,0,0,0
51287,0,0,0,0,0,0,1
51288,0,0,0,0,0,0,1


In [7]:
# concat the encoded days of week to end of the ML_df
ML_df = ML_df.reset_index()
ML_df = pd.concat([ML_df, day_dummies], axis=1, join='inner')
ML_df

Unnamed: 0,index,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,...,Profit Margin Percentage,Returned $ Amount,Order Day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,...,27.998018,0.0,1,0,1,0,0,0,0,0
1,1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,...,-7.784817,0.0,2,0,0,1,0,0,0,0
2,2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,...,17.776614,0.0,4,0,0,0,0,1,0,0
3,3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,...,-3.337586,0.0,1,0,1,0,0,0,0,0
4,4,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,...,10.996272,0.0,2,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,51285,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,...,6.912442,0.0,4,0,0,0,0,1,0,0
51286,51286,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,...,19.976077,0.0,1,0,1,0,0,0,0,0
51287,51287,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,...,28.971963,0.0,6,0,0,0,0,0,0,1
51288,51288,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,...,17.857143,0.0,6,0,0,0,0,0,0,1


In [8]:
ML_df.columns

Index(['index', 'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'City',
       'State', 'Country', 'Region', 'Market', 'Category', 'Sub-Category',
       'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost',
       'Order Priority', 'Days to Ship', 'Returned',
       'Profit Margin Percentage', 'Returned $ Amount', 'Order Day', 'Monday',
       'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object')

### Cleaning Up Columns

In [9]:
# pull year & month from the Order Date
ML_df['Order Year'] = ML_df['Order Date'].dt.year
ML_df['Order Month'] = ML_df['Order Date'].dt.month

# drop columns
# Order Day - encoded to separate columns
# Order Date - pulled to year & month
# Ship Date - redundant; order info & days to ship
# City & State & Country - too many unqiue values
# Market - redunant with Region (which is more specific)
ML_df = ML_df.drop(columns=['Order Day', 'Order Date', 'Ship Date', 'City', 'State', 'Country','Market', 'index'])
ML_df.head()

Unnamed: 0,Ship Mode,Segment,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit,Shipping Cost,...,Returned $ Amount,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Order Year,Order Month
0,First Class,Consumer,Central US,Technology,Phones,221.98,2,0.0,62.15,40.77,...,0.0,0,1,0,0,0,0,0,2014,11
1,Second Class,Corporate,Oceania,Furniture,Chairs,3709.4,9,0.1,-288.77,923.63,...,0.0,0,0,1,0,0,0,0,2014,2
2,First Class,Consumer,Oceania,Technology,Phones,5175.17,9,0.1,919.97,915.49,...,0.0,0,0,0,0,1,0,0,2014,10
3,First Class,Home Office,Western Europe,Technology,Phones,2892.51,5,0.1,-96.54,910.16,...,0.0,0,1,0,0,0,0,0,2014,1
4,Same Day,Consumer,Western Africa,Technology,Copiers,2832.96,8,0.0,311.52,903.04,...,0.0,0,0,1,0,0,0,0,2014,11


In [10]:
ML_df.dtypes

Ship Mode                    object
Segment                      object
Region                       object
Category                     object
Sub-Category                 object
Sales                       float64
Quantity                      int64
Discount                    float64
Profit                      float64
Shipping Cost               float64
Order Priority               object
Days to Ship                  int64
Returned                    float64
Profit Margin Percentage    float64
Returned $ Amount           float64
Monday                        uint8
Tuesday                       uint8
Wednesday                     uint8
Thursday                      uint8
Friday                        uint8
Saturday                      uint8
Sunday                        uint8
Order Year                    int64
Order Month                   int64
dtype: object

In [11]:
# encode Ship Mode, Segment, Region, Sub/Category, Order Priority
ML_df = pd.get_dummies(ML_df, columns=['Ship Mode', 'Segment', 'Region', 'Category', 'Sub-Category', 'Order Priority'])
ML_df.head()

Unnamed: 0,Sales,Quantity,Discount,Profit,Shipping Cost,Days to Ship,Returned,Profit Margin Percentage,Returned $ Amount,Monday,...,Sub-Category_Machines,Sub-Category_Paper,Sub-Category_Phones,Sub-Category_Storage,Sub-Category_Supplies,Sub-Category_Tables,Order Priority_Critical,Order Priority_High,Order Priority_Low,Order Priority_Medium
0,221.98,2,0.0,62.15,40.77,2,0.0,27.998018,0.0,0,...,0,0,1,0,0,0,0,1,0,0
1,3709.4,9,0.1,-288.77,923.63,2,0.0,-7.784817,0.0,0,...,0,0,0,0,0,0,1,0,0,0
2,5175.17,9,0.1,919.97,915.49,1,0.0,17.776614,0.0,0,...,0,0,1,0,0,0,0,0,0,1
3,2892.51,5,0.1,-96.54,910.16,2,0.0,-3.337586,0.0,0,...,0,0,1,0,0,0,0,0,0,1
4,2832.96,8,0.0,311.52,903.04,1,0.0,10.996272,0.0,0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
# save the ML prepped ML_df to a csv
ML_df.to_csv('resources/superstore_ML_prepped.csv')

# Linear Regression

### Sales

In [13]:
# set target (sales) and features
y = ML_df['Sales']
X = ML_df.drop(columns='Sales')

In [14]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [15]:
ML_df.dtypes

Sales                      float64
Quantity                     int64
Discount                   float64
Profit                     float64
Shipping Cost              float64
                            ...   
Sub-Category_Tables          uint8
Order Priority_Critical      uint8
Order Priority_High          uint8
Order Priority_Low           uint8
Order Priority_Medium        uint8
Length: 72, dtype: object

In [16]:
# create a LinearRegression instance
model = LinearRegression()

In [17]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [18]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [19]:
# examine the coefficients and the intercept
print(model.coef_)
print(model.intercept_)

[ 2.61688413e+01 -7.90139097e+01  6.41737124e-01  5.05999230e+00
  1.26363162e-01 -7.54073427e+01 -7.85980120e-01  2.75360111e-01
  3.14796307e+11  3.14796307e+11  3.14796307e+11  3.14796307e+11
  3.14796307e+11  3.14796307e+11  3.14796307e+11 -7.56500244e-01
 -4.76068497e-01 -1.36482560e+11 -1.36482560e+11 -1.36482560e+11
 -1.36482560e+11 -1.88588670e+10 -1.88588670e+10 -1.88588670e+10
  1.63175700e+11  1.63175700e+11  1.63175700e+11  1.63175700e+11
  1.63175700e+11  1.63175700e+11  1.63175700e+11  1.63175700e+11
  1.63175700e+11  1.63175700e+11  1.63175700e+11  1.63175700e+11
  1.63175700e+11  1.63175700e+11  1.63175700e+11  1.63175700e+11
  1.63175700e+11  1.63175700e+11  1.63175700e+11  1.63175700e+11
  1.63175700e+11  1.63175700e+11  1.63175700e+11  7.43084387e+11
  4.18197779e+11  4.77404049e+10  3.79843384e+11  9.38600929e+09
  9.38600912e+09  9.38600911e+09 -3.15500598e+11 -3.15500599e+11
  3.79843384e+11  9.38600912e+09  9.38600910e+09 -3.15500599e+11
  9.38600910e+09  3.79843

In [20]:
# calculate the r-squared
r2_lin_sales = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')

The r2 score for the linear regression model for sales is 0.7038318382230757


### Profit

In [21]:
# set target (profit) and features
y = ML_df['Profit']
X = ML_df.drop(columns='Profit')

In [22]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [23]:
# create a LinearRegression instance
model = LinearRegression()

In [24]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [25]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [26]:
# examine the coefficients and the intercept
print(model.coef_)
print(model.intercept_)

[ 1.64636749e-01 -4.43658818e+00  2.82824419e+01  1.41739652e-01
  7.94036257e-01 -1.71238103e+01  1.39110765e+00  7.74202455e-02
  8.11461300e-01 -1.17481405e+00 -1.92658314e+00 -5.41892856e-01
  2.28276908e+00  3.50510258e-01  1.98549408e-01  1.85473278e-01
 -1.20614684e-01  1.65257596e-01  1.10797663e+00 -1.27560229e+00
  2.36806092e-03 -4.46446731e-01  2.00580585e-01  2.45866146e-01
 -3.68616502e+00 -6.49365199e+00 -4.10607650e+00 -6.81714929e+00
  5.82054654e+01  4.83509786e+00  1.28827349e+01 -6.51553890e-02
 -4.77126275e+00 -1.42454244e+01 -7.51132141e+00 -6.28148423e-01
 -1.71034214e+01 -3.02365136e+00 -1.74997268e+01 -9.35204776e+00
 -9.50925335e-01 -1.71009999e+01 -1.40068431e+01  6.19648485e+01
  2.00836233e+01 -1.53076133e+01 -1.53021859e+01 -2.95705591e+01
  2.12068971e+01  8.36366203e+00  1.66270443e+01 -8.58527756e+00
  5.12421626e+00  1.36437754e+01  2.06133978e+01  2.70202021e+01
  1.18906899e+01  4.77112722e+00  5.63417145e+00  5.48738660e+01
  3.36614762e+00 -2.04273

In [27]:
# calculate the r-squared
r2_lin_profit = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')

The r2 score for the linear regression model for profit is 0.36553572728733164


# Logistic Regression

### Returned?

In [28]:
# set target (Returned) and features
y = ML_df['Returned']
X = ML_df.drop(columns=['Returned', 'Returned $ Amount'])

In [29]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
Counter(y_train)

Counter({0.0: 36805, 1.0: 1662})

In [30]:
# create the log regression model 
classifier = LogisticRegression()
classifier

LogisticRegression()

In [31]:
# Train the data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [32]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
14799,0.0,0.0
13808,0.0,0.0
1113,0.0,0.0
25534,0.0,0.0
8472,0.0,0.0
...,...,...
15959,0.0,0.0
21653,0.0,0.0
27177,0.0,0.0
21388,0.0,0.0


In [33]:
confusion_matrix(y_test, predictions)

array([[12265,     0],
       [  558,     0]], dtype=int64)

In [34]:
acc_score = accuracy_score(y_test, predictions)
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')

The accuracy score for the un-altered logistic regression is 0.9564844420182484


### Random Under Sampling

In [36]:
# random under sample the train data sets
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train, y_train)
Counter(y_rus)

Counter({0.0: 1662, 1.0: 1662})

In [37]:
# initiate new model & fit to the randomly oversampled train data
model= LogisticRegression()
model.fit(X_rus, y_rus)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [38]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6258, 6007],
       [ 255,  303]], dtype=int64)

In [39]:
# balanced accuracy score
rus_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The balanced accuracy score for the randomly under sampled logistic regression is 0.5266215606082523
