In [1]:
# import dependencies
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, balanced_accuracy_score
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# read in our cleaned, merged data
ML_df = pd.read_csv('resources/superstore.csv', index_col=[0])
ML_df

Unnamed: 0_level_0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Days to Ship,Returned,Profit Margin Percentage,Returned $ Amount
Order ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,221.98,2,0.0,62.15,40.770,High,2,0.0,27.998018,0.0
IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,3709.40,9,0.1,-288.77,923.630,Critical,2,0.0,-7.784817,0.0
IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,5175.17,9,0.1,919.97,915.490,Medium,1,0.0,17.776614,0.0
ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,2892.51,5,0.1,-96.54,910.160,Medium,2,0.0,-3.337586,0.0
SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,Technology,...,2832.96,8,0.0,311.52,903.040,Critical,1,0.0,10.996272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IN-2015-KE1642066-42174,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,Office Supplies,...,65.10,5,0.0,4.50,1.010,Medium,0,0.0,6.912442,0.0
US-2014-ZD21925140-41765,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,16.72,5,0.2,3.34,1.930,High,4,0.0,19.976077,0.0
CA-2012-ZD21925140-41147,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,8.56,2,0.0,2.48,1.580,High,5,0.0,28.971963,0.0
MX-2013-RB1979518-41322,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,13.44,2,0.0,2.40,1.003,Medium,4,0.0,17.857143,0.0


# Preprocessing Data

In [3]:
# Order & Ship Date to datetime
ML_df['Order Date'] = pd.to_datetime(ML_df['Order Date'])
ML_df['Ship Date'] = pd.to_datetime(ML_df['Ship Date'])

# reset index & drop Order ID
ML_df = ML_df.reset_index()
ML_df = ML_df.drop(columns='Order ID')
ML_df

Unnamed: 0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Days to Ship,Returned,Profit Margin Percentage,Returned $ Amount
0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,221.98,2,0.0,62.15,40.770,High,2,0.0,27.998018,0.0
1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,3709.40,9,0.1,-288.77,923.630,Critical,2,0.0,-7.784817,0.0
2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,5175.17,9,0.1,919.97,915.490,Medium,1,0.0,17.776614,0.0
3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,2892.51,5,0.1,-96.54,910.160,Medium,2,0.0,-3.337586,0.0
4,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,Technology,...,2832.96,8,0.0,311.52,903.040,Critical,1,0.0,10.996272,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,Office Supplies,...,65.10,5,0.0,4.50,1.010,Medium,0,0.0,6.912442,0.0
51286,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,16.72,5,0.2,3.34,1.930,High,4,0.0,19.976077,0.0
51287,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,8.56,2,0.0,2.48,1.580,High,5,0.0,28.971963,0.0
51288,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,13.44,2,0.0,2.40,1.003,Medium,4,0.0,17.857143,0.0


### Day of Week

In [4]:
# get day of week for order date
# runs 0 - 6; 0=Monday, 6=Sunday
ML_df['Order Day'] = ML_df['Order Date'].dt.dayofweek
ML_df['Order Day'].head()

0    1
1    2
2    4
3    1
4    2
Name: Order Day, dtype: int64

In [5]:
# move Order Day to separate df, will encode
day_dummies = pd.get_dummies(ML_df['Order Day']).reset_index()
day_dummies

Unnamed: 0,index,0,1,2,3,4,5,6
0,0,0,1,0,0,0,0,0
1,1,0,0,1,0,0,0,0
2,2,0,0,0,0,1,0,0
3,3,0,1,0,0,0,0,0
4,4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
51285,51285,0,0,0,0,1,0,0
51286,51286,0,1,0,0,0,0,0
51287,51287,0,0,0,0,0,0,1
51288,51288,0,0,0,0,0,0,1


In [6]:
# rename 0-6 index to days of week
day_dummies = day_dummies.drop(columns='index')
day_dummies.columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_dummies

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
51285,0,0,0,0,1,0,0
51286,0,1,0,0,0,0,0
51287,0,0,0,0,0,0,1
51288,0,0,0,0,0,0,1


In [7]:
# concat the encoded days of week to end of the ML_df
ML_df = ML_df.reset_index()
ML_df = pd.concat([ML_df, day_dummies], axis=1, join='inner')
ML_df

Unnamed: 0,index,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,...,Profit Margin Percentage,Returned $ Amount,Order Day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,...,27.998018,0.0,1,0,1,0,0,0,0,0
1,1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,...,-7.784817,0.0,2,0,0,1,0,0,0,0
2,2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,...,17.776614,0.0,4,0,0,0,0,1,0,0
3,3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,...,-3.337586,0.0,1,0,1,0,0,0,0,0
4,4,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,...,10.996272,0.0,2,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,51285,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,...,6.912442,0.0,4,0,0,0,0,1,0,0
51286,51286,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,...,19.976077,0.0,1,0,1,0,0,0,0,0
51287,51287,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,...,28.971963,0.0,6,0,0,0,0,0,0,1
51288,51288,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,...,17.857143,0.0,6,0,0,0,0,0,0,1


In [8]:
ML_df.columns

Index(['index', 'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'City',
       'State', 'Country', 'Region', 'Market', 'Category', 'Sub-Category',
       'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost',
       'Order Priority', 'Days to Ship', 'Returned',
       'Profit Margin Percentage', 'Returned $ Amount', 'Order Day', 'Monday',
       'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object')

### Cleaning Up Columns

In [9]:
# pull year & month from the Order Date
ML_df['Order Year'] = ML_df['Order Date'].dt.year
ML_df['Order Month'] = ML_df['Order Date'].dt.month

# drop columns
# Order Day - encoded to separate columns
# Order Date - pulled to year & month
# Ship Date - redundant; order info & days to ship
# City & State & Country - too many unqiue values
# Market - redunant with Region (which is more specific)
ML_df = ML_df.drop(columns=['Order Day', 'Order Date', 'Ship Date', 'City', 'State', 'Country','Market', 'index'])
ML_df.head()

Unnamed: 0,Ship Mode,Segment,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit,Shipping Cost,...,Returned $ Amount,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Order Year,Order Month
0,First Class,Consumer,Central US,Technology,Phones,221.98,2,0.0,62.15,40.77,...,0.0,0,1,0,0,0,0,0,2014,11
1,Second Class,Corporate,Oceania,Furniture,Chairs,3709.4,9,0.1,-288.77,923.63,...,0.0,0,0,1,0,0,0,0,2014,2
2,First Class,Consumer,Oceania,Technology,Phones,5175.17,9,0.1,919.97,915.49,...,0.0,0,0,0,0,1,0,0,2014,10
3,First Class,Home Office,Western Europe,Technology,Phones,2892.51,5,0.1,-96.54,910.16,...,0.0,0,1,0,0,0,0,0,2014,1
4,Same Day,Consumer,Western Africa,Technology,Copiers,2832.96,8,0.0,311.52,903.04,...,0.0,0,0,1,0,0,0,0,2014,11


In [10]:
ML_df.dtypes

Ship Mode                    object
Segment                      object
Region                       object
Category                     object
Sub-Category                 object
Sales                       float64
Quantity                      int64
Discount                    float64
Profit                      float64
Shipping Cost               float64
Order Priority               object
Days to Ship                  int64
Returned                    float64
Profit Margin Percentage    float64
Returned $ Amount           float64
Monday                        uint8
Tuesday                       uint8
Wednesday                     uint8
Thursday                      uint8
Friday                        uint8
Saturday                      uint8
Sunday                        uint8
Order Year                    int64
Order Month                   int64
dtype: object

In [11]:
# encode Ship Mode, Segment, Region, Sub/Category, Order Priority
ML_df = pd.get_dummies(ML_df, columns=['Ship Mode', 'Segment', 'Region', 'Category', 'Sub-Category', 'Order Priority'])
ML_df.head()

Unnamed: 0,Sales,Quantity,Discount,Profit,Shipping Cost,Days to Ship,Returned,Profit Margin Percentage,Returned $ Amount,Monday,...,Sub-Category_Machines,Sub-Category_Paper,Sub-Category_Phones,Sub-Category_Storage,Sub-Category_Supplies,Sub-Category_Tables,Order Priority_Critical,Order Priority_High,Order Priority_Low,Order Priority_Medium
0,221.98,2,0.0,62.15,40.77,2,0.0,27.998018,0.0,0,...,0,0,1,0,0,0,0,1,0,0
1,3709.4,9,0.1,-288.77,923.63,2,0.0,-7.784817,0.0,0,...,0,0,0,0,0,0,1,0,0,0
2,5175.17,9,0.1,919.97,915.49,1,0.0,17.776614,0.0,0,...,0,0,1,0,0,0,0,0,0,1
3,2892.51,5,0.1,-96.54,910.16,2,0.0,-3.337586,0.0,0,...,0,0,1,0,0,0,0,0,0,1
4,2832.96,8,0.0,311.52,903.04,1,0.0,10.996272,0.0,0,...,0,0,0,0,0,0,1,0,0,0


In [12]:
# save the ML prepped ML_df to a csv
ML_df.to_csv('resources/superstore_ML_prepped.csv')

# Linear Regression

### Sales

In [13]:
# set target (sales) and features
y = ML_df['Sales']
X = ML_df.drop(columns='Sales')

# get columns to create coefficient df
sales_columns = X.columns
sales_columns = pd.Series(sales_columns)

In [14]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [15]:
# create a LinearRegression instance
model = LinearRegression()

In [16]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [17]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [18]:
# examine the coefficients and the intercept - create series for both
sales_coeffs = model.coef_
sales_coeffs = pd.Series(sales_coeffs)

sales_intercept = model.intercept_

In [19]:
# create df with variables and its coefficient
data = {"Variables" : sales_columns,
       "Coefficients" : sales_coeffs}
sales_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
sales_coeff_df

Unnamed: 0,Variables,Coefficients
0,Quantity,26.16884
1,Discount,-79.01391
2,Profit,0.6417371
3,Shipping Cost,5.059992
4,Days to Ship,0.1263632
5,Returned,-75.40734
6,Profit Margin Percentage,-0.7859801
7,Returned $ Amount,0.2753601
8,Monday,314796300000.0
9,Tuesday,314796300000.0


In [20]:
print(f' The intercept for this model is {sales_intercept}')

 The intercept for this model is -840135528473.2665


In [21]:
# calculate the r-squared
r2_lin_sales = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')

The r2 score for the linear regression model for sales is 0.7038318382230757


### Profit

In [22]:
# set target (profit) and features
y = ML_df['Profit']
X = ML_df.drop(columns='Profit')

# get columns to create coefficient df
profit_columns = X.columns
profit_columns = pd.Series(profit_columns)

In [23]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [24]:
# create a LinearRegression instance
model = LinearRegression()

In [25]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [26]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [27]:
# examine the coefficients and the intercept - create series for both
profit_coeffs = model.coef_
profit_coeffs = pd.Series(profit_coeffs)

profit_intercept = model.intercept_

In [28]:
# create df with variables and its coefficient
data = {"Variables" : profit_columns,
       "Coefficients" : profit_coeffs}
profit_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
profit_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,0.164637
1,Quantity,-4.436588
2,Discount,28.282442
3,Shipping Cost,0.14174
4,Days to Ship,0.794036
5,Returned,-17.12381
6,Profit Margin Percentage,1.391108
7,Returned $ Amount,0.07742
8,Monday,0.811461
9,Tuesday,-1.174814


In [29]:
print(f' The intercept for this model is {profit_intercept}')

 The intercept for this model is -398.86571665462026


In [30]:
# calculate the r-squared
r2_lin_profit = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')

The r2 score for the linear regression model for profit is 0.36553572728733164


### Shipping Cost

In [31]:
# set target (shipping cost) and features
y = ML_df['Shipping Cost']
X = ML_df.drop(columns='Shipping Cost')

# get columns to create coefficient df
shipping_columns = X.columns
shipping_columns = pd.Series(shipping_columns)

In [32]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [33]:
# create a LinearRegression instance
model = LinearRegression()

In [34]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [35]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [36]:
# examine the coefficients and the intercept - create series for both
shipping_coeffs = model.coef_
shipping_coeffs = pd.Series(shipping_coeffs)

shipping_intercept = model.intercept_

In [37]:
# create df with variables and its coefficient
data = {"Variables" : shipping_columns,
       "Coefficients" : shipping_coeffs}
shipping_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
shipping_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,0.083824
1,Quantity,1.262951
2,Discount,-3.323946
3,Profit,0.009152
4,Days to Ship,-0.327681
5,Returned,1.95588
6,Profit Margin Percentage,-0.003952
7,Returned $ Amount,-0.002139
8,Monday,-0.18425
9,Tuesday,-0.173443


In [38]:
print(f' The intercept for this model is {shipping_intercept}')

 The intercept for this model is -40.857013728766596


In [39]:
# calculate the r-squared
r2_lin_shipping = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for shipping costs is {r2_lin_shipping}')

The r2 score for the linear regression model for shipping costs is 0.6029476282063644


# Logistic Regression

### Returned?

In [40]:
# set target (Returned) and features
y = ML_df['Returned']
X = ML_df.drop(columns=['Returned', 'Returned $ Amount'])

# get columns to create coefficient df
returned_columns = X.columns
returned_columns = pd.Series(returned_columns)

In [41]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
returned_split = Counter(y_train)
returned_split

Counter({0.0: 36805, 1.0: 1662})

In [42]:
# create the log regression model 
model = LogisticRegression()

In [43]:
# Train the data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [44]:
# Predict outcomes for test data set
y_pred = model.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

Unnamed: 0,Prediction,Actual
14799,0.0,0.0
13808,0.0,0.0
1113,0.0,0.0
25534,0.0,0.0
8472,0.0,0.0
33714,0.0,0.0
11647,0.0,0.0
49786,0.0,0.0
9252,0.0,0.0
38299,0.0,0.0


In [45]:
print(model.coef_)

[[-3.08869779e-04  5.77077195e-03  6.32508956e-03  5.80527516e-05
   1.97403024e-03  4.30684348e-02  4.92005584e-04  3.83052436e-03
   2.98698947e-03  1.30856987e-02 -7.58814123e-03 -8.44046137e-03
  -1.01461145e-02  6.27908961e-03 -1.66823736e-03  9.16538968e-03
  -3.74952867e-03  8.23766904e-04 -2.62711437e-02  2.92044904e-02
   2.36465340e-02 -8.24630413e-03 -1.53926449e-02 -1.27562848e-03
  -4.00413282e-03 -6.18383672e-03  6.43129847e-03  5.58407305e-04
  -1.26996641e-02 -6.25041355e-03  1.81847274e-02 -1.10458770e-02
   1.88298938e-03 -1.14923186e-03 -1.35371708e-02  2.74266199e-04
   2.30817395e-03  4.56489818e-03  1.90305899e-03  1.58239491e-03
   1.11393164e-02  3.81767869e-03 -1.10401790e-03  2.40038951e-03
  -1.12446597e-02  1.34546185e-02 -5.78369115e-03 -4.30113131e-03
   1.00924074e-02  4.88131079e-03 -6.02061537e-03  5.54876825e-03
  -1.33044567e-03 -1.26812612e-04 -4.66358856e-03  3.97969376e-03
  -5.91144057e-04 -4.74745355e-03 -3.45066102e-03  1.06317139e-02
   2.60175

In [46]:
# examine the coefficients and the intercept - create series for both
returned_coeffs = model.coef_
returned_coeffs = pd.Series(returned_coeffs.flatten())

returned_intercept = model.intercept_

In [47]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_coeffs}
returned_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000309
1,Quantity,0.005771
2,Discount,0.006325
3,Profit,5.8e-05
4,Shipping Cost,0.001974
5,Days to Ship,0.043068
6,Profit Margin Percentage,0.000492
7,Monday,0.003831
8,Tuesday,0.002987
9,Wednesday,0.013086


In [48]:
print(f' The intercept for this model is {returned_intercept}')

 The intercept for this model is [7.58493859e-06]


In [49]:
confusion_matrix(y_test, y_pred)

array([[12265,     0],
       [  558,     0]], dtype=int64)

In [50]:
acc_score = accuracy_score(y_test, y_pred)
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')

The accuracy score for the un-altered logistic regression is 0.9564844420182484


### Random Under Sampling

In [51]:
# random under sample the train data sets
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train, y_train)
Counter(y_rus)

Counter({0.0: 1662, 1.0: 1662})

In [52]:
# initiate new model & fit to the randomly oversampled train data
model = LogisticRegression()
model.fit(X_rus, y_rus)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [53]:
# examine the coefficients and the intercept - create series for both
returned2_coeffs = model.coef_
returned2_coeffs = pd.Series(returned2_coeffs.flatten())
print(returned2_coeffs)

returned2_intercept = model.intercept_
#sales_intercept = pd.Series(sales_intercept)
print(returned2_intercept)

0    -0.000308
1     0.037007
2     0.008770
3     0.000249
4     0.001834
5     0.020165
6    -0.000287
7     0.004694
8    -0.004746
9     0.017439
10   -0.014571
11   -0.007865
12   -0.011728
13    0.016798
14   -0.000101
15    0.000366
16   -0.005280
17    0.007168
18   -0.021448
19    0.019580
20    0.012780
21   -0.009802
22   -0.002958
23   -0.000922
24   -0.006935
25   -0.005666
26   -0.001632
27    0.001588
28   -0.010619
29   -0.009752
30    0.019635
31   -0.017046
32    0.001291
33    0.002769
34   -0.005718
35    0.006022
36    0.003892
37    0.003647
38   -0.000074
39    0.000053
40    0.012901
41    0.004595
42   -0.003246
43    0.000910
44   -0.010124
45    0.014450
46   -0.008639
47   -0.001443
48    0.010102
49   -0.001070
50   -0.001977
51    0.006234
52   -0.004599
53    0.003250
54   -0.004118
55    0.000551
56   -0.003336
57   -0.001802
58   -0.008571
59    0.008429
60    0.001844
61   -0.006006
62    0.008777
63    0.000203
64    0.001411
65    0.000800
66   -0.00

In [54]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned2_coeffs}
returned2_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned2_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000308
1,Quantity,0.037007
2,Discount,0.00877
3,Profit,0.000249
4,Shipping Cost,0.001834
5,Days to Ship,0.020165
6,Profit Margin Percentage,-0.000287
7,Monday,0.004694
8,Tuesday,-0.004746
9,Wednesday,0.017439


In [55]:
print(f' The intercept for this model is {returned2_intercept}')

 The intercept for this model is [1.99951889e-05]


In [56]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6877, 5388],
       [ 295,  263]], dtype=int64)

In [57]:
# balanced accuracy score
rus_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The balanced accuracy score for the randomly under sampled logistic regression is 0.5160136735501989


# Results

In [58]:
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')
print(f'The r2 score for the linear regression model for shipping costs is {r2_lin_shipping}\n')
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')
print(f'The split between not returned (0) and returned (1) for the un-altered regression is {returned_split}\n')
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The r2 score for the linear regression model for sales is 0.7038318382230757
The r2 score for the linear regression model for profit is 0.36553572728733164
The r2 score for the linear regression model for shipping costs is 0.6029476282063644

The accuracy score for the un-altered logistic regression is 0.9564844420182484
The split between not returned (0) and returned (1) for the un-altered regression is Counter({0.0: 36805, 1.0: 1662})

The balanced accuracy score for the randomly under sampled logistic regression is 0.5160136735501989
