In [1]:
# import dependencies
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, balanced_accuracy_score
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# read in our cleaned, merged data
ML_df = pd.read_csv('../resources/superstore.csv', index_col=[0])
ML_df

Unnamed: 0_level_0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship
Order ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,221.98,2,0.0,62.15,40.770,High,0.0,27.998018,0.0,2
IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,3709.40,9,0.1,-288.77,923.630,Critical,0.0,-7.784817,0.0,2
IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,5175.17,9,0.1,919.97,915.490,Medium,0.0,17.776614,0.0,1
ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,2892.51,5,0.1,-96.54,910.160,Medium,0.0,-3.337586,0.0,2
SG-2014-RH9495111-41948,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,Technology,...,2832.96,8,0.0,311.52,903.040,Critical,0.0,10.996272,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
IN-2015-KE1642066-42174,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,Office Supplies,...,65.10,5,0.0,4.50,1.010,Medium,0.0,6.912442,0.0,0
US-2014-ZD21925140-41765,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,16.72,5,0.2,3.34,1.930,High,0.0,19.976077,0.0,4
CA-2012-ZD21925140-41147,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,8.56,2,0.0,2.48,1.580,High,0.0,28.971963,0.0,5
MX-2013-RB1979518-41322,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,13.44,2,0.0,2.40,1.003,Medium,0.0,17.857143,0.0,4


# Preprocessing Data

In [3]:
# Order & Ship Date to datetime
ML_df['Order Date'] = pd.to_datetime(ML_df['Order Date'])
ML_df['Ship Date'] = pd.to_datetime(ML_df['Ship Date'])

# reset index & drop Order ID
ML_df = ML_df.reset_index()
ML_df = ML_df.drop(columns='Order ID')
ML_df

Unnamed: 0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship
0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,221.98,2,0.0,62.15,40.770,High,0.0,27.998018,0.0,2
1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,3709.40,9,0.1,-288.77,923.630,Critical,0.0,-7.784817,0.0,2
2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,5175.17,9,0.1,919.97,915.490,Medium,0.0,17.776614,0.0,1
3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,2892.51,5,0.1,-96.54,910.160,Medium,0.0,-3.337586,0.0,2
4,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,Technology,...,2832.96,8,0.0,311.52,903.040,Critical,0.0,10.996272,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,Office Supplies,...,65.10,5,0.0,4.50,1.010,Medium,0.0,6.912442,0.0,0
51286,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,16.72,5,0.2,3.34,1.930,High,0.0,19.976077,0.0,4
51287,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,8.56,2,0.0,2.48,1.580,High,0.0,28.971963,0.0,5
51288,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,13.44,2,0.0,2.40,1.003,Medium,0.0,17.857143,0.0,4


### Day of Week

In [4]:
# get day of week for order date
# runs 0 - 6; 0=Monday, 6=Sunday
ML_df['Order Day'] = ML_df['Order Date'].dt.dayofweek
ML_df['Order Day'].head()

0    1
1    2
2    4
3    1
4    2
Name: Order Day, dtype: int64

In [5]:
# move Order Day to separate df, will encode
day_dummies = pd.get_dummies(ML_df['Order Day']).reset_index()
day_dummies

Unnamed: 0,index,0,1,2,3,4,5,6
0,0,0,1,0,0,0,0,0
1,1,0,0,1,0,0,0,0
2,2,0,0,0,0,1,0,0
3,3,0,1,0,0,0,0,0
4,4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
51285,51285,0,0,0,0,1,0,0
51286,51286,0,1,0,0,0,0,0
51287,51287,0,0,0,0,0,0,1
51288,51288,0,0,0,0,0,0,1


In [6]:
# rename 0-6 index to days of week
day_dummies = day_dummies.drop(columns='index')
day_dummies.columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_dummies

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
51285,0,0,0,0,1,0,0
51286,0,1,0,0,0,0,0
51287,0,0,0,0,0,0,1
51288,0,0,0,0,0,0,1


In [7]:
# concat the encoded days of week to end of the ML_df
ML_df = ML_df.reset_index()
ML_df = pd.concat([ML_df, day_dummies], axis=1, join='inner')
ML_df

Unnamed: 0,index,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,...,Returned $ Amount,Days to Ship,Order Day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,...,0.0,2,1,0,1,0,0,0,0,0
1,1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,...,0.0,2,2,0,0,1,0,0,0,0
2,2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,...,0.0,1,4,0,0,0,0,1,0,0
3,3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,...,0.0,2,1,0,1,0,0,0,0,0
4,4,2014-11-05,2014-11-06,Same Day,Consumer,Dakar,Dakar,Senegal,Western Africa,Africa,...,0.0,1,2,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,51285,2015-06-19,2015-06-19,Same Day,Corporate,Kure,Hiroshima,Japan,Eastern Asia,Asia Pacific,...,0.0,0,4,0,0,0,0,1,0,0
51286,51286,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,...,0.0,4,1,0,1,0,0,0,0,0
51287,51287,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,...,0.0,5,6,0,0,0,0,0,0,1
51288,51288,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,...,0.0,4,6,0,0,0,0,0,0,1


In [8]:
ML_df.columns

Index(['index', 'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'City',
       'State', 'Country', 'Region', 'Market', 'Category', 'Sub-Category',
       'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost',
       'Order Priority', 'Returned', 'Profit Margin Percentage',
       'Returned $ Amount', 'Days to Ship', 'Order Day', 'Monday', 'Tuesday',
       'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
      dtype='object')

### Cleaning Up Columns

In [9]:
# pull year & month from the Order Date
ML_df['Order Year'] = ML_df['Order Date'].dt.year
ML_df['Order Month'] = ML_df['Order Date'].dt.month

# drop columns
# Order Day - encoded to separate columns
# Order Date - pulled to year & month
# Ship Date - redundant; order info & days to ship
# City & State & Country - too many unqiue values
# Market - redunant with Region (which is more specific)
ML_df = ML_df.drop(columns=['Order Day', 'Order Date', 'Ship Date', 'City', 'State', 'Country','Market', 'index'])
ML_df.head()

Unnamed: 0,Ship Mode,Segment,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit,Shipping Cost,...,Days to Ship,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Order Year,Order Month
0,First Class,Consumer,Central US,Technology,Phones,221.98,2,0.0,62.15,40.77,...,2,0,1,0,0,0,0,0,2014,11
1,Second Class,Corporate,Oceania,Furniture,Chairs,3709.4,9,0.1,-288.77,923.63,...,2,0,0,1,0,0,0,0,2014,2
2,First Class,Consumer,Oceania,Technology,Phones,5175.17,9,0.1,919.97,915.49,...,1,0,0,0,0,1,0,0,2014,10
3,First Class,Home Office,Western Europe,Technology,Phones,2892.51,5,0.1,-96.54,910.16,...,2,0,1,0,0,0,0,0,2014,1
4,Same Day,Consumer,Western Africa,Technology,Copiers,2832.96,8,0.0,311.52,903.04,...,1,0,0,1,0,0,0,0,2014,11


In [10]:
# encode Ship Mode, Segment, Region, Sub/Category, Order Priority
ML_df = pd.get_dummies(ML_df, columns=['Ship Mode', 'Segment', 'Region', 'Category', 'Sub-Category', 'Order Priority'])
ML_df.head()

Unnamed: 0,Sales,Quantity,Discount,Profit,Shipping Cost,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship,Monday,...,Sub-Category_Machines,Sub-Category_Paper,Sub-Category_Phones,Sub-Category_Storage,Sub-Category_Supplies,Sub-Category_Tables,Order Priority_Critical,Order Priority_High,Order Priority_Low,Order Priority_Medium
0,221.98,2,0.0,62.15,40.77,0.0,27.998018,0.0,2,0,...,0,0,1,0,0,0,0,1,0,0
1,3709.4,9,0.1,-288.77,923.63,0.0,-7.784817,0.0,2,0,...,0,0,0,0,0,0,1,0,0,0
2,5175.17,9,0.1,919.97,915.49,0.0,17.776614,0.0,1,0,...,0,0,1,0,0,0,0,0,0,1
3,2892.51,5,0.1,-96.54,910.16,0.0,-3.337586,0.0,2,0,...,0,0,1,0,0,0,0,0,0,1
4,2832.96,8,0.0,311.52,903.04,0.0,10.996272,0.0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
# save the ML prepped ML_df to a csv
ML_df.to_csv('../resources/superstore_ML_prepped.csv')

# Linear Regression

### Sales

In [12]:
# set target (sales) and features
y = ML_df['Sales']
X = ML_df.drop(columns='Sales')

# get columns to create coefficient df
sales_columns = X.columns
sales_columns = pd.Series(sales_columns)

In [13]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [14]:
# create a LinearRegression instance
model = LinearRegression()

In [15]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [16]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [17]:
# examine the coefficients and the intercept - create series for both
sales_coeffs = model.coef_
sales_coeffs = pd.Series(sales_coeffs)

sales_intercept = model.intercept_

In [18]:
# create df with variables and its coefficient
data = {"Variables" : sales_columns,
       "Coefficients" : sales_coeffs}
sales_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
sales_coeff_df

Unnamed: 0,Variables,Coefficients
0,Quantity,26.16921
1,Discount,-79.01536
2,Profit,0.6417341
3,Shipping Cost,5.059989
4,Returned,-75.40537
5,Profit Margin Percentage,-0.7859676
6,Returned $ Amount,0.2753622
7,Days to Ship,0.1264029
8,Monday,-238127900000.0
9,Tuesday,-238127900000.0


In [19]:
print(f' The intercept for this model is {sales_intercept}')

 The intercept for this model is 1657811840257.3862


In [20]:
# calculate the r-squared
r2_lin_sales = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')

The r2 score for the linear regression model for sales is 0.7038316921464297


### Profit

In [21]:
# set target (profit) and features
y = ML_df['Profit']
X = ML_df.drop(columns='Profit')

# get columns to create coefficient df
profit_columns = X.columns
profit_columns = pd.Series(profit_columns)

In [22]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [23]:
# create a LinearRegression instance
model = LinearRegression()

In [24]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [25]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [26]:
# examine the coefficients and the intercept - create series for both
profit_coeffs = model.coef_
profit_coeffs = pd.Series(profit_coeffs)

profit_intercept = model.intercept_

In [27]:
# create df with variables and its coefficient
data = {"Variables" : profit_columns,
       "Coefficients" : profit_coeffs}
profit_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
profit_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,0.1646364
1,Quantity,-4.436546
2,Discount,28.28323
3,Shipping Cost,0.1417407
4,Returned,-17.12442
5,Profit Margin Percentage,1.391111
6,Returned $ Amount,0.07742088
7,Days to Ship,0.794041
8,Monday,19092200000.0
9,Tuesday,19092200000.0


In [28]:
print(f' The intercept for this model is {profit_intercept}')

 The intercept for this model is 239698565265.81296


In [29]:
# calculate the r-squared
r2_lin_profit = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')

The r2 score for the linear regression model for profit is 0.3655354942459712


### Shipping Cost

In [30]:
# set target (shipping cost) and features
y = ML_df['Shipping Cost']
X = ML_df.drop(columns='Shipping Cost')

# get columns to create coefficient df
shipping_columns = X.columns
shipping_columns = pd.Series(shipping_columns)

In [31]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(38467, 71)

In [32]:
# create a LinearRegression instance
model = LinearRegression()

In [33]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [34]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [35]:
# examine the coefficients and the intercept - create series for both
shipping_coeffs = model.coef_
shipping_coeffs = pd.Series(shipping_coeffs)

shipping_intercept = model.intercept_

In [36]:
# create df with variables and its coefficient
data = {"Variables" : shipping_columns,
       "Coefficients" : shipping_coeffs}
shipping_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
shipping_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,0.083824
1,Quantity,1.262951
2,Discount,-3.323946
3,Profit,0.009152
4,Returned,1.95588
5,Profit Margin Percentage,-0.003952
6,Returned $ Amount,-0.002139
7,Days to Ship,-0.327681
8,Monday,-0.18425
9,Tuesday,-0.173443


In [37]:
print(f' The intercept for this model is {shipping_intercept}')

 The intercept for this model is -40.85701372876658


In [38]:
# calculate the r-squared
r2_lin_shipping = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for shipping costs is {r2_lin_shipping}')

The r2 score for the linear regression model for shipping costs is 0.6029476282063628


# Logistic Regression

### Returned?

In [39]:
# set target (Returned) and features
y = ML_df['Returned']
X = ML_df.drop(columns=['Returned', 'Returned $ Amount'])

# get columns to create coefficient df
returned_columns = X.columns
returned_columns = pd.Series(returned_columns)

In [40]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
returned_split = Counter(y_train)
returned_split

Counter({0.0: 36805, 1.0: 1662})

In [41]:
# create the log regression model 
model = LogisticRegression()

In [42]:
# Train the data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [43]:
# Predict outcomes for test data set
y_pred = model.predict(X_test)
pred_actual = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [44]:
# examine the coefficients and the intercept - create series for both
returned_coeffs = model.coef_
returned_coeffs = pd.Series(returned_coeffs.flatten())

returned_intercept = model.intercept_

In [45]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_coeffs}
returned_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000309
1,Quantity,0.005771
2,Discount,0.006325
3,Profit,5.8e-05
4,Shipping Cost,0.001974
5,Profit Margin Percentage,0.000492
6,Days to Ship,0.043067
7,Monday,0.00383
8,Tuesday,0.002987
9,Wednesday,0.013085


In [46]:
print(f' The intercept for this model is {returned_intercept}')

 The intercept for this model is [7.58464319e-06]


In [47]:
confusion_matrix(y_test, y_pred)

array([[12265,     0],
       [  558,     0]], dtype=int64)

In [48]:
acc_score = accuracy_score(y_test, y_pred)
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')

The accuracy score for the un-altered logistic regression is 0.9564844420182484


### Random Over Sampling

In [49]:
# randomly oversample the minority (returned) class
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X_train, y_train)
Counter(y_ros)

Counter({0.0: 36805, 1.0: 36805})

In [50]:
# initiate new model & fit to the oversampled train data
model= LogisticRegression()
model.fit(X_ros, y_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [51]:
# examine the coefficients and the intercept - create series for both
returned_ros_coeffs = model.coef_
returned_ros_coeffs = pd.Series(returned_ros_coeffs.flatten())

returned_ros_intercept = model.intercept_

In [52]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_ros_coeffs}
returned_ros_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_ros_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000292
1,Quantity,-0.006059
2,Discount,0.035267
3,Profit,4.2e-05
4,Shipping Cost,0.002369
5,Profit Margin Percentage,0.001254
6,Days to Ship,0.016129
7,Monday,0.016277
8,Tuesday,0.027528
9,Wednesday,0.071716


In [53]:
print(f' The intercept for this model is {returned_ros_intercept}')

 The intercept for this model is [6.33623598e-05]


In [54]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6615, 5650],
       [ 272,  286]], dtype=int64)

In [55]:
# balanced accuracy score
ros_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly over sampled logistic regression is {ros_acc_score}')

The balanced accuracy score for the randomly over sampled logistic regression is 0.5259421935250085


### Random Under Sampling

In [56]:
# random under sample the train data sets
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train, y_train)
Counter(y_rus)

Counter({0.0: 1662, 1.0: 1662})

In [57]:
# initiate new model & fit to the randomly oversampled train data
model = LogisticRegression()
model.fit(X_rus, y_rus)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [58]:
# examine the coefficients and the intercept - create series for both
returned_rus_coeffs = model.coef_
returned_rus_coeffs = pd.Series(returned_rus_coeffs.flatten())

returned_rus_intercept = model.intercept_

In [59]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_rus_coeffs}
returned_rus_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_rus_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000201
1,Quantity,-0.000597
2,Discount,0.026104
3,Profit,9.5e-05
4,Shipping Cost,0.001581
5,Profit Margin Percentage,0.000535
6,Days to Ship,-0.044129
7,Monday,0.021219
8,Tuesday,-0.034155
9,Wednesday,0.02706


In [60]:
print(f' The intercept for this model is {returned_rus_intercept}')

 The intercept for this model is [-2.10706858e-05]


In [61]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[6509, 5756],
       [ 316,  242]], dtype=int64)

In [62]:
# balanced accuracy score
rus_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The balanced accuracy score for the randomly under sampled logistic regression is 0.4821944309286997


# Results

In [64]:
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')
print(f'The r2 score for the linear regression model for shipping costs is {r2_lin_shipping}\n')
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')
print(f'The split between not returned (0) and returned (1) for the un-altered regression is {returned_split}\n')
print(f'The balanced accuracy score for the randomly over sampled logistic regression is {ros_acc_score}')
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The r2 score for the linear regression model for sales is 0.7038316921464297
The r2 score for the linear regression model for profit is 0.3655354942459712
The r2 score for the linear regression model for shipping costs is 0.6029476282063628

The accuracy score for the un-altered logistic regression is 0.9564844420182484
The split between not returned (0) and returned (1) for the un-altered regression is Counter({0.0: 36805, 1.0: 1662})

The balanced accuracy score for the randomly over sampled logistic regression is 0.5259421935250085
The balanced accuracy score for the randomly under sampled logistic regression is 0.4821944309286997
