In [1]:
# import dependencies
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix, balanced_accuracy_score
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# read in our cleaned, merged data
ML_df = pd.read_csv('../resources/superstore_topcountries.csv', index_col=[0])
ML_df

Unnamed: 0_level_0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Quantity,Discount,Profit,Shipping Cost,Order Priority,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship,Orders per Country
Order ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA-2014-AB10015140-41954,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,2,0.0,62.15,40.770,High,0.0,27.998018,0.0,2,9994
IN-2014-JR162107-41675,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,9,0.1,-288.77,923.630,Critical,0.0,-7.784817,0.0,2,2837
IN-2014-CR127307-41929,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,9,0.1,919.97,915.490,Medium,0.0,17.776614,0.0,1,2837
ES-2014-KM1637548-41667,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,5,0.1,-96.54,910.160,Medium,0.0,-3.337586,0.0,2,2063
IN-2014-JM156557-41818,2014-06-28,2014-07-01,Second Class,Corporate,Sydney,New South Wales,Australia,Oceania,Asia Pacific,Technology,...,5,0.1,763.28,897.350,Critical,0.0,26.663127,0.0,3,2837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MX-2013-KM1637593-41636,2013-12-28,2013-12-31,First Class,Home Office,Managua,Managua,Nicaragua,Central America,LATAM,Office Supplies,...,1,0.0,8.00,1.010,Medium,0.0,42.918455,0.0,3,614
US-2014-ZD21925140-41765,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,5,0.2,3.34,1.930,High,0.0,19.976077,0.0,4,9994
CA-2012-ZD21925140-41147,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,2,0.0,2.48,1.580,High,0.0,28.971963,0.0,5,9994
MX-2013-RB1979518-41322,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,2,0.0,2.40,1.003,Medium,0.0,17.857143,0.0,4,1593


# Preprocessing Data

In [3]:
# Order & Ship Date to datetime
ML_df['Order Date'] = pd.to_datetime(ML_df['Order Date'])
ML_df['Ship Date'] = pd.to_datetime(ML_df['Ship Date'])

# reset index & drop Order ID
ML_df = ML_df.reset_index()
ML_df = ML_df.drop(columns='Order ID')
ML_df

Unnamed: 0,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,Category,...,Quantity,Discount,Profit,Shipping Cost,Order Priority,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship,Orders per Country
0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,Technology,...,2,0.0,62.15,40.770,High,0.0,27.998018,0.0,2,9994
1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,Furniture,...,9,0.1,-288.77,923.630,Critical,0.0,-7.784817,0.0,2,2837
2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,Technology,...,9,0.1,919.97,915.490,Medium,0.0,17.776614,0.0,1,2837
3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,Technology,...,5,0.1,-96.54,910.160,Medium,0.0,-3.337586,0.0,2,2063
4,2014-06-28,2014-07-01,Second Class,Corporate,Sydney,New South Wales,Australia,Oceania,Asia Pacific,Technology,...,5,0.1,763.28,897.350,Critical,0.0,26.663127,0.0,3,2837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38609,2013-12-28,2013-12-31,First Class,Home Office,Managua,Managua,Nicaragua,Central America,LATAM,Office Supplies,...,1,0.0,8.00,1.010,Medium,0.0,42.918455,0.0,3,614
38610,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,Furniture,...,5,0.2,3.34,1.930,High,0.0,19.976077,0.0,4,9994
38611,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,Office Supplies,...,2,0.0,2.48,1.580,High,0.0,28.971963,0.0,5,9994
38612,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,Office Supplies,...,2,0.0,2.40,1.003,Medium,0.0,17.857143,0.0,4,1593


### Day of Week

In [4]:
# get day of week for order date
# runs 0 - 6; 0=Monday, 6=Sunday
ML_df['Order Day'] = ML_df['Order Date'].dt.dayofweek
ML_df['Order Day'].head()

0    1
1    2
2    4
3    1
4    5
Name: Order Day, dtype: int64

In [5]:
# move Order Day to separate df, will encode
day_dummies = pd.get_dummies(ML_df['Order Day']).reset_index()
day_dummies

Unnamed: 0,index,0,1,2,3,4,5,6
0,0,0,1,0,0,0,0,0
1,1,0,0,1,0,0,0,0
2,2,0,0,0,0,1,0,0
3,3,0,1,0,0,0,0,0
4,4,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
38609,38609,0,0,0,0,0,1,0
38610,38610,0,1,0,0,0,0,0
38611,38611,0,0,0,0,0,0,1
38612,38612,0,0,0,0,0,0,1


In [6]:
# rename 0-6 index to days of week
day_dummies = day_dummies.drop(columns='index')
day_dummies.columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_dummies

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...
38609,0,0,0,0,0,1,0
38610,0,1,0,0,0,0,0
38611,0,0,0,0,0,0,1
38612,0,0,0,0,0,0,1


In [7]:
# concat the encoded days of week to end of the ML_df
ML_df = ML_df.reset_index()
ML_df = pd.concat([ML_df, day_dummies], axis=1, join='inner')
ML_df

Unnamed: 0,index,Order Date,Ship Date,Ship Mode,Segment,City,State,Country,Region,Market,...,Days to Ship,Orders per Country,Order Day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,0,2014-11-11,2014-11-13,First Class,Consumer,Oklahoma City,Oklahoma,United States,Central US,USCA,...,2,9994,1,0,1,0,0,0,0,0
1,1,2014-02-05,2014-02-07,Second Class,Corporate,Wollongong,New South Wales,Australia,Oceania,Asia Pacific,...,2,2837,2,0,0,1,0,0,0,0
2,2,2014-10-17,2014-10-18,First Class,Consumer,Brisbane,Queensland,Australia,Oceania,Asia Pacific,...,1,2837,4,0,0,0,0,1,0,0
3,3,2014-01-28,2014-01-30,First Class,Home Office,Berlin,Berlin,Germany,Western Europe,Europe,...,2,2063,1,0,1,0,0,0,0,0
4,4,2014-06-28,2014-07-01,Second Class,Corporate,Sydney,New South Wales,Australia,Oceania,Asia Pacific,...,3,2837,5,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38609,38609,2013-12-28,2013-12-31,First Class,Home Office,Managua,Managua,Nicaragua,Central America,LATAM,...,3,614,5,0,0,0,0,0,1,0
38610,38610,2014-05-06,2014-05-10,Standard Class,Consumer,Chattanooga,Tennessee,United States,Southern US,USCA,...,4,9994,1,0,1,0,0,0,0,0
38611,38611,2012-08-26,2012-08-31,Second Class,Consumer,San Francisco,California,United States,Western US,USCA,...,5,9994,6,0,0,0,0,0,0,1
38612,38612,2013-02-17,2013-02-21,Standard Class,Home Office,Valinhos,S�o Paulo,Brazil,South America,LATAM,...,4,1593,6,0,0,0,0,0,0,1


In [8]:
ML_df.columns

Index(['index', 'Order Date', 'Ship Date', 'Ship Mode', 'Segment', 'City',
       'State', 'Country', 'Region', 'Market', 'Category', 'Sub-Category',
       'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost',
       'Order Priority', 'Returned', 'Profit Margin Percentage',
       'Returned $ Amount', 'Days to Ship', 'Orders per Country', 'Order Day',
       'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'],
      dtype='object')

### Cleaning Up Columns

In [9]:
# pull year & month from the Order Date
ML_df['Order Year'] = ML_df['Order Date'].dt.year
ML_df['Order Month'] = ML_df['Order Date'].dt.month

# drop columns
# Order Day - encoded to separate columns
# Order Date - pulled to year & month
# Ship Date - redundant; order info & days to ship
# City & State & Country - too many unqiue values
# Market - redunant with Region (which is more specific)
ML_df = ML_df.drop(columns=['Order Day', 'Order Date', 'Ship Date', 'City', 'State', 'Country','Market', 'index'])
ML_df.head()

Unnamed: 0,Ship Mode,Segment,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit,Shipping Cost,...,Orders per Country,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Order Year,Order Month
0,First Class,Consumer,Central US,Technology,Phones,221.98,2,0.0,62.15,40.77,...,9994,0,1,0,0,0,0,0,2014,11
1,Second Class,Corporate,Oceania,Furniture,Chairs,3709.4,9,0.1,-288.77,923.63,...,2837,0,0,1,0,0,0,0,2014,2
2,First Class,Consumer,Oceania,Technology,Phones,5175.17,9,0.1,919.97,915.49,...,2837,0,0,0,0,1,0,0,2014,10
3,First Class,Home Office,Western Europe,Technology,Phones,2892.51,5,0.1,-96.54,910.16,...,2063,0,1,0,0,0,0,0,2014,1
4,Second Class,Corporate,Oceania,Technology,Phones,2862.68,5,0.1,763.28,897.35,...,2837,0,0,0,0,0,1,0,2014,6


In [10]:
# encode Ship Mode, Segment, Region, Sub/Category, Order Priority
ML_df = pd.get_dummies(ML_df, columns=['Ship Mode', 'Segment', 'Region', 'Category', 'Sub-Category', 'Order Priority'])
ML_df.head()

Unnamed: 0,Sales,Quantity,Discount,Profit,Shipping Cost,Returned,Profit Margin Percentage,Returned $ Amount,Days to Ship,Orders per Country,...,Sub-Category_Machines,Sub-Category_Paper,Sub-Category_Phones,Sub-Category_Storage,Sub-Category_Supplies,Sub-Category_Tables,Order Priority_Critical,Order Priority_High,Order Priority_Low,Order Priority_Medium
0,221.98,2,0.0,62.15,40.77,0.0,27.998018,0.0,2,9994,...,0,0,1,0,0,0,0,1,0,0
1,3709.4,9,0.1,-288.77,923.63,0.0,-7.784817,0.0,2,2837,...,0,0,0,0,0,0,1,0,0,0
2,5175.17,9,0.1,919.97,915.49,0.0,17.776614,0.0,1,2837,...,0,0,1,0,0,0,0,0,0,1
3,2892.51,5,0.1,-96.54,910.16,0.0,-3.337586,0.0,2,2063,...,0,0,1,0,0,0,0,0,0,1
4,2862.68,5,0.1,763.28,897.35,0.0,26.663127,0.0,3,2837,...,0,0,1,0,0,0,1,0,0,0


In [11]:
# save the ML prepped ML_df to a csv
ML_df.to_csv('../resources/superstore_topcountries_ML_prepped.csv')

# Linear Regression

### Sales

In [12]:
# set target (sales) and features
y = ML_df['Sales']
X = ML_df.drop(columns='Sales')

# get columns to create coefficient df
sales_columns = X.columns
sales_columns = pd.Series(sales_columns)

In [13]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(28960, 65)

In [14]:
# create a LinearRegression instance
model = LinearRegression()

In [15]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [16]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [17]:
# examine the coefficients and the intercept - create series for both
sales_coeffs = model.coef_
sales_coeffs = pd.Series(sales_coeffs)

sales_intercept = model.intercept_

In [18]:
# create df with variables and its coefficient
data = {"Variables" : sales_columns,
       "Coefficients" : sales_coeffs}
sales_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
sales_coeff_df

Unnamed: 0,Variables,Coefficients
0,Quantity,26.683742
1,Discount,-90.011842
2,Profit,0.862344
3,Shipping Cost,4.812867
4,Returned,-75.542698
5,Profit Margin Percentage,-1.162526
6,Returned $ Amount,0.282491
7,Days to Ship,-1.462338
8,Orders per Country,0.002888
9,Monday,-6.098812


In [19]:
print(f' The intercept for this model is {sales_intercept}')

 The intercept for this model is 2246.925019531091


In [20]:
# calculate the r-squared
r2_lin_sales = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')

The r2 score for the linear regression model for sales is 0.5924407645396963


### Profit

In [21]:
# set target (profit) and features
y = ML_df['Profit']
X = ML_df.drop(columns='Profit')

# get columns to create coefficient df
profit_columns = X.columns
profit_columns = pd.Series(profit_columns)

In [22]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(28960, 65)

In [23]:
# create a LinearRegression instance
model = LinearRegression()

In [24]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [25]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [26]:
# examine the coefficients and the intercept - create series for both
profit_coeffs = model.coef_
profit_coeffs = pd.Series(profit_coeffs)

profit_intercept = model.intercept_

In [27]:
# create df with variables and its coefficient
data = {"Variables" : profit_columns,
       "Coefficients" : profit_coeffs}
profit_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
profit_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,0.220769
1,Quantity,-5.648641
2,Discount,27.673175
3,Shipping Cost,-0.201359
4,Returned,-14.684709
5,Profit Margin Percentage,1.478203
6,Returned $ Amount,0.07249
7,Days to Ship,0.028648
8,Orders per Country,-0.002139
9,Monday,2.154314


In [28]:
print(f' The intercept for this model is {profit_intercept}')

 The intercept for this model is -633.2909506756657


In [29]:
# calculate the r-squared
r2_lin_profit = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')

The r2 score for the linear regression model for profit is 0.2677333770611521


### Shipping Cost

In [30]:
# set target (shipping cost) and features
y = ML_df['Shipping Cost']
X = ML_df.drop(columns='Shipping Cost')

# get columns to create coefficient df
shipping_columns = X.columns
shipping_columns = pd.Series(shipping_columns)

In [31]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
X_train.shape

(28960, 65)

In [32]:
# create a LinearRegression instance
model = LinearRegression()

In [33]:
# fit the model to the train set
model.fit(X_train, y_train)

LinearRegression()

In [34]:
# have the model predict the y values
y_pred = model.predict(X_test)

In [35]:
# examine the coefficients and the intercept - create series for both
shipping_coeffs = model.coef_
shipping_coeffs = pd.Series(shipping_coeffs)

shipping_intercept = model.intercept_

In [36]:
# create df with variables and its coefficient
data = {"Variables" : shipping_columns,
       "Coefficients" : shipping_coeffs}
shipping_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
shipping_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,0.088741
1,Quantity,0.991345
2,Discount,-1.118024
3,Profit,-0.014502
4,Returned,2.924273
5,Profit Margin Percentage,0.034504
6,Returned $ Amount,-0.010351
7,Days to Ship,0.02848
8,Orders per Country,-0.000276
9,Monday,0.00626


In [37]:
print(f' The intercept for this model is {shipping_intercept}')

 The intercept for this model is -225.09026361775778


In [38]:
# calculate the r-squared
r2_lin_shipping = r2_score(y_test, y_pred)
print(f'The r2 score for the linear regression model for shipping costs is {r2_lin_shipping}')

The r2 score for the linear regression model for shipping costs is 0.5390339577721024


# Logistic Regression

### Returned?

In [39]:
# set target (Returned) and features
y = ML_df['Returned']
X = ML_df.drop(columns=['Returned', 'Returned $ Amount'])

# get columns to create coefficient df
returned_columns = X.columns
returned_columns = pd.Series(returned_columns)

In [40]:
# split into train & test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
                                                    #stratify=y)
returned_split = Counter(y_train)
returned_split

Counter({0.0: 27635, 1.0: 1325})

In [41]:
# create the log regression model 
model = LogisticRegression()

In [42]:
# Train the data
model.fit(X_train, y_train)

LogisticRegression()

In [43]:
# Predict outcomes for test data set
y_pred = model.predict(X_test)
pred_actual = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [44]:
# examine the coefficients and the intercept - create series for both
returned_coeffs = model.coef_
returned_coeffs = pd.Series(returned_coeffs.flatten())

returned_intercept = model.intercept_

In [45]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_coeffs}
returned_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000144
1,Quantity,0.00183
2,Discount,0.001516
3,Profit,0.000237
4,Shipping Cost,0.000447
5,Profit Margin Percentage,0.000409
6,Days to Ship,0.00826
7,Orders per Country,3e-06
8,Monday,-0.000285
9,Tuesday,-0.001119


In [46]:
print(f' The intercept for this model is {returned_intercept}')

 The intercept for this model is [1.91091416e-06]


In [47]:
confusion_matrix(y_test, y_pred)

array([[9241,    0],
       [ 413,    0]], dtype=int64)

In [48]:
acc_score = accuracy_score(y_test, y_pred)
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')

The accuracy score for the un-altered logistic regression is 0.9572198052620675


### Random Over Sampling

In [49]:
# randomly oversample the minority (returned) class
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_resample(X_train, y_train)
Counter(y_ros)

Counter({0.0: 27635, 1.0: 27635})

In [50]:
# initiate new model & fit to the oversampled train data
model= LogisticRegression()
model.fit(X_ros, y_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [51]:
# examine the coefficients and the intercept - create series for both
returned_ros_coeffs = model.coef_
returned_ros_coeffs = pd.Series(returned_ros_coeffs.flatten())

returned_ros_intercept = model.intercept_

In [52]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_ros_coeffs}
returned_ros_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_ros_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000136
1,Quantity,0.003981
2,Discount,0.001264
3,Profit,0.000253
4,Shipping Cost,0.000384
5,Profit Margin Percentage,0.000235
6,Days to Ship,0.007038
7,Orders per Country,4e-06
8,Monday,-0.000334
9,Tuesday,-0.001842


In [53]:
print(f' The intercept for this model is {returned_ros_intercept}')

 The intercept for this model is [2.90555822e-06]


In [54]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4480, 4761],
       [ 186,  227]], dtype=int64)

In [55]:
# balanced accuracy score
ros_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly over sampled logistic regression is {ros_acc_score}')

The balanced accuracy score for the randomly over sampled logistic regression is 0.5172164108105446


### Random Under Sampling

In [56]:
# random under sample the train data sets
rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_resample(X_train, y_train)
Counter(y_rus)

Counter({0.0: 1325, 1.0: 1325})

In [57]:
# initiate new model & fit to the randomly oversampled train data
model = LogisticRegression()
model.fit(X_rus, y_rus)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [58]:
# examine the coefficients and the intercept - create series for both
returned_rus_coeffs = model.coef_
returned_rus_coeffs = pd.Series(returned_rus_coeffs.flatten())

returned_rus_intercept = model.intercept_

In [59]:
# create df with variables and its coefficient
data = {"Variables" : returned_columns,
       "Coefficients" : returned_rus_coeffs}
returned_rus_coeff_df = pd.concat(data, axis=1)

# force python to show max rows
pd.set_option('display.max_rows', None)
returned_rus_coeff_df

Unnamed: 0,Variables,Coefficients
0,Sales,-0.000165
1,Quantity,0.021468
2,Discount,0.025763
3,Profit,2.1e-05
4,Shipping Cost,0.001099
5,Profit Margin Percentage,0.000127
6,Days to Ship,0.042406
7,Orders per Country,3e-06
8,Monday,-0.013113
9,Tuesday,-0.014344


In [60]:
print(f' The intercept for this model is {returned_rus_intercept}')

 The intercept for this model is [4.81984633e-05]


In [61]:
# create predictions and confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4596, 4645],
       [ 203,  210]], dtype=int64)

In [62]:
# balanced accuracy score
rus_acc_score = balanced_accuracy_score(y_test, y_pred)
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The balanced accuracy score for the randomly under sampled logistic regression is 0.5029116740245663


# Results - Top Countries|

In [63]:
print(f'The r2 score for the linear regression model for sales is {r2_lin_sales}')
print(f'The r2 score for the linear regression model for profit is {r2_lin_profit}')
print(f'The r2 score for the linear regression model for shipping costs is {r2_lin_shipping}\n')
print(f'The accuracy score for the un-altered logistic regression is {acc_score}')
print(f'The split between not returned (0) and returned (1) for the un-altered regression is {returned_split}\n')
print(f'The balanced accuracy score for the randomly over sampled logistic regression is {ros_acc_score}')
print(f'The balanced accuracy score for the randomly under sampled logistic regression is {rus_acc_score}')

The r2 score for the linear regression model for sales is 0.5924407645396963
The r2 score for the linear regression model for profit is 0.2677333770611521
The r2 score for the linear regression model for shipping costs is 0.5390339577721024

The accuracy score for the un-altered logistic regression is 0.9572198052620675
The split between not returned (0) and returned (1) for the un-altered regression is Counter({0.0: 27635, 1.0: 1325})

The balanced accuracy score for the randomly over sampled logistic regression is 0.5172164108105446
The balanced accuracy score for the randomly under sampled logistic regression is 0.5029116740245663
