In [60]:
# Run in Python 3.7 kernel
# Import libraries

import pandas as pd
import numpy as np # creating and manipulating arrays
import matplotlib.pyplot as plt # visualizing data
import sklearn # regression models
import statsmodels.api as sm 

1. Develop Linear Regression model using Statmodels

In [61]:
data = pd.read_csv("garments_worker_productivity.csv", header = 0) # read in data
data = pd.get_dummies(data, columns=["quarter","department","day","team"], drop_first=True) # create dummy variables for categorical data
data = data.dropna() # drop rows with NaN values
data = data.drop('date', axis=1) # drop dates
data_ols = sm.add_constant(data)
model = sm.OLS(endog= data_ols.actual_productivity, exog = data_ols.loc[:, data_ols.columns!='actual_productivity']) # use actual productivity as the dependent variable, and everything else as independent
results = model.fit()
print(results.summary())

# Compile Significant Variables
sigparams = results.params[results.pvalues < 0.05] # Get significant variables
sigparams = sigparams.to_frame() # Create a dataframe of them
sigparams.columns = ['coefficient'] # Name the columns
sigparams = sigparams['coefficient'].astype(float) # Convert to float so that you can sort it by absolute value
sigparams = sigparams.sort_values(key=abs, ascending=False) # Order by absolute value

data_new = np.array([[0],[1],[2],[3],[4],[5],[6]]) #you'll use this later

                             OLS Regression Results                            
Dep. Variable:     actual_productivity   R-squared:                       0.813
Model:                             OLS   Adj. R-squared:                  0.805
Method:                  Least Squares   F-statistic:                     99.37
Date:                 Tue, 13 Sep 2022   Prob (F-statistic):          3.90e-219
Time:                         23:36:44   Log-Likelihood:                 889.27
No. Observations:                  691   AIC:                            -1719.
Df Residuals:                      661   BIC:                            -1582.
Df Model:                           29                                         
Covariance Type:             nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
targeted_productivit

2. Select the best features using forward selection

In [62]:
# Show significant parameters
print(sigparams)

targeted_productivity    0.621713
department_sweing        0.145683
team_11                 -0.043256
idle_men                -0.006253
smv                     -0.003984
incentive                0.003253
no_of_workers            0.002039
idle_time                0.000614
over_time               -0.000003
Name: coefficient, dtype: float64


Test Models

In [63]:
il = [] #create blank list to assign features to
for x in range(0, (sigparams.count())):
    if x == 0:
        model = sm.OLS(endog= data_ols.actual_productivity, exog = data_ols.loc[:, data_ols.columns==sigparams.index[0]])
        results = model.fit()
        print(il)
        print(results.rsquared_adj)
        il.append(sigparams.index[0])
    else:
        il.append(sigparams.index[x])
        model = sm.OLS(endog= data_ols.actual_productivity, exog = data_ols[il])
        results = model.fit()
        print(il)
        print(results.rsquared_adj)




[]
0.9773934809944065
['targeted_productivity', 'department_sweing']
0.4864174278778667
['targeted_productivity', 'department_sweing', 'team_11']
0.4859171427117741
['targeted_productivity', 'department_sweing', 'team_11', 'idle_men']
0.5326273408778037
['targeted_productivity', 'department_sweing', 'team_11', 'idle_men', 'smv']
0.5530013010538016
['targeted_productivity', 'department_sweing', 'team_11', 'idle_men', 'smv', 'incentive']
0.792823221566805
['targeted_productivity', 'department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'no_of_workers']
0.7986647279602157
['targeted_productivity', 'department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'no_of_workers', 'idle_time']
0.8011087556246014
['targeted_productivity', 'department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'no_of_workers', 'idle_time', 'over_time']
0.8048791450397842


The model above with only targeted productivity in the model has an adjusted r-squared of 97.7%

However, since it is a target, not a metric, it isn't useful for predicting productivity, so we will remove it from the model.

In [64]:
sigparams = sigparams.drop('targeted_productivity')
print(sigparams)

department_sweing    0.145683
team_11             -0.043256
idle_men            -0.006253
smv                 -0.003984
incentive            0.003253
no_of_workers        0.002039
idle_time            0.000614
over_time           -0.000003
Name: coefficient, dtype: float64


Now we will re-run the analysis

In [65]:
il = []
for x in range(0, (sigparams.count())):
    if x == 0:
        model = sm.OLS(endog= data_ols.actual_productivity, exog = data_ols.loc[:, data_ols.columns==sigparams.index[0]])
        results = model.fit()
        print(il)
        print(results.rsquared_adj)
        il.append(sigparams.index[0])
    else:
        il.append(sigparams.index[x])
        model = sm.OLS(endog= data_ols.actual_productivity, exog = data_ols[il])
        results = model.fit()
        print(il)
        print(results.rsquared_adj)

[]
0.0
['department_sweing', 'team_11']
0.006763257386616206
['department_sweing', 'team_11', 'idle_men']
0.0733043559579889
['department_sweing', 'team_11', 'idle_men', 'smv']
0.10661507975692541
['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive']
0.6693164460449099
['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'no_of_workers']
0.6691637981027212
['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'no_of_workers', 'idle_time']
0.6699514156001276
['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'no_of_workers', 'idle_time', 'over_time']
0.6812456091514572


Every addition above increased the adjusted r squared value, except for no_of_workers.

Best features:
'department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'idle_time', 'over_time'

3. Compare the performance of OLS and SGD implementation of linear model


I definitely messed something up here... tried adapting the model you gave us, but I think I changed something somewhere and can't find it again

In [68]:
#final OLS model
model = sm.OLS(endog= data_ols.actual_productivity, exog = data_ols[['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'idle_time', 'over_time']])
results = model.fit()
print(results.summary())

                             OLS Regression Results                            
Dep. Variable:     actual_productivity   R-squared:                       0.682
Model:                             OLS   Adj. R-squared:                  0.680
Method:                  Least Squares   F-statistic:                     245.1
Date:                 Tue, 13 Sep 2022   Prob (F-statistic):          1.09e-166
Time:                         23:37:40   Log-Likelihood:                 705.61
No. Observations:                  691   AIC:                            -1397.
Df Residuals:                      684   BIC:                            -1365.
Df Model:                            6                                         
Covariance Type:             nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
department_sweing     0.6141

In [67]:
data_new_ols = sm.add_constant(data_new)
print(data_new_ols)
ols_output = results.get_prediction(data_new_ols).summary_frame(alpha=0.05)  

ols_output_list = ols_output['mean'].tolist()
ols_output_list

[[1. 0.]
 [1. 1.]
 [1. 2.]
 [1. 3.]
 [1. 4.]
 [1. 5.]
 [1. 6.]]


ValueError: shapes (7,2) and (7,) not aligned: 2 (dim 1) != 7 (dim 0)

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
reg = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
reg.fit(data_ols[['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'idle_time', 'over_time']], data_ols.actual_productivity)

y_pred = reg.predict(data_ols[['department_sweing', 'team_11', 'idle_men', 'smv', 'incentive', 'idle_time', 'over_time']])
y_pred

array([0.95480834, 0.74270098, 0.79226308, 0.77496485, 0.69514231,
       0.72061068, 0.69309807, 0.72061068, 0.71624055, 0.73948359,
       0.79226308, 0.53049124, 0.74588397, 0.74451493, 0.74504187,
       0.80185362, 0.72281631, 0.77237841, 0.74758392, 0.69514231,
       0.78755507, 0.73774694, 0.49284246, 0.69876001, 0.77297736,
       0.78874321, 0.78957396, 0.72539099, 0.75752087, 0.80185362,
       0.7438304 , 0.77297736, 0.68992149, 0.73774694, 0.69876001,
       0.60418777, 0.85429524, 0.74724166, 0.69480005, 0.80185362,
       0.72135549, 0.77169388, 0.77169388, 0.74246136, 0.74246136,
       0.71992616, 0.73774694, 0.63558613, 0.8341018 , 0.8341018 ,
       0.72278131, 0.72278131, 0.74077322, 0.78247512, 0.67546226,
       0.67546226, 0.73482517, 0.69990385, 0.67041258, 0.66817658,
       0.8341018 , 0.8341018 , 0.77160523, 0.77959307, 0.72278131,
       0.7818241 , 0.74077322, 0.78042155, 0.72790388, 0.80478709,
       0.69413439, 0.62503933, 0.89091355, 0.8341018 , 0.83100

In [None]:
plt.plot(data_new, ols_output_list, "r-", linewidth=2, label="Statmodels Predictions")
plt.plot(data_new, sk_output, "g-", linestyle='dashed',  linewidth=2, label="Sklearn Predictions")
plt.plot(data_new, y_pred, "black", linestyle='dashdot',  linewidth=4, label="SGDRegressor Predictions")


plt.plot(data, output, "b.")
plt.legend(loc="upper left", fontsize=14)
plt.show()

NameError: name 'data_new' is not defined

4.