In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp
import statsmodels.api as sm

from statsmodels.sandbox.regression.gmm import IV2SLS 
# There is a package named IV2SLS in Python. Do not use this package! The exogenous explanatory variables must
# be entered as instruments. So it gives wrong answers
from statsmodels.sandbox.regression.gmm import GMM

In [2]:
input_table = pd.read_csv('small_retailers_stock_performance.csv')
input_table.head()

Unnamed: 0,Constant,Stock Change,Inventory Turnover,Operating Profit,Interaction Effect,Current Ratio,Quick Ratio,Debt Asset Ratio
0,1,0.870332,1.795946,0.115846,0.208053,1.672527,0.255171,0.473317
1,1,-0.047347,1.395501,0.436967,0.609788,1.637261,0.221763,0.489967
2,1,0.001176,1.664563,0.541016,0.900555,1.640619,0.189141,0.374269
3,1,-0.9012,1.605738,0.539399,0.866133,1.436221,0.131944,0.224399
4,1,-0.176353,1.591451,0.539938,0.859285,1.43314,0.183095,0.213446


In [3]:
model_iv = sm.OLS(input_table["Inventory Turnover"],input_table[["Constant","Current Ratio","Quick Ratio",\
                                                                 "Debt Asset Ratio"]]).fit()
endog_predict = model_iv.predict(input_table[["Constant","Current Ratio","Quick Ratio","Debt Asset Ratio"]])
input_table["Endogenous Param"] = endog_predict

In [4]:
model_2sls = sm.OLS(input_table["Stock Change"], input_table[["Constant","Endogenous Param",\
                                                              "Operating Profit","Interaction Effect",\
                                                             ]]).fit()
model_2sls.summary()

0,1,2,3
Dep. Variable:,Stock Change,R-squared:,0.015
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,8.53
Date:,"Fri, 10 Nov 2023",Prob (F-statistic):,1.27e-05
Time:,17:05:30,Log-Likelihood:,-1186.5
No. Observations:,1696,AIC:,2381.0
Df Residuals:,1692,BIC:,2403.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Constant,-0.0176,0.020,-0.896,0.370,-0.056,0.021
Endogenous Param,0.0011,0.001,1.827,0.068,-7.76e-05,0.002
Operating Profit,-0.1201,0.028,-4.319,0.000,-0.175,-0.066
Interaction Effect,0.0014,0.000,3.621,0.000,0.001,0.002

0,1,2,3
Omnibus:,368.832,Durbin-Watson:,2.243
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3433.92
Skew:,0.742,Prob(JB):,0.0
Kurtosis:,9.811,Cond. No.,109.0


In [5]:
y_vals  = np.array(input_table["Stock Change"])
x_vals  = np.array(input_table[["Inventory Turnover","Operating Profit","Interaction Effect"]])
iv_vals = np.array(input_table[["Current Ratio","Quick Ratio","Debt Asset Ratio"]])

class delta_gmm(GMM):
    def momcond(self, params):
        p0, p1, p2, p3, delta = params
        endog = self.endog
        exog = self.exog
        inst = self.instrument   

        error0 = endog - p0 - p1 * exog[:,0] - p2 * exog[:,1] - p3 * exog[:,2] - delta * inst[:,0]        
        error1 = (endog - p0 - p1 * exog[:,0] - p2 * exog[:,1] - p3 * exog[:,2]) * exog[:,1]
        error2 = (endog - p0 - p1 * exog[:,0] - p2 * exog[:,1] - p3 * exog[:,2]) * exog[:,2]
        error3 = (endog - p0 - p1 * exog[:,0] - p2 * exog[:,1] - p3 * exog[:,2]) * inst[:,0] 
        error4 = (endog - p0 - p1 * exog[:,0] - p2 * exog[:,1] - p3 * exog[:,2]) * inst[:,1] 
        error5 = (endog - p0 - p1 * exog[:,0] - p2 * exog[:,1] - p3 * exog[:,2]) * inst[:,2] 

        g = np.column_stack((error0, error1, error2, error3, error4, error5))
        return g


delta_beta = np.array([0.1, 0.1, 0.1, 0.1, 0.1])
res = delta_gmm(endog = y_vals, exog = x_vals, instrument = iv_vals, k_moms=6, k_params=4).fit(delta_beta)

res.summary()


Optimization terminated successfully.
         Current function value: 0.000005
         Iterations: 9
         Function evaluations: 14
         Gradient evaluations: 14
Optimization terminated successfully.
         Current function value: 0.000202
         Iterations: 8
         Function evaluations: 13
         Gradient evaluations: 13
Optimization terminated successfully.
         Current function value: 0.000202
         Iterations: 6
         Function evaluations: 10
         Gradient evaluations: 10
Optimization terminated successfully.
         Current function value: 0.000202
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5


0,1,2,3
Dep. Variable:,y,Hansen J:,0.3424
Model:,delta_gmm,Prob (Hansen J):,0.558
Method:,GMM,,
Date:,"Fri, 10 Nov 2023",,
Time:,17:05:30,,
No. Observations:,1696,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
p 0,-0.0201,0.021,-0.965,0.334,-0.061,0.021
p 1,0.0009,0.001,1.259,0.208,-0.000,0.002
p 2,-0.1051,0.032,-3.284,0.001,-0.168,-0.042
p 3,0.0012,0.000,2.801,0.005,0.000,0.002
p 4,0.0024,0.004,0.539,0.590,-0.006,0.011


The coefficients for "Current Ratio" and the constant term are not statistically significant.
The coefficients for "Quick Ratio" and "Debt Asset Ratio" are statistically significant, suggesting that changes in these ratios are associated with changes in the "Stock Change."
The "Interaction Effect" coefficient is not statistically significant at the 0.05 significance level.

## PART TWO

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report



In [14]:
df = pd.read_csv('midterm_parttwo.csv')
df.head()



Unnamed: 0,Years of Education after High School,Requested Credit Amount,Number of Dependents,Monthly Income,Monthly Expense,Marital Status,Credit Rating
0,1,Low,No dependent,Very low,Very low,Married,Positive
1,2,Low,No dependent,Very low,Very low,Single,Positive
2,1,Low,No dependent,Very low,Very low,Single,Positive
3,3,Low,No dependent,Very low,Very low,Married,Positive
4,3,Low,No dependent,Very low,Very low,Single,Negative


In [15]:
# Define features (X) and target variable (y)
X = df[['Requested Credit Amount', 'Number of Dependents', 'Marital Status', 
        'Monthly Income', 'Monthly Expense', 'Years of Education after High School']]
y = df['Credit Rating']

y = y.map({'Negative': 0, 'Positive': 1})

# Define columns for one-hot encoding and standard scaling
categorical_columns = ['Requested Credit Amount', 'Number of Dependents', 'Marital Status',
                        'Monthly Income', 'Monthly Expense']
numerical_columns = ['Years of Education after High School']

# Create the preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_columns),
        ('scaler', StandardScaler(), numerical_columns)
    ],
    remainder='passthrough'  # Pass through any columns not specified
)

# Create a pipeline with the preprocessor and the logistic regression model
log_reg = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42))
])

# Split the dataset into training and test sets (50% each)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Fit the pipeline on the training set
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
class_rep = classification_report(y_test, y_pred, zero_division=1)

print(conf_matrix)
print(class_rep)



[[   0  577]
 [   0 3464]]
              precision    recall  f1-score   support

           0       1.00      0.00      0.00       577
           1       0.86      1.00      0.92      3464

    accuracy                           0.86      4041
   macro avg       0.93      0.50      0.46      4041
weighted avg       0.88      0.86      0.79      4041



In [16]:
y_proba = log_reg.predict_proba(X_test)[:, 1]
desired_approval_rate = 0.15
threshold = np.percentile(y_proba, 100 * (1 - desired_approval_rate))

# Update predictions based on the new threshold
y_pred_adjusted = (y_proba > threshold).astype(int)

# Evaluate the model with the adjusted threshold
conf_matrix_adjusted = confusion_matrix(y_test, y_pred_adjusted)
class_rep_adjusted = classification_report(y_test, y_pred_adjusted, zero_division=1)

# Display the updated results
print(f"Threshold for {desired_approval_rate*100}% approval rate: {threshold:.4f}")
print("\nConfusion Matrix (Adjusted):")
print(conf_matrix_adjusted)
print("\nClassification Report (Adjusted):")
print(class_rep_adjusted)

Threshold for 15.0% approval rate: 0.8875

Confusion Matrix (Adjusted):
[[ 496   81]
 [2939  525]]

Classification Report (Adjusted):
              precision    recall  f1-score   support

           0       0.14      0.86      0.25       577
           1       0.87      0.15      0.26      3464

    accuracy                           0.25      4041
   macro avg       0.51      0.51      0.25      4041
weighted avg       0.76      0.25      0.26      4041



In [12]:
y_test

7587   -1
2069   -1
7621    1
2063    1
7733    1
       ..
7727    1
2121   -1
5108    1
136     1
6338    1
Name: Credit Rating, Length: 4041, dtype: int64