## MBAN 5110 Midterm Project 
By: Reva Sandhir 215686405

## Part 1:

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import statsmodels.api as sm
from scipy.optimize import minimize
from scipy.stats import chi2

from statsmodels.sandbox.regression.gmm import IV2SLS
from statsmodels.sandbox.regression.gmm import GMM
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder

In [2]:
# NOTE(review): this install cell sits after the import cell, so on a fresh
# environment the imports above would fail before this line is reached. It also
# lists only numpy/pandas/statsmodels, although scikit-learn, scipy and matplotlib
# are imported above. Consider moving it to the top and pinning versions.
%pip install numpy pandas statsmodels

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Part 1 input: read the CSV published in the GitHub repository into `df`.
part_one_csv_url = 'https://raw.githubusercontent.com/revasandhir/schulich_data_science/refs/heads/main/midterm_partone.csv'
df = pd.read_csv(part_one_csv_url)


In [4]:
# Instrumental-variable step: regress "Inventory Turnover" on the constant plus the
# three ratio columns with OLS, then store the fitted values back on the frame as
# "Endogenous Param" for use as a regressor later on.
first_stage_columns = ["Constant", "Current Ratio", "Quick Ratio", "Debt Asset Ratio"]
first_stage_design = df[first_stage_columns]

model_iv = sm.OLS(df["Inventory Turnover"], first_stage_design).fit()
endog_predict = model_iv.predict(first_stage_design)
df["Endogenous Param"] = endog_predict

In [5]:
# First-stage (2SLS) diagnostics.
# The first-stage OLS of "Inventory Turnover" on the constant and the three ratio
# columns is already fitted by the previous cell (`model_iv`), which also stores
# its fitted values in df["Endogenous Param"]. This cell used to refit the
# identical model and overwrite the same column with the same values, which was
# redundant. It now reports the existing fit instead, so the first-stage
# coefficients and F-statistic can be inspected before the GMM step.
print(model_iv.summary())

In [6]:
# Define the GMM Model with δ in the Moment Conditions
class CustomGMM(GMM):
    """GMM estimator whose residual includes an extra ``delta`` interaction term.

    The model residual is

        endog - (p0 + p1*x0 + p2*x1 + p3*x2 + delta*x0*x1)

    where ``x0``, ``x1``, ``x2`` are the first three columns of ``exog``.
    Six moment conditions are formed from that residual for five parameters.
    """

    def momcond(self, params):
        """Return the per-observation moment conditions.

        Parameters
        ----------
        params : sequence of 5 floats
            ``(p0, p1, p2, p3, delta)``: intercept, one slope per ``exog``
            column, and the coefficient on ``exog[:, 0] * exog[:, 1]``.
            Unpacking fails with ``ValueError`` for any other length.

        Returns
        -------
        numpy.ndarray
            One column per moment, in this order: the residual itself, the
            residual times ``exog[:, 1]`` and ``exog[:, 2]``, then the residual
            times instrument columns 0, 1 and 2. ``exog[:, 0]`` is not used as
            its own instrument.
        """
        # NOTE(review): the notebook passes an "Interaction Effect" column as
        # exog[:, 2]; if that column is itself exog[:, 0] * exog[:, 1], the p3
        # and delta terms would be collinear. Confirm against the data.
        intercept, b_x0, b_x1, b_x2, delta = params

        y = self.endog
        x = self.exog
        z = self.instrument
        x0, x1, x2 = x[:, 0], x[:, 1], x[:, 2]

        # Same left-to-right evaluation order as the written-out formula.
        fitted = intercept + b_x0 * x0 + b_x1 * x1 + b_x2 * x2 + delta * x0 * x1
        resid = y - fitted

        # Residual alone, then residual weighted by each allowed column.
        moment_columns = [resid]
        moment_columns.extend([resid * x[:, j] for j in (1, 2)])
        moment_columns.extend([resid * z[:, j] for j in (0, 1, 2)])
        return np.column_stack(moment_columns)

In [7]:
# Pull the outcome, regressor and instrument columns out of the frame as NumPy
# arrays, in the form the GMM estimator is constructed with.
outcome_column = "Stock Change"
regressor_columns = ["Endogenous Param", "Operating Profit", "Interaction Effect"]
instrument_columns = ["Current Ratio", "Quick Ratio", "Debt Asset Ratio"]

y_vals = np.array(df[outcome_column])
x_vals = np.array(df[regressor_columns])
iv_vals = np.array(df[instrument_columns])

In [8]:
# Starting values for the optimizer: 0.1 for the intercept and the three slopes,
# 0.05 for delta (five parameters in total).
beta0 = np.array([0.1] * 4 + [0.05])

# Build the estimator: six moment conditions for five parameters.
n_moments, n_params = 6, 5
gmm_model = CustomGMM(
    endog=y_vals,
    exog=x_vals,
    instrument=iv_vals,
    k_moms=n_moments,
    k_params=n_params,
)
gmm_results = gmm_model.fit(beta0)

# Print the estimation summary (coefficients and the Hansen J statistic).
print(gmm_results.summary())

Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 8
         Function evaluations: 16
         Gradient evaluations: 16
Optimization terminated successfully.
         Current function value: 0.000001
         Iterations: 14
         Function evaluations: 19
         Gradient evaluations: 19
Optimization terminated successfully.
         Current function value: 0.000001
         Iterations: 6
         Function evaluations: 9
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: 0.000001
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1
                              CustomGMM Results                               
Dep. Variable:                      y   Hansen J:                     0.001185
Model:                      CustomGMM   Prob (Hansen J):                 0.973
Method:                           GMM                                         
Dat

## Part 2:

In [9]:
# Part 2 input: read the second CSV. This rebinds `df`, replacing the Part 1 frame.
part_two_csv_url = 'https://raw.githubusercontent.com/revasandhir/schulich_data_science/refs/heads/main/midterm_parttwo.csv'
df = pd.read_csv(part_two_csv_url)

In [10]:
# One-hot encode the categorical columns. drop_first=True drops the first level of
# each encoded column, so a two-level column becomes a single indicator.
df = pd.get_dummies(data=df, drop_first=True)

In [11]:
# Hold out half of the rows for testing; the fixed seed keeps the split repeatable.
target_column = 'Credit Rating_Positive'
y = df[target_column]               # response: the 'Credit Rating_Positive' indicator
X = df.drop(columns=target_column)  # predictors: every remaining column
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

In [12]:
# Train a logistic-regression classifier (liblinear solver) on the training half.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(solver='liblinear').fit(X_train, y_train)
model  # echo the fitted estimator, as the cell's last expression did before

In [13]:
# Score the held-out rows. predict_proba returns one column per class; column 1
# is taken as the probability of the 'Positive' class.
test_class_probs = model.predict_proba(X_test)
y_probs = test_class_probs[:, 1]

In [14]:
# Hard labels at the default cut-off: 1 where the probability is >= 0.5, else 0.
default_threshold = 0.5
meets_default_cutoff = y_probs >= default_threshold
y_pred = meets_default_cutoff.astype(int)

In [15]:
# Compare the 0.5-threshold predictions with the held-out labels.
conf_matrix = confusion_matrix(y_test, y_pred)
recall, precision, f1 = (
    score(y_test, y_pred) for score in (recall_score, precision_score, f1_score)
)

# Report the confusion matrix followed by the three summary metrics.
print("Confusion Matrix (default threshold):")
print(conf_matrix)
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

Confusion Matrix (default threshold):
[[   0  577]
 [   0 3464]]
Recall: 1.0
Precision: 0.8572135609997525
F1 Score: 0.9231179213857429


In [16]:
# Cap approvals at 15% of applications: take the 85th percentile of the predicted
# probabilities as the cut-off, so about 15% of scores lie at or above it.
approved_share_pct = 15
threshold = np.percentile(y_probs, 100 - approved_share_pct)

In [17]:
# Re-label with the stricter cut-off: 1 only where the score is >= `threshold`.
meets_adjusted_cutoff = y_probs >= threshold
y_pred_adjusted = meets_adjusted_cutoff.astype(int)

In [18]:
# Compare the adjusted-threshold predictions with the held-out labels.
conf_matrix_adjusted = confusion_matrix(y_test, y_pred_adjusted)
recall_adjusted, precision_adjusted, f1_adjusted = (
    score(y_test, y_pred_adjusted)
    for score in (recall_score, precision_score, f1_score)
)

# Report the confusion matrix followed by the three summary metrics.
print("\nConfusion Matrix (adjusted threshold):")
print(conf_matrix_adjusted)
print(f"Recall: {recall_adjusted}")
print(f"Precision: {precision_adjusted}")
print(f"F1 Score: {f1_adjusted}")


Confusion Matrix (adjusted threshold):
[[ 494   83]
 [2939  525]]
Recall: 0.15155889145496534
Precision: 0.8634868421052632
F1 Score: 0.25785854616895876
