### Importing the Libraries

In [2]:
import os
from getpass import getpass

import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

### Extracting the data from the Tables

In [5]:
# Connection settings are read from the standard libpq environment variables
# so that no credential is stored in the notebook. Non-secret settings fall
# back to the local development defaults.
dbname = os.environ.get('PGDATABASE', 'loan_management')
user = os.environ.get('PGUSER', 'postgres')
host = os.environ.get('PGHOST', '127.0.0.1')
port = os.environ.get('PGPORT', '5432')
# The password deliberately has no default: prompt for it when PGPASSWORD is
# not set, so the secret never ends up in the saved notebook.
password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

In [7]:
# Establishing the connection.
# URL.create() escapes special characters in the credentials and includes the
# configured port; a hand-built f-string URL did neither (the port setting
# was silently ignored).
engine = create_engine(
    URL.create(
        drivername='postgresql',
        username=user,
        password=password,
        host=host,
        port=int(port),
        database=dbname,
    )
)

In [9]:
# SELECT joining borrower -> loan_to_income -> loan on their id columns.
# SELECT * returns borrower_id and loan_id once per joined table.
query = (
    "SELECT * FROM borrower as b \n"
    "         inner join loan_to_income as lti \n"
    "         on b.borrower_id = lti.borrower_id \n"
    "         inner join loan as l\n"
    "         on lti.loan_id = l.loan_id"
)

In [11]:
# Execute the join through the engine and hold the result set as a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)
print("Data loaded successfully!")

Data loaded successfully!


In [12]:
# Preview the first five rows of the joined result.
df.head(n=5)

Unnamed: 0,borrower_id,age,income,credit_score,months_employed,num_credit_lines,dti_ratio,education,employment_type,marital_status,...,has_cosigner,loan_id,borrower_id.1,loan_amount_to_income,loan_id.1,loan_amount,interest_rate,loan_term,loan_purpose,defaulter
0,BZOPZLF57NR,67,77135,529,59,4,0.41,High School,Full-time,Married,...,0,ZOPZLF57NR,BZOPZLF57NR,2.072198,ZOPZLF57NR,159839,4.14,36,Other,0
1,B648L7QUFVO,57,34830,416,93,3,0.45,Bachelor's,Unemployed,Married,...,0,648L7QUFVO,B648L7QUFVO,1.810853,648L7QUFVO,63072,20.34,60,Home,0
2,BZY51VR44DK,21,142860,318,35,3,0.36,High School,Part-time,Married,...,1,ZY51VR44DK,BZY51VR44DK,1.670993,ZY51VR44DK,238718,13.06,48,Business,0
3,BR8P5YIW07D,19,20236,411,66,4,0.3,Bachelor's,Self-employed,Divorced,...,0,R8P5YIW07D,BR8P5YIW07D,8.158035,R8P5YIW07D,165086,11.49,60,Education,0
4,BMZRL2WMB52,23,17142,802,56,3,0.71,High School,Full-time,Married,...,1,MZRL2WMB52,BMZRL2WMB52,6.444347,MZRL2WMB52,110469,12.23,12,Auto,0


In [13]:
# The query result is already held in `df`, so the engine's connection pool
# can be released here.
engine.dispose()

In [14]:
# (rows, columns) of the joined frame, before any columns are dropped.
df.shape

(255347, 23)

In [19]:
# Dropping the identifier columns, which are not required for the analysis.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `df`, a second execution would otherwise raise KeyError for columns
# that are already gone.
df = df.drop(columns=['borrower_id', 'loan_id'], errors='ignore')

In [21]:
# Preview the first five rows after the identifier columns were removed.
df.head(n=5)

Unnamed: 0,age,income,credit_score,months_employed,num_credit_lines,dti_ratio,education,employment_type,marital_status,income_per_credit_line,has_mortgage,has_dependents,has_cosigner,loan_amount_to_income,loan_amount,interest_rate,loan_term,loan_purpose,defaulter
0,67,77135,529,59,4,0.41,High School,Full-time,Married,19283.75,1,1,0,2.072198,159839,4.14,36,Other,0
1,57,34830,416,93,3,0.45,Bachelor's,Unemployed,Married,11610.0,0,0,0,1.810853,63072,20.34,60,Home,0
2,21,142860,318,35,3,0.36,High School,Part-time,Married,47620.0,0,1,1,1.670993,238718,13.06,48,Business,0
3,19,20236,411,66,4,0.3,Bachelor's,Self-employed,Divorced,5059.0,0,1,0,8.158035,165086,11.49,60,Education,0
4,23,17142,802,56,3,0.71,High School,Full-time,Married,5714.0,1,1,1,6.444347,110469,12.23,12,Auto,0


In [23]:
# (rows, columns) after dropping the identifier columns.
df.shape

(255347, 19)

In [25]:
# Draw a seeded 25,000-row sample that every hypothesis test below runs on.
# NOTE(review): replace=True samples with replacement, so the same row can be
# drawn more than once - confirm this is intended for a subsample.
# NOTE(review): `df` is overwritten, so re-running this cell resamples from
# the previous sample rather than from the full data.
df = df.sample(
    n=25000,
    replace=True,
    random_state=42,
).copy()

### 1. Null Hypothesis - There is no significant difference in Income between the two default groups.

In [28]:
# Split income by default status (defaulter == 1 vs. defaulter == 0).
default_group = df.loc[df['defaulter'].eq(1), 'income']
no_default_group = df.loc[df['defaulter'].eq(0), 'income']

In [30]:
# Independent two-sample t-test on income. equal_var=True is SciPy's default
# (pooled-variance Student's t) and is only spelled out here.
# NOTE(review): consider equal_var=False (Welch) if the group variances differ.
t_stat, p_value = stats.ttest_ind(default_group, no_default_group, equal_var=True)

In [32]:
# Report the statistic and the decision at the 5% significance level.
print("Hypothesis 1: Income and Default Status")
print(f"T-statistic: {t_stat}, P-value: {p_value}")
print(
    "Reject the Null Hypothesis: There is a significant difference in Income between the two groups."
    if p_value < 0.05
    else "Fail to Reject the Null Hypothesis: There is no significant difference in Income between the two groups."
)

Hypothesis 1: Income and Default Status
T-statistic: -17.46104596584434, P-value: 7.1771991878512575e-68
Reject the Null Hypothesis: There is a significant difference in Income between the two groups.


### 2. Null Hypothesis - There is no significant difference in Age between the two default groups.

In [35]:
# Split age by default status (defaulter == 1 vs. defaulter == 0).
default_group = df.loc[df['defaulter'].eq(1), 'age']
no_default_group = df.loc[df['defaulter'].eq(0), 'age']

In [37]:
# Independent two-sample t-test on age. equal_var=True is SciPy's default
# (pooled-variance Student's t) and is only spelled out here.
# NOTE(review): consider equal_var=False (Welch) if the group variances differ.
t_stat, p_value = stats.ttest_ind(default_group, no_default_group, equal_var=True)

In [39]:
# Report the statistic and the decision at the 5% significance level.
print("Hypothesis 2: Age and Default Status")
print(f"T-statistic: {t_stat}, P-value: {p_value}")
print(
    "Reject the Null Hypothesis: There is a significant difference in Age between the two groups."
    if p_value < 0.05
    else "Fail to Reject the Null Hypothesis: There is no significant difference in Age between the two groups."
)

Hypothesis 2: Age and Default Status
T-statistic: -27.667954386200247, P-value: 5.3796668954208473e-166
Reject the Null Hypothesis: There is a significant difference in Age between the two groups.


### 3. Null Hypothesis - There is no significant difference in Credit Score between the two default groups.

In [42]:
# Split credit score by default status (defaulter == 1 vs. defaulter == 0).
default_group = df.loc[df['defaulter'].eq(1), 'credit_score']
no_default_group = df.loc[df['defaulter'].eq(0), 'credit_score']

In [44]:
# Independent two-sample t-test on credit score. equal_var=True is SciPy's
# default (pooled-variance Student's t) and is only spelled out here.
# NOTE(review): consider equal_var=False (Welch) if the group variances differ.
t_stat, p_value = stats.ttest_ind(default_group, no_default_group, equal_var=True)

In [46]:
# Report the statistic and the decision at the 5% significance level.
print("Hypothesis 3: Credit Score and Default Status")
print(f"T-statistic: {t_stat}, P-value: {p_value}")
print(
    "Reject the Null Hypothesis: There is a significant difference in Credit Score between the two groups."
    if p_value < 0.05
    else "Fail to Reject the Null Hypothesis: There is no significant difference in Credit Score between the two groups."
)

Hypothesis 3: Credit Score and Default Status
T-statistic: -5.496275417455252, P-value: 3.916834058584063e-08
Reject the Null Hypothesis: There is a significant difference in Credit Score between the two groups.


### 4. Null Hypothesis - There is no significant difference in Months Employed between the two default groups.

In [49]:
# Split months employed by default status (defaulter == 1 vs. defaulter == 0).
default_group = df.loc[df['defaulter'].eq(1), 'months_employed']
no_default_group = df.loc[df['defaulter'].eq(0), 'months_employed']

In [51]:
# Independent two-sample t-test on months employed. equal_var=True is SciPy's
# default (pooled-variance Student's t) and is only spelled out here.
# NOTE(review): consider equal_var=False (Welch) if the group variances differ.
t_stat, p_value = stats.ttest_ind(default_group, no_default_group, equal_var=True)

In [53]:
# Report the statistic and the decision at the 5% significance level.
print("Hypothesis 4: Months Employed and Default Status")
print(f"T-statistic: {t_stat}, P-value: {p_value}")
print(
    "Reject the Null Hypothesis: There is a significant difference in Months Employed between the two groups."
    if p_value < 0.05
    else "Fail to Reject the Null Hypothesis: There is no significant difference in Months Employed between the two groups."
)

Hypothesis 4: Months Employed and Default Status
T-statistic: -16.153670819881178, P-value: 2.1150902025915106e-58
Reject the Null Hypothesis: There is a significant difference in Months Employed between the two groups.


### 5. Null Hypothesis - There is no significant difference in Loan Amount between the two default groups.

In [56]:
# Split loan amount by default status (defaulter == 1 vs. defaulter == 0).
default_group = df.loc[df['defaulter'].eq(1), 'loan_amount']
no_default_group = df.loc[df['defaulter'].eq(0), 'loan_amount']

In [58]:
# Independent two-sample t-test on loan amount. equal_var=True is SciPy's
# default (pooled-variance Student's t) and is only spelled out here.
# NOTE(review): consider equal_var=False (Welch) if the group variances differ.
t_stat, p_value = stats.ttest_ind(default_group, no_default_group, equal_var=True)

In [60]:
# Report the statistic and the decision at the 5% significance level.
print("Hypothesis 5: Loan Amount and Default Status")
print(f"T-statistic: {t_stat}, P-value: {p_value}")
print(
    "Reject the Null Hypothesis: There is a significant difference in Loan Amount between the two groups."
    if p_value < 0.05
    else "Fail to Reject the Null Hypothesis: There is no significant difference in Loan Amount between the two groups."
)

Hypothesis 5: Loan Amount and Default Status
T-statistic: 15.091082597204297, P-value: 3.1183675041166574e-51
Reject the Null Hypothesis: There is a significant difference in Loan Amount between the two groups.


### 6. Null Hypothesis - There is no significant difference in Interest Rate between the two default groups.

In [63]:
# Split interest rate by default status (defaulter == 1 vs. defaulter == 0).
default_group = df.loc[df['defaulter'].eq(1), 'interest_rate']
no_default_group = df.loc[df['defaulter'].eq(0), 'interest_rate']

In [65]:
# Independent two-sample t-test on interest rate. equal_var=True is SciPy's
# default (pooled-variance Student's t) and is only spelled out here.
# NOTE(review): consider equal_var=False (Welch) if the group variances differ.
t_stat, p_value = stats.ttest_ind(default_group, no_default_group, equal_var=True)

In [67]:
# Report the statistic and the decision at the 5% significance level.
# The label now names the variable actually tested; it previously read
# "Number of Interest Rate", a copy-paste leftover.
print("Hypothesis 6: Interest Rate and Default Status")
print(f"T-statistic: {t_stat}, P-value: {p_value}")
if p_value < 0.05:
    print("Reject the Null Hypothesis: There is a significant difference in Interest Rate between the two groups.")
else:
    print("Fail to Reject the Null Hypothesis: There is no significant difference in Interest Rate between the two groups.")

Hypothesis 6: Number of Interest Rate and Default Status
T-statistic: 20.7553817388394, P-value: 6.92501174334232e-95
Reject the Null Hypothesis: There is a significant difference in Interest Rate between the two groups.
