## **GROUP ASSIGNMENT 1**

| Name      | UOW ID |
| ----------- | ----------- |
| Calaunan Alexander Jr Sumampong      | 7559161       |
| Deon Cham Hui Ern   | 7559471        |
| Elroy Chua Ming Xuan | 7431673 |
| Gonzales Raizel Vera Marie L. | 7436634 |

## Discover and Visualise the data

In [None]:
#imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the raw data file into a DataFrame
df = pd.read_csv('data.csv')

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Preview the first 20 rows
df.head(20)

In [None]:
# Check the data type pandas assigned to each column of df
df.dtypes

In [None]:
# Display descriptive statistics of the data
df.describe()

In [None]:
# Count the missing values in every column (columns without any show 0)
df.isnull().sum()

## Visualizing the Data

In [None]:
## @raizel TO-DO data visualization, perform the following:
### 1. Plot the distribution of the target variable and each of the features 
### 2. Plot the correlation matrix 
### 3. Plot the boxplots of the target variable and each of the features
### 4. Plot the scatterplots of the target variable versus each of the features 
## ^ all suggestions, you can pick and choose depending on what you think is important and edit as you see fit

### Perform Correlations

In [None]:
from scipy.stats import pointbiserialr
import scipy.stats

# List of numerical features for point-biserial correlation.
# Only genuinely numeric columns belong here:
#   * 'emp_length' is handled as an (ordinal) categorical feature, so it is
#     listed once, under categorical_features, instead of in both lists.
#   * The date columns ('last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d')
#     are strings until they are parsed with pd.to_datetime later on, and a
#     date is not a valid input for a point-biserial correlation either way.
# NOTE(review): 'term' is only converted from text to an integer in the later
# "admin" cell -- confirm it is numeric by the time this cell is run.
numerical_features = [
    'loan_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment',
    'annual_inc', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
    'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee',
    'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt',
    'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt',
    'tot_cur_bal', 'total_rev_hi_lim',
]

# List of categorical features for Cramer's V correlation
categorical_features = [
    'grade', 'sub_grade', 'emp_length', 'verification_status', 'pymnt_plan', 'purpose',
    'delinq_2yrs', 'initial_list_status', 'policy_code', 'application_type',
]

# Cramer's V: association between two categorical variables
# (used to relate each categorical feature to the target attribute)
def cramers_v(x, y):
    """Return the bias-corrected Cramer's V association between x and y.

    Parameters
    ----------
    x, y : array-like of equal length
        The two categorical variables; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float
        A value in [0, 1]; 0 means no measurable association. 0.0 is also
        returned when the statistic is undefined: fewer than two
        observations, a variable with a single level, or a variable with as
        many levels as observations. Previously those cases divided by zero
        and produced NaN/inf.
    """
    contingency = pd.crosstab(x, y)
    n = contingency.sum().sum()
    r, k = contingency.shape

    # With a single level (or almost no data) there is nothing to associate.
    if n < 2 or min(r, k) < 2:
        return 0.0

    chi2 = scipy.stats.chi2_contingency(contingency)[0]
    phi2 = chi2 / n

    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The corrected dimension collapses to 1 when every observation has its
    # own level, which would make the denominator zero.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

# Target attribute every feature is correlated against
target = df['default_ind']

# Compute Cramer's V for all the categorical features.
# dtype=float is explicit because a Series built from an index alone is
# object-typed on recent pandas versions, which breaks numeric consumers
# (sorting is fine, but plotting the values is not).
cramers_v_features = pd.Series(index=categorical_features, dtype=float)
for column in categorical_features:
    cramers_v_features[column] = cramers_v(df[column], target)

# Compute point-biserial correlations with the target attribute for numerical features
point_biserial_correlations = pd.Series(index=numerical_features, dtype=float)
non_numeric_columns = []
for column in numerical_features:
    values = df[column]

    # pointbiserialr needs numbers; a text/date column would raise, so it is
    # reported below and left as NaN instead of aborting the whole analysis.
    if not pd.api.types.is_numeric_dtype(values):
        non_numeric_columns.append(column)
        continue

    # A single missing value turns the whole correlation into NaN, so only
    # the rows where both the feature and the target are present are used.
    valid = values.notna() & target.notna()
    if valid.sum() < 2:
        continue  # not enough data; the entry stays NaN
    point_biserial_correlations[column] = pointbiserialr(values[valid], target[valid])[0]

if non_numeric_columns:
    print("Skipped non-numeric columns (point-biserial):", non_numeric_columns)

# Sort the correlations in descending order (NaN entries are placed last)
cramers_v_features_sorted = cramers_v_features.sort_values(ascending=False)
point_biserial_correlations_sorted = point_biserial_correlations.sort_values(ascending=False)

# Display the sorted results
print("\nPoint-Biserial Correlations (sorted):")
print(point_biserial_correlations_sorted)

# Display the sorted results
print("\nCramer's V Correlation (sorted):")
print(cramers_v_features_sorted)

# Combine the results into a single Series.
# Note that the two measures are on different scales: point-biserial lies in
# [-1, 1] (signed) while Cramer's V lies in [0, 1] (unsigned).
all_correlations = pd.concat([point_biserial_correlations_sorted, cramers_v_features_sorted])
all_correlations = all_correlations.sort_values(ascending=False)

print("\nAll Correlations (sorted):")
print(all_correlations)

### Correlations Visualised

In [None]:
# HEATMAP OF CORRELATIONS
# Step 1: Turn the combined correlation values into a one-column DataFrame.
# The values are cast to float first: if the Series is object-typed (as it is
# when created without an explicit dtype on recent pandas versions) the
# heatmap cannot render it.
correlation_df = all_correlations.astype(float).to_frame(name='correlation')

# Step 2: Create a Seaborn heatmap to visualize the correlation values
# (tall, narrow figure: one column, one row per feature)
plt.figure(figsize=(1, 12))
sns.heatmap(correlation_df, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation Heatmap with Default_ind')
plt.show()

## Admin stuff before we start
- Change the values of 'term' column into int type
- Change all the dates from str to datetime type

In [None]:
# Columns that hold mixed types in the CSV get an explicit dtype on import
dtype_dict = {'pymnt_plan': 'str', 'last_pymnt_amnt': 'float', 'dti_joint': 'float'}

# Re-read the CSV file, this time with the explicit column types
df = pd.read_csv('data.csv', dtype=dtype_dict)

# Strip the 'months' suffix from 'term' and store the number as an integer
df['term'] = df['term'].str.replace('months', '').astype(int)

# Parse every date column from text into datetime
for date_column in (
    'issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'next_pymnt_d',
    'last_credit_pull_d',
):
    df[date_column] = pd.to_datetime(df[date_column])

## Preprocessing Pipeline 

- Drop identifier and free-text columns (id, member_id, emp_title, title, zip_code, desc)
- Deal with missing data
    - Remove mostly-null columns (columns with >50% missing data)
    - Impute the remaining gaps via SimpleImputer: median for numeric columns, most frequent value for categorical columns
- One-hot encode nominal categorical columns; ordinal-encode ordinal categorical columns
- Scale numeric data (via StandardScaler)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import pandas as pd

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Drop, impute, encode and scale the columns of a pandas DataFrame.

    ``fit`` learns everything from the frame it is given: which columns are
    mostly empty, the imputation values, the category encodings and the
    scaling parameters. ``transform`` then applies exactly those learned
    steps, so it can be used on unseen data (e.g. a test set) as well.

    Processing steps:
      1. Drop identifier / free-text columns.
      2. Drop the columns that were more than 50% missing at fit time.
      3. Impute: median for numeric columns, most frequent value for
         categorical (nominal and ordinal) columns.
      4. Ordinal-encode ``ord_cols``; one-hot encode the other object columns.
      5. Standard-scale the numeric columns.

    Columns that are neither ``object`` nor numeric (for example datetime
    columns) are passed through untouched.

    NOTE(review): ``OrdinalEncoder()`` is created with its default category
    handling, so the integer codes follow scikit-learn's default category
    order -- confirm that this matches the intended ranking of 'emp_length'.
    """

    def __init__(self):
        self.num_imputer = SimpleImputer(strategy="median")
        # Nominal and ordinal columns each need their own imputer: fitting one
        # imputer on both groups would overwrite the first fit with the second.
        self.cat_imputer = SimpleImputer(strategy="most_frequent")
        self.ord_imputer = SimpleImputer(strategy="most_frequent")
        self.oneHot_encoder = OneHotEncoder(handle_unknown='ignore')
        self.ordinal_encoder = OrdinalEncoder()
        self.scaler = StandardScaler()

        # Categorical columns
        self.cat_cols = None
        self.ord_cols = None

        # Numerical columns
        self.num_cols = None

        # Columns found to be >50% missing during fit
        self.mostly_emp_cols = None

    def drop_rdnt_columns(self, X):
        """Return a copy of X without the identifier / free-text columns.

        Columns that are not present in X are skipped instead of raising.
        """
        redundant = ['id', 'member_id', 'emp_title', 'title', 'zip_code', 'desc']
        return X.drop(columns=[col for col in redundant if col in X.columns])

    def drop_mostly_emp_columns(self, X):
        """Return a copy of X without the columns that are >50% missing.

        X itself is left unchanged.
        """
        cols_to_drop = X.columns[X.isnull().mean() > 0.5]
        return X.drop(columns=cols_to_drop)

    def _drop_learned_columns(self, X):
        """Return a copy of X without the columns removed during fit."""
        X = self.drop_rdnt_columns(X)
        return X.drop(columns=[col for col in self.mostly_emp_cols if col in X.columns])

    def fit(self, X, y=None):
        """Learn the columns to drop, imputation values, encodings and scaling.

        X is not modified. Returns self.
        """
        # Decide which columns to drop; transform() reuses this exact list
        X = self.drop_rdnt_columns(X)
        self.mostly_emp_cols = X.columns[X.isnull().mean() > 0.5].tolist()
        X = X.drop(columns=self.mostly_emp_cols)

        # Find categorical and numeric columns
        self.cat_cols = X.select_dtypes(include='object').columns.tolist()
        self.num_cols = X.select_dtypes(include='number').columns.tolist()

        # List of ordinal columns
        self.ord_cols = ['emp_length', 'grade', 'sub_grade']

        # Remove ordinal columns from categorical columns
        self.cat_cols = [col for col in self.cat_cols if col not in self.ord_cols]

        # print categorical and numeric columns
        print("Categorical (nominal) columns: ", self.cat_cols)
        print("Numeric columns: ", self.num_cols)

        # Fit the imputers and keep the imputed values as plain arrays, so
        # that the caller's DataFrame is never written to
        numeric_data = self.num_imputer.fit_transform(X[self.num_cols])
        ordinal_data = self.ord_imputer.fit_transform(X[self.ord_cols])
        nominal_data = self.cat_imputer.fit_transform(X[self.cat_cols])

        # Fit the encoders and the scaler on the imputed data
        self.oneHot_encoder.fit(nominal_data)
        self.ordinal_encoder.fit(ordinal_data)
        self.scaler.fit(numeric_data)

        return self

    def transform(self, X, y=None):
        """Apply the fitted steps to X and return a new DataFrame.

        X is not modified. The result keeps X's index; the one-hot columns
        are appended after the remaining original columns.
        """
        # Same column removal as during fit (returns a new frame)
        X = self._drop_learned_columns(X)

        # Impute, then scale the numeric columns
        numeric_data = self.num_imputer.transform(X[self.num_cols])
        X[self.num_cols] = self.scaler.transform(numeric_data)

        # Impute, then ordinal encode the ordinal columns
        ordinal_data = self.ord_imputer.transform(X[self.ord_cols])
        X[self.ord_cols] = self.ordinal_encoder.transform(ordinal_data)

        # Impute, then one-hot encode the nominal columns
        nominal_data = self.cat_imputer.transform(X[self.cat_cols])
        one_hot = pd.DataFrame(
            self.oneHot_encoder.transform(nominal_data).toarray(),
            columns=self.oneHot_encoder.get_feature_names_out(self.cat_cols),
            # Reuse X's index: with a fresh 0..n-1 index the concat below
            # would misalign rows whenever X has any other index
            index=X.index,
        )
        X = X.drop(columns=self.cat_cols)
        return pd.concat([X, one_hot], axis=1)

    def fit_transform(self, X, y=None):
        """Fit on X and return the transformed X."""
        return self.fit(X, y).transform(X, y)

In [None]:
# Run the data through the custom preprocessing transformer

# Split the target off first so that it is not imputed, encoded or scaled
y = df['default_ind']  # Target column
X = df.drop(columns=['default_ind'])  # Features

# Create the transformer, then fit it on the features and transform them
transformer = CustomTransformer()

# NOTE: df is rebound to the transformed features; the target now lives in y
df = transformer.fit_transform(X, y)

In [None]:
# Preview the first rows of the processed data
df.head() 

In [None]:
# Count the missing values left in each column of the processed data
df.isnull().sum()

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate features (X) and the target column (y).
# The preprocessing cell rebinds `df` to the transformed features, and the
# target was split off into `y` before that transform, so 'default_ind' is
# normally no longer a column of `df`. Only pull it out of `df` when it is
# actually there; otherwise dropping/selecting it would raise a KeyError.
if 'default_ind' in df.columns:
    y = df['default_ind']  # Target column
    X = df.drop(columns=['default_ind'])  # Features
else:
    X = df  # Features; `y` still holds the target from the preprocessing cell

# Split the data into train (80%) and test (20%) sets, reproducibly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the train and test sets
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

## Select and Train Models