In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statistics
import imblearn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score,classification_report,confusion_matrix
from scipy.stats import probplot
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


In [None]:
# Load the raw CSV and keep an untouched copy; all later cells mutate df_train.
df_train_original = pd.read_csv('Credit Score.csv')
df_train = df_train_original.copy()
df_train.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
df_train.info()

In [None]:
# Summary statistics of the non-numeric columns, transposed.
df_train.describe(exclude=np.number).T

In [None]:
# Null count per column.
df_train.isnull().sum()

In [None]:
# Drop columns (in place).
drop_columns = ['ID','Customer_ID','Name','SSN']
df_train.drop(drop_columns,axis=1,inplace=True)

In [None]:
# Drop column because of outliers (in place); drop_columns is re-bound here.
drop_columns = ['Amount_invested_monthly']
df_train.drop(drop_columns,axis=1,inplace=True)

In [None]:
# Disabled: drop columns because of multicollinearity.
# drop_columns = ['Annual_Income','Monthly_Inhand_Salary']
# df_train.drop(drop_columns,axis=1,inplace=True)

In [None]:
# Column names split by dtype: 'number' vs 'object'.
numericals = df_train.select_dtypes(include='number').columns.tolist()
categorical = df_train.select_dtypes(include='object').columns.tolist()

print(f"Numerical columns are {numericals}")
print(f"Categorical columns are {categorical}")

In [None]:
# Column details
def get_column_details(df,column):
    """Print a short profile of ``df[column]``.

    Prints, in order: a heading, the dtype, whether the column has nulls
    (and how many), the number of unique values, and the value counts.
    Returns None.
    """
    print("Details of",column,"column")
    kolom = df[column]

    # Data type
    print("\nTipe Data: ",kolom.dtype)

    # Report missing values (count is never negative, so exactly one branch fires)
    jumlah_null = kolom.isnull().sum()
    if jumlah_null > 0:
        print("\nAda ",jumlah_null," null values")
    elif jumlah_null == 0:
        print("\nTidak ada value yang kosong")

    # Number of distinct non-null values
    print("\nUnik: ",kolom.nunique())

    # Frequency of each value
    print("\nDistribution of column:\n")
    print(kolom.value_counts())

In [None]:
# Fill missing values
def fill_missing_with(df, column):
    """Fill the nulls of ``df[column]`` in place with the column median.

    Prints the null count before and after filling. Returns None.
    """
    def _jumlah_kosong():
        # Current number of nulls in the target column
        return df[column].isnull().sum()

    print("\nSebelum diisi:",_jumlah_kosong())

    nilai_median = df[column].median()
    df[column] = df[column].fillna(nilai_median)

    print("\nSesudah diisi:",_jumlah_kosong())

In [None]:
# Fill missing values using a per-group statistic
def fill_groupby(df, column):
    """Fill the nulls of ``df[column]`` in place with the mean of the row's 'Occupation' group.

    Prints the null count before and after filling. Returns None.
    """
    def _rata_rata(grup):
        # Mean of one group's values
        return grup.mean()

    print("\nSebelum diisi:",df[column].isnull().sum())
    rata_per_grup = df.groupby('Occupation')[column].transform(_rata_rata)
    df[column] = df[column].fillna(rata_per_grup)
    print("\nSesudah diisi:",df[column].isnull().sum())

In [None]:
def fill_imputer(df, column):
    """Impute ``df[column]`` in place with a scikit-learn imputer.

    Object-dtype columns use ``SimpleImputer(strategy='most_frequent')``;
    every other dtype uses ``IterativeImputer()`` fitted on that single column.
    """
    kolom_teks = df[column].dtypes == object
    imputer = SimpleImputer(strategy='most_frequent') if kolom_teks else IterativeImputer()
    df[column] = imputer.fit_transform(df[[column]])

In [None]:
def fill_missing(df, column):
    """Fill the missing values of ``df[column]`` in place.

    Dispatches to the currently selected strategy (median fill). The
    alternatives are kept as commented-out calls so the strategy can be
    switched in one place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column to fill.
    """
    # Use the arguments rather than the notebook globals so the function
    # works for any frame/column it is given.
    #fill_groupby(df, column)
    fill_missing_with(df, column)
    #fill_imputer(df, column)

In [None]:
#Label Encoding
def ubah_label(df, column):
    """Label-encode ``df[column]`` in place and print the learned classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column to encode.
    """
    from sklearn.preprocessing import LabelEncoder

    label_encoder = LabelEncoder()
    # Encode the frame that was passed in, not the notebook-global df_train.
    df[column] = label_encoder.fit_transform(df[column])
    print(label_encoder.classes_)

In [None]:
def regex(df, column, sampah=None,dummy=''):
    """Replace cells of ``df[column]`` that are exactly equal to ``sampah`` with ``dummy``.

    Despite the name this is an exact-value replacement (``Series.replace``
    without ``regex=True``), not a pattern substitution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Column to clean.
    sampah : object, optional
        Junk value to replace. When None the function does nothing.
    dummy : object, default ''
        Replacement value.
    """
    if sampah is None:
        return

    df[column] = df[column].replace(sampah,dummy)
    # Report what was actually written; keep the historical wording for the
    # default empty-string replacement.
    pengganti = 'Blank' if dummy == '' else dummy
    print(f"\nSampah {sampah} is replaced with {pengganti}")

In [None]:
# Decimal scaling
def dec_scaling(df, column, divisor=100):
    """Scale ``df[column]`` in place by dividing every value by ``divisor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Numeric column to scale.
    divisor : float, default 100
        Value to divide by; the default keeps the original behaviour.
    """
    # Plain Series division keeps the result 1-D and aligned with df's index.
    df[column] = df[column] / divisor

In [None]:
# Categorical processing
def cat_process(df, column, sampah=None, dummy=''):
    """Clean and encode one categorical column.

    First replaces the junk value ``sampah`` with ``dummy`` via ``regex``,
    then label-encodes the column via ``ubah_label``.
    """
    regex(df, column, sampah=sampah, dummy=dummy)
    ubah_label(df, column)
    # Optional decimal scaling step, currently disabled:
    #dec_scaling(df_train, column_name)

In [None]:
# Outlier handling using quantiles (IQR capping)
def anti_outliers(df, column):
    """Cap the values of ``df[column]`` in place to the 1.5*IQR fences.

    Values at or above ``Q3 + 1.5*IQR`` become that upper fence and values at
    or below ``Q1 - 1.5*IQR`` become the lower fence; nulls are left as they
    are. Q1, Q3 and the IQR are printed. The resulting column is float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Numeric column to cap.
    """
    Q1=df[column].quantile(0.25)
    Q3=df[column].quantile(0.75)
    IQR=Q3-Q1
    print(Q1)
    print(Q3)
    print(IQR)
    lower = Q1-1.5*IQR
    upper = Q3+1.5*IQR

    # Clip the frame that was passed in (not the notebook-global df_train) and
    # keep the Series index, so frames with a non-default index are handled.
    # The float cast preserves the dtype the previous np.where version produced.
    df[column] = df[column].clip(lower=lower, upper=upper).astype(float)
        



In [None]:
from scipy.stats.mstats import winsorize
def anti_outliers_winsorize(df, column, limits=(0.1, 0.2)):
    """Winsorize ``df[column]`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Numeric column to winsorize.
    limits : pair of float, default (0.1, 0.2)
        Fractions to cut on the lower and upper end, passed straight to
        ``scipy.stats.mstats.winsorize``. The default matches the previously
        hard-coded ``[0.1, 0.2]``.
    """
    df[column]=winsorize(df[column], limits=list(limits))

In [None]:
# Numerical processing
def num_process(df, column, strip=None, datatype=None, sampah=None):
    """Clean one numeric column of ``df`` in place.

    Steps, in order:
    1. if the column is object dtype and ``strip`` is given, strip those
       leading/trailing characters from every string;
    2. if ``datatype`` is given, cast the column to it;
    3. cap outliers with ``anti_outliers``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Column to process.
    strip : str, optional
        Characters to strip from both ends of string values.
    datatype : type or str, optional
        Target dtype for ``astype``.
    sampah : object, optional
        Currently unused; kept so existing calls stay valid.
    """
    #regex(df, column, sampah)
    # Strip stray leading/trailing characters
    if df[column].dtype == object and strip is not None:
        df[column] = df[column].str.strip(strip)
        print(f"\nTrailing & leading {strip} are removed")

    # Change the datatype
    if datatype is not None:
        df[column] = df[column].astype(datatype)
        print(f"\nDatatype of {column} is changed to {datatype}")

    # Operate on the arguments, not on the notebook globals df_train/column_name.
    anti_outliers(df, column)
    #anti_outliers_winsorize(df, column)
    #dec_scaling(df, column)

In [None]:
# Outlier / distribution plots
def plot(df, column):
    """Print skewness and kurtosis of ``df[column]`` and draw three side-by-side plots.

    The panels are, left to right: histogram, box plot, and a normal
    probability plot. The column name is used as the figure title.
    """
    data = df[column]
    print(f"Skewness of {column}:",data.skew())
    print(f"Kurtosis of {column}:",data.kurtosis())

    plt.figure(figsize=(14,4))
    # First two panels are seaborn plots of the same series
    for posisi, gambar in ((131, sns.histplot), (132, sns.boxplot)):
        plt.subplot(posisi)
        gambar(data)
    # Third panel: probability plot against the normal distribution
    plt.subplot(133)
    probplot(data,rvalue=True,plot=plt,dist='norm')
    plt.suptitle(column)
    plt.show()

#categorical

In [None]:
#Credit Score
# Target column: show its profile, then run the categorical pipeline (no junk value given).
column_name = 'Credit_Score'

get_column_details(df_train, column_name)
cat_process(df_train, column_name)



In [None]:
#Occupation
# NOTE(review): cat_process label-encodes the column before fill_missing runs,
# so the fill operates on the encoded values — confirm this ordering is intended.
column_name = 'Occupation'
sampah = '_______'

cat_process(df_train, column_name,sampah)
fill_missing(df_train, column_name)
get_column_details(df_train, column_name)
# Bare expression below is not the cell's last line.
df_train
df_train['Occupation'].isnull().sum()


In [None]:
#Type_of_Loan
# Same steps as the Occupation cell.
column_name = 'Type_of_Loan'
sampah = '_______'

cat_process(df_train, column_name,sampah)
fill_missing(df_train, column_name)
get_column_details(df_train, column_name)
df_train
df_train['Type_of_Loan'].isnull().sum()


In [None]:
#Credit_Mix
# Same steps; the junk value here is a single underscore.
column_name = 'Credit_Mix'
sampah = '_'

cat_process(df_train, column_name,sampah)
fill_missing(df_train, column_name)
get_column_details(df_train, column_name)
df_train
df_train['Credit_Mix'].isnull().sum()


In [None]:
#Credit_History_Age
column_name = 'Credit_History_Age'
# NOTE(review): sampah is assigned here but not passed to any call in this cell.
sampah = 'NA'

def Month_Converter(val):
    """Convert a credit-history string to a total number of months.

    ``val`` is split on single spaces; token 0 is read as years and token 3
    as months (e.g. ``'22 Years and 1 Months'`` -> 265). Null values are
    returned unchanged.
    """
    if pd.isnull(val):
        return val
    bagian = val.split(' ')
    tahun = int(bagian[0])
    bulan = int(bagian[3])
    return tahun * 12 + bulan
    
# Convert every Credit_History_Age value with Month_Converter and cast to float, then fill nulls.
df_train['Credit_History_Age'] = df_train['Credit_History_Age'].apply(lambda x: Month_Converter(x)).astype(float)
fill_missing(df_train, column_name)
print(df_train['Credit_History_Age'])
df_train['Credit_History_Age'].isnull().sum()

In [None]:
#Payment_Of_Min_Amount
# Categorical pipeline with 'NM' treated as the junk value.
column_name = 'Payment_of_Min_Amount'
sampah = 'NM'

cat_process(df_train, column_name,sampah)
fill_missing(df_train, column_name)
get_column_details(df_train, column_name)
df_train
df_train['Payment_of_Min_Amount'].isnull().sum()


In [None]:
#Payment_Behaviour
# Here the junk value is replaced with the explicit label 'Unknown' instead of ''.
column_name = 'Payment_Behaviour'
sampah = '!@9#%8'
dummy = 'Unknown'

cat_process(df_train, column_name,sampah,dummy)
fill_missing(df_train, column_name)
get_column_details(df_train, column_name)
df_train
df_train['Payment_Behaviour'].isnull().sum()

In [None]:
#Month
# Parse the month with format '%B' and keep the month number.
df_train['Month'] = pd.to_datetime(df_train.Month, format='%B').dt.month
df_train['Month'].isnull().sum()

#Numerical

In [None]:
#Age
# Every numerical cell below follows the same pattern: set column_name,
# clean the column with num_process, fill nulls with fill_missing, then plot it.
column_name = 'Age'

num_process(df_train, column_name, strip='_', datatype=int)
fill_missing(df_train, column_name)
plot(df_train, column_name)
# Shape of the rows whose Age is at most 120 after processing.
df_train[df_train.Age <= 120].shape

In [None]:
# Flagged as multicollinear
#Annual Income
column_name = 'Annual_Income'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
# Flagged as multicollinear
#Monthly_Inhand_Salary
column_name = 'Monthly_Inhand_Salary'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Num_Bank_Accounts
column_name = 'Num_Bank_Accounts'

num_process(df_train, column_name, strip='_', datatype=int)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Num_Credit_Card
column_name = 'Num_Credit_Card'

num_process(df_train, column_name, strip='_', datatype=int)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Interest_Rate
column_name = 'Interest_Rate'

num_process(df_train, column_name, strip='_', datatype=int)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Num_of_Loan
column_name = 'Num_of_Loan'

num_process(df_train, column_name, strip='_', datatype=int)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Delay_from_due_date
column_name = 'Delay_from_due_date'

num_process(df_train, column_name, strip='_', datatype=int)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Num_of_Delayed_Payment
column_name = 'Num_of_Delayed_Payment'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Changed_Credit_Limit
# Extra steps for this column: exact-match '_' cells are replaced with '' first,
# then pd.to_numeric(errors='coerce') converts the column before num_process runs.
column_name = 'Changed_Credit_Limit'
sampah = '_'
regex(df_train, column_name, sampah)
df_train['Changed_Credit_Limit']=pd.to_numeric(df_train['Changed_Credit_Limit'], errors='coerce')
num_process(df_train, column_name,  strip='_',datatype='float')
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Num_Credit_Inquiries
column_name = 'Num_Credit_Inquiries'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Outstanding_Debt
column_name = 'Outstanding_Debt'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Credit_Utilization_Ratio
column_name = 'Credit_Utilization_Ratio'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
#Total_EMI_per_month
column_name = 'Total_EMI_per_month'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

In [None]:
# Disabled cell — note: column was dropped because of outliers
# #Amount_invested_monthly
# column_name = 'Amount_invested_monthly'


# num_process(df_train, column_name, strip='_', datatype=float)
# fill_missing(df_train, column_name)
# plot(df_train, column_name)

In [None]:
#Monthly_Balance
column_name = 'Monthly_Balance'

num_process(df_train, column_name, strip='_', datatype=float)
fill_missing(df_train, column_name)
plot(df_train, column_name)

#Multicollinearity

In [None]:
# Separate features from the Credit_Score target and hold out 20% (seeded)
# so the VIF check below is computed on the training part only.
from sklearn.model_selection import train_test_split
feature = df_train.drop(columns='Credit_Score')
target = df_train[['Credit_Score']]

feature_cs_train, feature_cs_test, target_cs_train, target_cs_test = train_test_split(feature, target, test_size=0.20, random_state=42)

In [None]:
# VIF (variance inflation factor) check
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif 
from statsmodels.tools.tools import add_constant

# A constant column is added before computing VIF; NOTE: this re-binds the name X.
X=add_constant(feature_cs_train)

# One VIF score per column of X, indexed by column name.
vif_df=pd.DataFrame([vif(X.values, i)
                     for i in range(X.shape[1])],
                    index=X.columns).reset_index()
vif_df.columns=['feature','vif_score']
# The row for the added constant is removed from the report.
vif_df=vif_df.loc[vif_df.feature!='const']
vif_df

In [None]:
# Disabled: export the processed frame to Excel.
# df_train.to_excel('creditscore_output.xlsx', engine='xlsxwriter')

In [None]:
# Null count per column after preprocessing.
df_train.isnull().sum()

In [None]:
#Split Input & Output Data
X = df_train.drop('Credit_Score',axis=1)
y = df_train['Credit_Score']
print(X.shape)
print(y.shape)

In [None]:
smote = SMOTE() # Synthetic Minority Oversampling TEchnique
X, y = smote.fit_resample(X,y)
print(X.shape)
print(y.shape)

In [None]:
#Normalize Data
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
scaler = MinMaxScaler()
scaler2 = StandardScaler()
X = scaler2.fit_transform(X)

In [None]:
#Split Data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

#Klasifikasi

In [None]:
#Method to evaluate the performance of the model
def evaluate_model(y_test,y_pred):
    """Print and plot evaluation results for one set of predictions.

    Prints the scikit-learn classification report, shows the confusion matrix
    as a heatmap, prints a per-class table of TP/TN/FP/FN, and prints the
    average specificity (mean over classes of TN / (TN + FP)).

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    print("Classification Report")
    print(classification_report(y_test, y_pred))

    # Use the union of true and predicted labels so the rows/columns of the
    # confusion matrix and the per-class table always refer to the same classes,
    # even when a class is predicted that does not occur in y_test.
    labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))

    # Compute confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Create a heatmap of the confusion matrix using Seaborn
    sns.heatmap(cm, annot=True, cmap='Greens',fmt='.0f')

    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')

    plt.show()

    # One-vs-rest counts per class, derived from the matrix margins.
    tp = np.diag(cm).astype(float)
    fn = cm.sum(axis=1) - tp
    fp = cm.sum(axis=0) - tp
    tn = cm.sum() - tp - fn - fp

    per_class = pd.DataFrame({'Kelas ': labels, 'TP' : tp, 'TN': tn, 'FP': fp, 'FN': fn })
    specificity = tn/(tn+fp)
    avg_specificity = specificity.mean()
    print(per_class)
    print(f'Average Specificity : {avg_specificity}')
    
        
        
    

In [None]:
# Classification and evaluation
# The decision tree is seeded so that repeated runs give the same model.
classifiers = [('KNN',KNeighborsClassifier(n_neighbors=17)), ('Decision Tree',DecisionTreeClassifier(random_state=42))]

# Fit every classifier on the training split and report its test metrics
for model, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    akurasi = accuracy_score(y_test, y_pred)
    presisi = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    # Harmonic mean of macro precision and macro recall; defined as 0 when
    # both are 0 instead of raising ZeroDivisionError.
    if presisi + recall > 0:
        f_score = 2 * (presisi * recall) / (presisi + recall)
    else:
        f_score = 0.0

    # Print the performance metrics
    print(f'Classifier: {model}')
    evaluate_model(y_test, y_pred)
    print(f'Accuracy: {akurasi}')
    print(f'Precision: {presisi}')
    print(f'Recall: {recall}')
    print(f'F score: {f_score}')
    print('------------------------------------------------------------')
    print('------------------------------------------------------------')