In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import outils_data

In [None]:
df = pd.read_csv('Loan Data.csv')
df.sample(5)

---
**Identification des variables catégorielles et numériques**

In [None]:
df.info()

In [None]:
# Numeric features: every float64/int64 column except Credit_History, which
# this notebook treats as a categorical flag (it is added to
# categorical_columns instead).
numeric_columns = [
    name
    for name in df.select_dtypes(include=['float64', 'int64']).columns
    if name != 'Credit_History'
]
numeric_columns

In [None]:
# Categorical features: the non-numeric columns without the Loan_ID
# identifier, followed by Credit_History (numeric dtype, used as a category).
categorical_columns = [
    name
    for name in df.select_dtypes(exclude=['float64', 'int64']).columns
    if name != 'Loan_ID'
] + ['Credit_History']
categorical_columns

---
Description du dataframe, identification des valeurs nulles

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.describe(exclude=np.number)

In [None]:
df.isna().sum()

---
Analyses univariées

In [None]:
# Univariate view of every numeric variable: draw its distribution, then print
# the verdict of the normality helper (0.05 is passed as its third argument,
# presumably the significance level -- see outils_data).
for column in numeric_columns:
    outils_data.distribution_curve(df, column)

    is_normal = outils_data.normal_distribution(df, column, 0.05)
    message = (
        f"La variable {column} suit une distribution normale."
        if is_normal
        else f"La variable {column} ne suit pas une distribution normale."
    )
    print(message)

# One set of boxplots for all numeric variables
outils_data.draw_boxplots(numeric_columns, df)

In [None]:
for column in categorical_columns :
    print(f"{column} : {df[column].unique()}")

In [None]:
# Work on a copy in which missing categorical values become an explicit
# "Inconnu" category, so that they appear in the frequency tables.
df_inconnu = df.copy()
for column in categorical_columns:
    df_inconnu[column] = df_inconnu[column].fillna("Inconnu")

    # Share of each category, as a percentage rounded to two decimals
    pourcentages = (df_inconnu[column].value_counts(normalize=True) * 100).round(2)
    print("\n")
    print(pourcentages)

In [None]:
# Replace missing categorical values with each column's mode (when several
# values tie, the first one returned by mode() is used). Columns that are not
# listed in the lookup are copied over unchanged.
mode_by_column = {column: df[column].mode()[0] for column in categorical_columns}
df_na_filled = df.fillna(value=mode_by_column)

In [None]:
outils_data.draw_pieplot(df_na_filled,'Gender')

In [None]:
outils_data.draw_pieplot(df_na_filled,'Married')

In [None]:
outils_data.draw_barplot(df_na_filled,'Dependents')

In [None]:
outils_data.draw_pieplot(df_na_filled,'Education')

In [None]:
outils_data.draw_pieplot(df_na_filled,'Self_Employed')

In [None]:
outils_data.draw_pieplot(df_na_filled,'Property_Area')

In [None]:
outils_data.draw_pieplot(df_na_filled,'Loan_Status')

In [None]:
outils_data.draw_pieplot(df_na_filled,'Credit_History')

---
**Analyses bivariées**

In [None]:
df_corr = df[numeric_columns]
correl = df_corr.corr()
correl.style.background_gradient(cmap='coolwarm')

In [None]:
# Annotated heatmap of the correlations between the numeric variables,
# drawn on an explicitly created 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr.corr(), fmt='.2g', annot=True, ax=ax)

In [None]:
outils_data.chi_squared_tests(df_na_filled, categorical_columns)

Remplaçons les valeurs nulles par la médiane pour les colonnes LoanAmount et Loan_Amount_Term

In [None]:
med_loan_amount = df_na_filled['LoanAmount'].median()
df_na_filled['LoanAmount'] = df_na_filled['LoanAmount'].fillna(med_loan_amount)

In [None]:
med_loan_amount_term = df_na_filled['Loan_Amount_Term'].median()
df_na_filled['Loan_Amount_Term'] = df_na_filled['Loan_Amount_Term'].fillna(med_loan_amount_term)

Pour chacune des colonnes numériques, on va essayer de voir le nombre d'outliers en utilisant les fonctions du module outils_data

In [None]:
df_to_use = df_na_filled.copy()

# Flag outliers with the detectors handed to dataframe_outliers (the IQR and
# z-score helpers, plus the `cont` setting). The original note speaks of three
# methods, so `cont` presumably drives a third, contamination-based detector
# -- TODO confirm in outils_data.
df_with_outliers = outils_data.dataframe_outliers(df_to_use, numeric_columns, outils_data.iqr, outils_data.zscore, cont=0.02)

for column in numeric_columns:
    # Rows reported as outliers for this column at threshold=2 (presumably the
    # number of detectors that must agree -- see outils_data).
    df_outliers = outils_data.outliers_by_column(df_with_outliers, column, threshold=2)

    # Replacement value: the column median, cast back to int for int64 columns
    # so that the assignment does not introduce a float into them.
    med = df_to_use[column].median()
    if df_to_use[column].dtype == 'int64':
        med = int(med)

    # Overwrite every flagged loan in one vectorised assignment instead of
    # iterating over the outlier rows and re-scanning Loan_ID for each of them.
    is_outlier = df_to_use['Loan_ID'].isin(df_outliers['Loan_ID'])
    df_to_use.loc[is_outlier, column] = med

**Mise en oeuvre des algorithmes**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df_to_use.info()

In [None]:
# One-hot encode Property_Area. dtype=int keeps the indicator columns numeric
# (0/1) like the other encoded features; with the default dtype they can come
# out as booleans, and a frame mixing booleans with ints and floats converts
# to an object array when the feature matrix is extracted.
df_to_use = pd.get_dummies(df_to_use, columns=["Property_Area"], dtype=int)
df_to_use.sample(10)

In [None]:
# Turn the remaining text categories into integer codes.
# Values are translated through a lookup instead of being overwritten in place
# with masked .loc assignments, so the step does not depend on the columns
# accepting mixed str/int content. Values without an entry are passed through
# unchanged, which keeps the cell safe to re-run on already-encoded data; any
# unexpected label still makes astype(int) fail loudly.
category_codes = {
    'Dependents': {'3+': 3},  # open-ended bucket capped at 3; '0'/'1'/'2' are parsed by astype
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Loan_Status': {'N': 0, 'Y': 1},
}

for column, codes in category_codes.items():
    df_to_use[column] = (
        df_to_use[column]
        .map(lambda value, codes=codes: codes.get(value, value))
        .astype(int)
    )

# Credit_History is numeric already; only its dtype changes.
df_to_use['Credit_History'] = df_to_use['Credit_History'].astype(int)

df_to_use.sample(10)

In [None]:
# Split the frame into features and target.
# The features are cast to float to obtain a homogeneous numeric matrix: with
# mixed column dtypes (e.g. boolean dummy columns next to ints and floats)
# `.values` yields an object array, which Keras cannot convert to a tensor.
X = df_to_use.drop(["Loan_ID", "Loan_Status"], axis=1).to_numpy(dtype=float)
y = df_to_use["Loan_Status"].to_numpy()

In [None]:
# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Faire de l'oversampling pour rééquilibrer les classes
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=21)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

*Régression logistique*

In [None]:
# Entraînement du modèle
lr_model = LogisticRegression()
lr_model.fit(X_resampled, y_resampled)

In [None]:
# Prédiction et évaluation
y_pred = lr_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

---
*Random forest*

In [None]:
# Initialisation et entraînement du modèle
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)

In [None]:
# Prédiction et évaluation
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

---
*Algorithme des K Plus Proches Voisins (KNN)*

In [None]:
# First experiment: k=5
# Fit on the SMOTE-balanced training set, like the logistic regression and the
# random forest, so that all models are compared on the same training data.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_resampled, y_resampled)

In [None]:
# Prédiction et évaluation
y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

In [None]:
# Second experiment: k=3
# Fit on the SMOTE-balanced training set, like the logistic regression and the
# random forest, so that all models are compared on the same training data.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_resampled, y_resampled)

# Predict on the untouched test set and evaluate
y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

In [None]:
# Third experiment: k=7
# Fit on the SMOTE-balanced training set, like the logistic regression and the
# random forest, so that all models are compared on the same training data.
knn_model = KNeighborsClassifier(n_neighbors=7)
knn_model.fit(X_resampled, y_resampled)

# Predict on the untouched test set and evaluate
y_pred = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

---
*Support Vector Machines*

In [None]:
# SVM with a linear kernel.
# Fit on the SMOTE-balanced training set, like the logistic regression and the
# random forest, so that all models are compared on the same training data.
svm_model = SVC(kernel='linear')
svm_model.fit(X_resampled, y_resampled)

# Predict on the untouched test set and evaluate
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

In [None]:
# SVM with a sigmoid kernel.
# Fit on the SMOTE-balanced training set, like the logistic regression and the
# random forest, so that all models are compared on the same training data.
svm_model = SVC(kernel='sigmoid')
svm_model.fit(X_resampled, y_resampled)

# Predict on the untouched test set and evaluate
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

In [None]:
# SVM with a polynomial kernel.
# Fit on the SMOTE-balanced training set, like the logistic regression and the
# random forest, so that all models are compared on the same training data.
svm_model = SVC(kernel='poly')
svm_model.fit(X_resampled, y_resampled)

# Predict on the untouched test set and evaluate
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")

---
*Réseaux de neurones artificiels*

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Fix the random seeds (Python, NumPy and TensorFlow) before the layers are
# created, so that weight initialisation -- and therefore the reported scores
# -- can be reproduced, as for the seeded split, SMOTE and random forest.
tf.keras.utils.set_random_seed(42)

# Sequential feed-forward network
ann_model = Sequential()

# Two hidden ReLU layers; the first one declares one input per feature column
ann_model.add(Dense(units=64, activation='relu', input_shape=(X_train.shape[1],)))
ann_model.add(Dense(units=32, activation='relu'))
ann_model.add(Dense(units=1, activation='sigmoid'))  # Output layer for binary classification

In [None]:
# Compilation du modèle
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the network on the SMOTE-balanced training set, like the logistic
# regression and the random forest, so that all models are compared on the
# same training data. The test set is only used to monitor validation metrics.
ann_model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))

In [None]:
# Evaluate the network on the held-out test set
loss, accuracy = ann_model.evaluate(X_test, y_test)

# The report has to be built from the network's own predictions: threshold the
# sigmoid outputs at 0.5 to obtain class labels. Reusing the y_pred left over
# from an earlier cell would describe the previous model instead.
y_pred = (ann_model.predict(X_test) > 0.5).astype(int).ravel()
report = classification_report(y_test, y_pred)

print(f"Accuracy : {accuracy}")
print(f"Classification report : \n{report}")