# ML sur données de credit

In [1]:
import pandas as pd
import numpy as np

## Importer les données (import + jointure)

In [2]:
credit1 = pd.read_csv("./data/credit1.txt",sep="\t")
credit2 = pd.read_excel("./data/credit2.xlsx")
credit_global = pd.merge(credit1,credit2,on = "Customer_ID")

In [3]:
credit_global.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 31 columns):
Months_as_a_Customer               2000 non-null int64
Number_of_Products                 2000 non-null int64
RFM_Score                          2000 non-null int64
Average_Balance_Feed_Index         2000 non-null int64
Number_of_Transactions             2000 non-null int64
Personal_Debt_to_Equity_Ratio      2000 non-null int64
Months_Current_Account             2000 non-null int64
Number_of_Loan_Accounts            2000 non-null int64
Customer_ID                        2000 non-null int64
Has_Bad_Payment_Record             2000 non-null int64
Members_Within_Household           2000 non-null int64
Number_of_Call_Center_Contacts     2000 non-null int64
Weeks_Since_Last_Offer             2000 non-null int64
Accepted_Personal_Loan             2000 non-null object
Accepted_Retention                 2000 non-null object
Accepted_Home_Equity_Loan          2000 non-null object
Accepted_C

In [4]:
credit_global["Accepted_Personal_Loan"].value_counts()

F    1742
T     258
Name: Accepted_Personal_Loan, dtype: int64

### Extraire les colonnes pour X et pour y

In [5]:
y = credit_global["Accepted_Personal_Loan"]

In [6]:
credit_global.columns

Index(['Months_as_a_Customer', 'Number_of_Products', 'RFM_Score',
       'Average_Balance_Feed_Index', 'Number_of_Transactions',
       'Personal_Debt_to_Equity_Ratio', 'Months_Current_Account',
       'Number_of_Loan_Accounts', 'Customer_ID', 'Has_Bad_Payment_Record',
       'Members_Within_Household', 'Number_of_Call_Center_Contacts',
       'Weeks_Since_Last_Offer', 'Accepted_Personal_Loan',
       'Accepted_Retention', 'Accepted_Home_Equity_Loan',
       'Accepted_Credit_Card', 'Annual_value', 'Interested_in_Personal_Loan',
       'Interested_in_Retention', 'Interested_in_Home_Equity_Loan',
       'Interested_in_Credit_Card', 'Age', 'Gender', 'Marital_Status',
       'Age_Youngest_Child', 'Number_of_Workers_in_Household',
       'Percentage_White_Collar_Workers', 'Household_Debt_to_Equity_Ratio',
       'Income', 'Homeowner'],
      dtype='object')

In [7]:
list_vars = [ 'Months_as_a_Customer', 'Number_of_Products', 'RFM_Score',
       'Average_Balance_Feed_Index', 'Number_of_Transactions',
       'Personal_Debt_to_Equity_Ratio', 'Months_Current_Account',
       'Number_of_Loan_Accounts', 'Has_Bad_Payment_Record',
       'Members_Within_Household', 'Number_of_Call_Center_Contacts',
       'Weeks_Since_Last_Offer','Age', 'Gender', 
       'Age_Youngest_Child', 'Number_of_Workers_in_Household',
       'Percentage_White_Collar_Workers', 'Household_Debt_to_Equity_Ratio',
       'Income', 'Homeowner' ]

In [8]:
x = credit_global[list_vars]

In [9]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 20 columns):
Months_as_a_Customer               2000 non-null int64
Number_of_Products                 2000 non-null int64
RFM_Score                          2000 non-null int64
Average_Balance_Feed_Index         2000 non-null int64
Number_of_Transactions             2000 non-null int64
Personal_Debt_to_Equity_Ratio      2000 non-null int64
Months_Current_Account             2000 non-null int64
Number_of_Loan_Accounts            2000 non-null int64
Has_Bad_Payment_Record             2000 non-null int64
Members_Within_Household           2000 non-null int64
Number_of_Call_Center_Contacts     2000 non-null int64
Weeks_Since_Last_Offer             2000 non-null int64
Age                                2000 non-null int64
Gender                             2000 non-null object
Age_Youngest_Child                 2000 non-null int64
Number_of_Workers_in_Household     2000 non-null int64
Percentage_W

### Séparation apprentissage / validation

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3)

In [12]:
x_train.shape

(1400, 20)

### Préparation des données

In [13]:
# on importe une classe
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [14]:
# on crée un objet
homeowner_transfo = LabelEncoder()
# on fait la même chose pour Gender
gender_transfo = LabelEncoder()

In [15]:
# on applique une méthode sur les données
x_train["Homeowner"]=homeowner_transfo.fit_transform(x_train["Homeowner"])
# on fait la même chose pour gender
x_train["Gender"]=gender_transfo.fit_transform(x_train["Gender"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [16]:
# on applique la transformation sur x_test
x_test["Homeowner"]=homeowner_transfo.transform(x_test["Homeowner"])
# on fait la même chose pour gender
x_test["Gender"]=gender_transfo.transform(x_test["Gender"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


### Ajustement des modèles

In [17]:
# on importe les classes
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [18]:
# on crée les objets
modele_logit = LogisticRegression()
modele_rf = RandomForestClassifier()
modele_gbm = GradientBoostingClassifier()
modele_svm = SVC()
modele_nn = MLPClassifier()

In [19]:
# on ajuste les modèles avec les données (.fit())
modele_logit.fit(x_train,y_train)
modele_rf.fit(x_train,y_train)
modele_gbm.fit(x_train,y_train)
modele_svm.fit(x_train,y_train)
modele_nn.fit(x_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [20]:
pd.DataFrame(modele_logit.coef_,index=["coef"],
             columns = x.columns).T.sort_values("coef")

Unnamed: 0,coef
Number_of_Workers_in_Household,-0.036186
Percentage_White_Collar_Workers,-0.020452
Household_Debt_to_Equity_Ratio,-0.011641
Number_of_Transactions,-0.003747
Personal_Debt_to_Equity_Ratio,-0.003726
Age,-0.003726
Number_of_Call_Center_Contacts,-0.002608
Months_Current_Account,-0.001653
Gender,-0.001648
Number_of_Loan_Accounts,-0.001646


In [21]:
pd.DataFrame(modele_rf.feature_importances_,columns=["coef"],
             index = x.columns).sort_values("coef",ascending=False)

Unnamed: 0,coef
Income,0.102362
Weeks_Since_Last_Offer,0.099618
Months_Current_Account,0.098325
Age,0.077841
Personal_Debt_to_Equity_Ratio,0.070941
Household_Debt_to_Equity_Ratio,0.065447
Percentage_White_Collar_Workers,0.062038
Number_of_Workers_in_Household,0.056061
Age_Youngest_Child,0.055424
Members_Within_Household,0.048573


### Validation des modèles

In [22]:
# on prédit sur les données de test
y_pred_logit = modele_logit.predict(x_test)
y_pred_rf = modele_rf.predict(x_test)
y_pred_gbm = modele_gbm.predict(x_test)
y_pred_svm = modele_svm.predict(x_test)
y_pred_nn = modele_nn.predict(x_test)

In [23]:
# on récupère des indicateurs de qualité prédictive
from sklearn.metrics import accuracy_score, confusion_matrix

In [24]:
print("Pourcentage de bien classé pour le modèle logit : {}".format(
    accuracy_score(y_test, y_pred_logit)))
print("Pourcentage de bien classé pour le modèle RF : {}".format(
    accuracy_score(y_test, y_pred_rf)))
print("Pourcentage de bien classé pour le modèle GBM : {}".format(
    accuracy_score(y_test, y_pred_gbm)))
print("Pourcentage de bien classé pour le modèle SVM : {}".format(
    accuracy_score(y_test, y_pred_svm)))
print("Pourcentage de bien classé pour le modèle NN : {}".format(
    accuracy_score(y_test, y_pred_nn)))

Pourcentage de bien classé pour le modèle logit : 0.865
Pourcentage de bien classé pour le modèle RF : 0.8566666666666667
Pourcentage de bien classé pour le modèle GBM : 0.8583333333333333
Pourcentage de bien classé pour le modèle SVM : 0.865
Pourcentage de bien classé pour le modèle NN : 0.7916666666666666


In [25]:
print("Matrice de confusion pour le modèle logit :\n {}".format(
    confusion_matrix(y_test, y_pred_logit)))
print("Matrice de confusion pour le modèle RF :\n {}".format(
    confusion_matrix(y_test, y_pred_rf)))
print("Matrice de confusion pour le modèle GBM :\n {}".format(
    confusion_matrix(y_test, y_pred_gbm)))
print("Matrice de confusion pour le modèle SVM :\n {}".format(
    confusion_matrix(y_test, y_pred_svm)))
print("Matrice de confusion pour le modèle NN :\n {}".format(
    confusion_matrix(y_test, y_pred_nn)))

Matrice de confusion pour le modèle logit :
 [[519   0]
 [ 81   0]]
Matrice de confusion pour le modèle RF :
 [[513   6]
 [ 80   1]]
Matrice de confusion pour le modèle GBM :
 [[515   4]
 [ 81   0]]
Matrice de confusion pour le modèle SVM :
 [[519   0]
 [ 81   0]]
Matrice de confusion pour le modèle NN :
 [[469  50]
 [ 75   6]]
