In [164]:
### Project MAP569 by Romain ALBRAND and Clément BEAULIEU ###

In [165]:
### Import packages

import numpy as np
import math
import pandas as pd
from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split


In [166]:
### Read the CSV file and load the data into a pandas DataFrame

# feature_names = ['Id_Customer', 'Y', 'Customer_Type', 'BirthDate', 'Customer_Open_Date', 'P_Client', 'Educational_Level', 'Marital_Status', 'Number_Of_Dependant', 'Years_At_Residence', 'Net_Annual_Income', 'Years_At_Business', 'Prod_Sub_Category', 'Prod_Decision_Date', 'Source', 'Type_Of_Residence', 'Nb_Of_Products', 'Prod_Closed_Date', 'Prod_Category']

# Location of the training set; the first row of the file holds the column names.
# NOTE(review): this is an absolute path on one machine - the notebook will not run elsewhere
# unless the path is adapted.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a saved row
# index being read as a regular column - confirm whether it should be kept as a feature.
train_csv_path = '/Users/clementbeaulieu/Desktop/MAP569/projectMAP569/CreditTraining.csv'
dataframe_train = pd.read_csv(train_csv_path, header=0)
dataframe_train.head()


Unnamed: 0,Id_Customer,Y,Customer_Type,BirthDate,Customer_Open_Date,P_Client,Educational_Level,Marital_Status,Number_Of_Dependant,Years_At_Residence,Net_Annual_Income,Years_At_Business,Prod_Sub_Category,Prod_Decision_Date,Source,Type_Of_Residence,Nb_Of_Products,Prod_Closed_Date,Prod_Category
0,7440,0,Non Existing Client,07/08/1977,13/02/2012,NP_Client,University,Married,3.0,1,36,1.0,C,14/02/2012,Sales,Owned,1,,B
1,573,0,Existing Client,13/06/1974,04/02/2009,P_Client,University,Married,0.0,12,18,2.0,C,30/06/2011,Sales,Parents,1,,G
2,9194,0,Non Existing Client,07/11/1973,03/04/2012,NP_Client,University,Married,2.0,10,36,1.0,C,04/04/2012,Sales,Owned,1,,B
3,3016,1,Existing Client,08/07/1982,25/08/2011,NP_Client,University,Married,3.0,3,36,1.0,C,07/09/2011,Sales,New rent,1,31/12/2012,L
4,6524,0,Non Existing Client,18/08/1953,10/01/2012,NP_Client,University,Married,2.0,1,36,1.0,C,11/01/2012,Sales,Owned,1,,D


In [167]:
########## Convert dates into numerical features ###########

### Convert date columns into datetime format

# The raw file stores every date as DD/MM/YYYY (e.g. 13/02/2012, 30/06/2011, 31/12/2012).
# Without an explicit format pandas has to guess the layout: ambiguous values such as
# 07/08/1977 or 03/04/2012 can then be read month-first (or the conversion can fail outright,
# depending on the pandas version), which corrupts the month / day-of-year features built below.
# Passing the format keeps day and month in the right place; missing values become NaT.
for date_column in ['BirthDate', 'Customer_Open_Date', 'Prod_Decision_Date', 'Prod_Closed_Date']:
    dataframe_train[date_column] = pd.to_datetime(dataframe_train[date_column], format='%d/%m/%Y')

# Missing dates (Prod_Closed_Date is the column expected to have them) are stored as the
# sentinel date 1900-01-01 for now; handle_year() later maps the sentinel year 1900 to 0.
# The sentinel is built directly as a Timestamp: parsing a constant string does not need
# errors='ignore', which is deprecated in recent pandas releases.
null_date = pd.Timestamp(year=1900, month=1, day=1)

# Restricting the fill to the datetime columns avoids touching missing values in any other
# column of the frame.
datetime_columns = dataframe_train.select_dtypes(include='datetime').columns
dataframe_train[datetime_columns] = dataframe_train[datetime_columns].fillna(null_date)


### Turn dates into numerical features: either reduce the date to its year, or split it into several features (year, month and day-of-year index).

# BirthDate only keeps its year.
dataframe_train['BirthDate'] = dataframe_train['BirthDate'].dt.year

# Every other date column is expanded into three numerical features named
# '<column>_year', '<column>_month' and '<column>_dayofyear'.
for date_column in ('Customer_Open_Date', 'Prod_Decision_Date', 'Prod_Closed_Date'):
    for date_part in ('year', 'month', 'dayofyear'):
        dataframe_train[date_column + '_' + date_part] = getattr(dataframe_train[date_column].dt, date_part)
    # The original Customer_Open_Date / Prod_Decision_Date columns are no longer needed once
    # split. Prod_Closed_Date is left in place here; it is removed after its sentinel values
    # have been cleaned up.
    if date_column != 'Prod_Closed_Date':
        del dataframe_train[date_column]

def handle_year(i):
    """Map the sentinel year 1900 to 0 and return any other year unchanged.

    Parameters:
        i: year value to clean.

    Returns:
        np.int64(0) when ``i`` equals 1900, otherwise ``i`` itself.
    """
    return np.int64(0) if i == np.int64(1900) else i
    
def handle_month(year, month):
    """Return ``month`` for a strictly positive year, and 0 otherwise.

    Parameters:
        year: year value used to decide whether the month is kept.
        month: month value returned when ``year`` is strictly positive.

    Returns:
        ``month`` when ``year > 0``, otherwise np.int64(0).
    """
    has_positive_year = year > np.int64(0)
    return month if has_positive_year else np.int64(0)
    
def handle_dayofyear(year, dayofyear):
    """Return ``dayofyear`` for a strictly positive year, and 0 otherwise.

    Parameters:
        year: year value used to decide whether the day index is kept.
        dayofyear: day-of-year value returned when ``year`` is strictly positive.

    Returns:
        ``dayofyear`` when ``year > 0``, otherwise np.int64(0).
    """
    if year > np.int64(0):
        return dayofyear
    return np.int64(0)

# Rows whose closing year is the sentinel 1900 get 0 for year, month and day-of-year; all
# other rows keep their values.
# The year is cleaned first on purpose: the month / day-of-year helpers read the already
# cleaned year (0 for sentinel rows) to decide whether to keep or zero out their value.
dataframe_train['Prod_Closed_Date_year'] = dataframe_train['Prod_Closed_Date_year'].apply(handle_year)
dataframe_train['Prod_Closed_Date_month'] = dataframe_train.apply(
    lambda row: handle_month(row['Prod_Closed_Date_year'], row['Prod_Closed_Date_month']),
    axis=1,
)
dataframe_train['Prod_Closed_Date_dayofyear'] = dataframe_train.apply(
    lambda row: handle_dayofyear(row['Prod_Closed_Date_year'], row['Prod_Closed_Date_dayofyear']),
    axis=1,
)

# The raw datetime column is dropped now that its three numerical features are final.
dataframe_train.drop(columns=['Prod_Closed_Date'], inplace=True)


In [168]:
#dataframe_train.dtypes
#dataframe_train

In [169]:
### Encode categorical features as integers: a feature with K classes is mapped to the codes 0 .. K-1.

cat_features = ['Customer_Type', 'P_Client', 'Educational_Level', 'Marital_Status', 'Prod_Sub_Category', 'Source', 'Type_Of_Residence', 'Prod_Category']

# One encoder per feature, kept in a dict so the integer codes can be mapped back to the
# original labels later (inverse_transform).
encoders = {name: LabelEncoder() for name in cat_features}

for feature_name, encoder in encoders.items():
    # Learn the classes of the column and replace its values by their integer codes.
    dataframe_train[feature_name] = encoder.fit_transform(dataframe_train[feature_name])

### Check that the encoders were created correctly.

# Prints encoders classes.
#for feature_name in cat_features: print(encoders[feature_name].classes_)

# Syntax to print inverse transform list of a given encoder.
#print(list(encoders['Customer_Type'].inverse_transform(dataframe_train['Customer_Type'])))

In [170]:
#dataframe_train.dtypes
#dataframe_train

In [171]:
### Defining models

