# Scrips of Projects

## 1.Script Preparación de Datos de Entrenamiento y Validación

In [82]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os

In [83]:
# Leemos la tabla de entrenamiento
df = pd.read_csv("./data/raw/churn_dataset.csv").set_index('id')

In [84]:
# Codificamos los valores categoricos
label_encoder = LabelEncoder()
df['Geography'] =  label_encoder.fit_transform(df['Geography'])
#test['Geography'] =  label_encoder.fit_transform(test['Geography'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])
#test['Gender'] = label_encoder.fit_transform(test['Gender'])


In [85]:
# Eliminamos variable no relevante
df = df.drop('Surname',axis=1)
#test = test.drop('Surname',axis=1)
df.drop(['CustomerId'], axis=1, inplace=True)
#test.drop(['CustomerId'], axis=1, inplace=True)

In [86]:
# Transformamos para reducir el sesgo por los atípicos
columns=['CreditScore','Balance','EstimatedSalary','Age','NumOfProducts']
for col in columns:
    df[col]=winsorize(df[col],limits=[0.05,0.1],inclusive=(True,True))
    #test[col]=winsorize(test[col],limits=[0.05,0.1],inclusive=(True,True))

In [87]:
df.head(2)

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,668,0,1,33.0,3,0.0,2,1.0,0.0,176843.53,0
1,627,0,1,33.0,1,0.0,2,1.0,1.0,49503.5,0


In [88]:
# Sepramos la data en Train y Test
X = df.drop(['Exited'],axis=1)
y = df[['Exited']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [89]:
# Dataset de Train (80%) y Test (20%)
train=pd.concat([X_train,y_train],axis=1)
test=pd.concat([X_test,y_test],axis=1)

In [90]:
# Guardamos la data sólo las variables relevantes para el Modelo
train.to_csv("./data/processed/churn_train.csv")
test.to_csv("./data/processed/churn_test.csv")

## 2.Script de Entrenamiento

In [23]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle

In [24]:
# Cargar la tabla transformada
df = pd.read_csv("./data/processed/churn_train.csv").set_index('id')
X_train = df.drop(['Exited'],axis=1)
y_train = df[['Exited']]

In [25]:
# Balancemos los datos de Train
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [26]:
# Entrenamos el modelo con toda la muestra
lgb = LGBMClassifier(random_state=42)
lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 104061, number of negative: 104061
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007988 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1526
[LightGBM] [Info] Number of data points in the train set: 208122, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [27]:
# Guardamos el modelo entrenado para usarlo en produccion
filename = './models/best_model.pkl'
pickle.dump(lgb, open(filename, 'wb'))

## 3.Script de Validación

In [42]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import *
import matplotlib.pyplot as plt
import pickle

In [29]:
# Cargar la tabla transformada
df = pd.read_csv("./data/processed/churn_test.csv").set_index('id')

In [43]:
# Leemos el modelo entrenado!
filename = './models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [44]:
# Predecimos sobre el set de datos de implementacion con el modelo entrenado
y_pred_test=model.predict(df.drop(['Exited'],axis=1)) 

In [32]:
## Metricas de validación
def calc_metrics(y_test,y_pred_test):
    cm_test = confusion_matrix(y_test,y_pred_test)
    print("Matriz de confusion: ")
    print(cm_test)
    accuracy_test=accuracy_score(y_test,y_pred_test)
    print("Accuracy: ", accuracy_test)
    precision_test=precision_score(y_test,y_pred_test)
    print("Precision: ", precision_test)
    recall_test=recall_score(y_test,y_pred_test)
    print("Recall: ", recall_test)

In [49]:
calc_metrics(df['Exited'],y_pred_test)

Matriz de confusion: 
[[24227  1825]
 [ 3095  3860]]
Accuracy:  0.8509407095464598
Precision:  0.6789797713280563
Recall:  0.5549964054636952


In [46]:
#plot_confusion_matrix(model, X_test, y_test)
#save_plot('Confusion Matrix')

In [48]:
#y_prob = model.predict_proba(X_test)
#kds.metrics.report(df['Exited'], y_prob[:,1],plot_style='ggplot')

## 4.Script Preparación de Datos del Score

In [50]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import LabelEncoder
import os

In [51]:
# Leemos la tabla de score
score = pd.read_csv("./data/raw/churn_score.csv").set_index('id')

In [52]:
# Codificamos los valores categoricos
label_encoder = LabelEncoder()
score['Geography'] =  label_encoder.fit_transform(score['Geography'])
score['Gender'] = label_encoder.fit_transform(score['Gender'])

In [53]:
# Eliminamos variable no relevante
score = score.drop('Surname',axis=1)
score.drop(['CustomerId'], axis=1, inplace=True)

In [54]:
# Transformamos para reducir el sesgo por los atípicos
columns=['CreditScore','Balance','EstimatedSalary','Age','NumOfProducts']
for col in columns:
    score[col]=winsorize(score[col],limits=[0.05,0.1],inclusive=(True,True))

In [55]:
# Mantener sólo las variables relevantes para el Modelo
score.to_csv("./data/processed/churn_score.csv")

## 5.Script de Scoring

In [60]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import *
import matplotlib.pyplot as plt
import pickle

In [65]:
# Cargar la tabla transformada
df = pd.read_csv("./data/processed/churn_score.csv").set_index('id')

In [66]:
# Leemos el modelo entrenado!
filename = './models/best_model.pkl'
model = pickle.load(open(filename, 'rb'))

In [68]:
# Predecimos sobre el set de datos de implementacion con el modelo entrenado
scores=model.predict(df).reshape(-1,1)

In [81]:
# Exportamos el resultado del modelo para cargarlo en el Feature Store o Data Mart de Modelos
# Le asignamos nombres a las columnas
df_score = pd.DataFrame(scores, columns=['PREDICT'])
# Exportamos la solucion
df_score.to_csv('./data/scores/final_score.csv')