# Scripts del Proyecto

### Script 1: Preparación de datos para el entrenamiento

In [2]:
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [28]:
# Load the raw dataset that the training table is built from
raw_data_path = "../data/raw/data.csv"
df = pd.read_csv(raw_data_path)

In [29]:
# Drop the first two columns, which hold data that is not needed
df = df.drop(columns=df.columns[[0, 1]])

# Columns holding categorical values may arrive as 'object' or 'int64';
# they are selected by position (after the drop above)
categorical_indexes = [0, 1, 3, 4] + list(range(6, 20))
categorical_names = df.columns[categorical_indexes]
# Cast with a per-column astype mapping. Assigning through
# `df.iloc[:, idx] = ...` writes the values in place on recent pandas
# versions and keeps the previous dtype, so the columns would never
# become 'category' and every later dtype-based split would be wrong.
df = df.astype({name: 'category' for name in categorical_names})

# According to the original note, missing values are below 1% of the data,
# so the affected rows are dropped (median imputation was the alternative)
df = df.dropna()

In [30]:
# Split the columns into categorical and numerical groups based on dtype
is_categorical = {name: df[name].dtype.name == 'category' for name in df.columns}
categorical_columns = [name for name, flag in is_categorical.items() if flag]
numerical_columns = [name for name, flag in is_categorical.items() if not flag]
# 'satisfaction' is concatenated separately later on, so it is taken out
# of the numerical group
numerical_columns.remove('satisfaction')
df_describe = df.describe(include=['category'])

# Split the categorical columns into binary (exactly two unique values as
# reported by describe) and non-binary (more than two)
unique_counts = df_describe.loc['unique']
binary_columns = [name for name in categorical_columns if unique_counts[name] == 2]
nonbinary_columns = [name for name in categorical_columns if unique_counts[name] > 2]

In [31]:
# Encoding: replace every binary column by its integer category code,
# keeping an untouched copy of the original values for the lookup table
original_df = df[binary_columns].copy()
binary_codes = original_df.astype('category').apply(lambda column: column.cat.codes)
df[binary_columns] = binary_codes.astype('category')

# Equivalence table: pair each original value with its code. Both unique()
# calls walk the same rows in order of first appearance, so they line up.
list_transformation = []
for col in binary_columns:
    original_values = original_df[col].unique()
    encoded_values = df[col].unique()
    for val, code in zip(original_values, encoded_values):
        list_transformation.append((col, val, code))

df_transform = pd.DataFrame(
    list_transformation,
    columns=["Nombre columna", "Valor Original", "Categoria"],
)

In [32]:
# One-hot encode the non-binary categorical columns
df_nonbinary = pd.get_dummies(df[nonbinary_columns])

# Standardize the numerical columns: subtract the column mean and divide
# by the column standard deviation
# NOTE(review): these statistics are computed on the whole table, before
# the train/validation split done in the training script, and they are not
# saved for reuse at scoring time - confirm this is intended.
df_numerical = df[numerical_columns]
column_means = df_numerical.mean(axis=0)
column_stds = df_numerical.std(axis=0)
df_numerical = (df_numerical - column_means) / column_stds

# Final table: scaled numericals, dummies, encoded binaries and the target
feature_parts = [df_numerical, df_nonbinary, df[binary_columns], df['satisfaction']]
df_final = pd.concat(feature_parts, axis=1)

In [40]:
# Store the ordered column names of the final table; the scoring
# preparation step reads this file to rebuild the same layout
feature_nonbinary_columns = list(df_final.columns)
df_columns = pd.DataFrame({"Columns": feature_nonbinary_columns})
df_columns.to_csv("../data/processed/columns_train.csv", index=False)

In [41]:
# Write the processed training table without the row index
train_output_path = "../data/processed/data_train.csv"
df_final.to_csv(train_output_path, index=False)

### Script 2: Código de Entrenamiento

In [1]:
import numpy as np 
import pandas as pd
import pickle
import warnings

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

warnings.filterwarnings("ignore")

In [2]:
# Load the processed training table produced by the preparation script
train_data_path = "../data/processed/data_train.csv"
df = pd.read_csv(train_data_path)

In [3]:
# Separate the target column from the feature matrix
target_column = 'satisfaction'
y = df[target_column]
X = df.drop(columns=[target_column])
# Number of rows and number of feature columns
N, d = X.shape

In [5]:
# Ordered list of the feature column names
features = list(X.columns)

In [8]:
# Stratified split with fixed sizes (36000 train / 4000 test) and a fixed seed
split_parts = train_test_split(
    X,
    y,
    train_size=36000,
    test_size=4000,
    stratify=y,
    random_state=2023,
)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Persist the held-out features and target; the validation script reads both
X_test.to_csv("../data/processed/features_validate.csv", index=False)
y_test.to_csv("../data/processed/target_validate.csv", index=False)

#### Modelo de Neural Network

In [10]:
# Hyperparameters used for the final model: regularization strength (alpha)
# and the sizes of the two hidden layers
alpha_local_opt = 5.623413251903491
hidden_layer_local_opt_1, hidden_layer_local_opt_2 = 40, 21

In [11]:
# Build the network with the chosen hyperparameters and fit it on the
# training split
mlp_params = {
    'alpha': alpha_local_opt,
    'hidden_layer_sizes': (hidden_layer_local_opt_1, hidden_layer_local_opt_2),
    'solver': 'lbfgs',
    'max_iter': 1000,
    'activation': 'logistic',
    'random_state': 42,
}
mlp_model = MLPClassifier(**mlp_params)

mlp_model.fit(X_train, y_train)

In [12]:
# Save the fitted model to disk
filename = '../models/best_model.pkl'
# The context manager flushes and closes the file even if pickling fails;
# the previous bare open() call left the handle open
with open(filename, 'wb') as model_file:
    pickle.dump(mlp_model, model_file)

### Script 4: Código de Validación

In [8]:
import numpy as np
import pandas as pd
import pickle
import warnings

from sklearn.metrics import roc_auc_score, \
                            accuracy_score, \
                            f1_score, \
                            precision_score, \
                            recall_score, \
                            classification_report, \
                            confusion_matrix

warnings.filterwarnings("ignore")

In [9]:
# Load the held-out features and target written by the training script
X_test = pd.read_csv("../data/processed/features_validate.csv")
y_test = pd.read_csv("../data/processed/target_validate.csv")
# Remember the feature names before any result column is added to X_test
features = list(X_test.columns)

In [11]:
# Load the trained model from disk
filename = '../models/best_model.pkl'
# NOTE: pickle.load executes code embedded in the file; only load model
# files produced by this project.
# The context manager closes the file handle, which the previous bare
# open() call left open.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [12]:
# Use the model to predict, feeding it only the original feature columns
feature_matrix = X_test[features]

# Column 1 of predict_proba is the probability of the second class
class_one_probability = model.predict_proba(feature_matrix)[:, 1]
X_test['probability'] = class_one_probability
X_test['prediction'] = model.predict(feature_matrix)

In [13]:
# Summary of all the model metrics on the validation data

# The target was loaded with read_csv as a single-column frame; flatten it
# to 1-D so the metric functions receive the shape they expect
y_true = np.ravel(y_test)

# Compute the AUC once and derive Gini from it (Gini = 2 * AUC - 1)
auc_test = roc_auc_score(y_true, X_test.probability)

metricsRfc = pd.DataFrame({
    'metric': ['AUC', 'Gini', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
    'nn_test': [
        auc_test,
        auc_test * 2 - 1,
        accuracy_score(y_true, X_test.prediction),
        precision_score(y_true, X_test.prediction, pos_label='satisfied'),
        recall_score(y_true, X_test.prediction, pos_label='satisfied'),
        f1_score(y_true, X_test.prediction, pos_label='satisfied'),
    ],
})

### Script 5: Preparación de Datos de Score (Automatización)

In [1]:
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw data to be scored and the column layout saved at training time
score_data_path = "../data/raw/data_score.csv"
train_columns_path = "../data/processed/columns_train.csv"
df = pd.read_csv(score_data_path)
columns_all = pd.read_csv(train_columns_path)

In [4]:
# Drop the first two columns, which hold data that is not needed
df = df.drop(columns=df.columns[[0, 1]])

# Columns holding categorical values may arrive as 'object' or 'int64';
# they are selected by position (after the drop above)
categorical_indexes = [0, 1, 3, 4] + list(range(6, 20))
categorical_names = df.columns[categorical_indexes]
# Cast with a per-column astype mapping. Assigning through
# `df.iloc[:, idx] = ...` writes the values in place on recent pandas
# versions and keeps the previous dtype, so the columns would never
# become 'category' and every later dtype-based split would be wrong.
df = df.astype({name: 'category' for name in categorical_names})

# According to the original note, missing values are below 1% of the data,
# so the affected rows are dropped (median imputation was the alternative)
df = df.dropna()

In [5]:
# Split the columns into categorical and numerical groups based on dtype
is_categorical = {name: df[name].dtype.name == 'category' for name in df.columns}
categorical_columns = [name for name, flag in is_categorical.items() if flag]
numerical_columns = [name for name, flag in is_categorical.items() if not flag]
# 'satisfaction' is concatenated separately later on, so it is taken out
# of the numerical group (this raises ValueError if the column is absent)
numerical_columns.remove('satisfaction')
df_describe = df.describe(include=['category'])

# Split the categorical columns into binary (exactly two unique values as
# reported by describe) and non-binary (more than two)
unique_counts = df_describe.loc['unique']
binary_columns = [name for name in categorical_columns if unique_counts[name] == 2]
nonbinary_columns = [name for name in categorical_columns if unique_counts[name] > 2]

In [6]:
# Encoding: replace every binary column by its integer category code,
# keeping an untouched copy of the original values
original_df = df[binary_columns].copy()
binary_codes = original_df.astype('category').apply(lambda column: column.cat.codes)
df[binary_columns] = binary_codes.astype('category')

In [7]:
# Select the numerical columns and one-hot encode the non-binary ones
# (the following cell computes both again before standardizing)
df_numerical = df[numerical_columns]
df_nonbinary = pd.get_dummies(df[nonbinary_columns])

In [8]:
# One-hot encode the non-binary categorical columns
df_nonbinary = pd.get_dummies(df[nonbinary_columns])

# Standardize the numerical columns: subtract the column mean and divide
# by the column standard deviation
# NOTE(review): the mean and standard deviation come from the scoring data
# itself, not from the training data the model was fitted on - confirm
# this is intended, since it can shift the inputs seen by the model.
df_numerical = df[numerical_columns]
column_means = df_numerical.mean(axis=0)
column_stds = df_numerical.std(axis=0)
df_numerical = (df_numerical - column_means) / column_stds

# Final table: scaled numericals, dummies, encoded binaries and the target
feature_parts = [df_numerical, df_nonbinary, df[binary_columns], df['satisfaction']]
df_final = pd.concat(feature_parts, axis=1)

In [9]:
# Align the scoring table with the column layout saved at training time
columns = columns_all["Columns"].to_list()
columns_now = df_final.columns.to_list()

# Training columns that are absent here (for example a dummy for a category
# that does not occur in this data) are added as all-zero columns
already_present = set(columns_now)
for column in columns:
    if column not in already_present:
        df_final[column] = 0

# Keep exactly the training columns, in training order, then remove the target
df_final = df_final[columns]
df_final = df_final.drop(columns=['satisfaction'])

In [10]:
# Write the processed scoring table without the row index
score_output_path = "../data/processed/data_score.csv"
df_final.to_csv(score_output_path, index=False)

### Script 6: Código de Scoring (Automatización)

In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Load the processed scoring table and the column layout saved at training time
processed_score_path = "../data/processed/data_score.csv"
train_columns_path = "../data/processed/columns_train.csv"
df = pd.read_csv(processed_score_path)
columns_all = pd.read_csv(train_columns_path)

In [5]:
# Load the trained model from disk
filename = '../models/best_model.pkl'
# NOTE: pickle.load executes code embedded in the file; only load model
# files produced by this project.
# The context manager closes the file handle, which the previous bare
# open() call left open.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [6]:
# Predict one label per row and reshape the result into a single column
predictions = model.predict(df)
scores = predictions.reshape(-1, 1)

In [7]:
# Export the model output so it can be loaded into the Feature Store or the
# model Data Mart
# Give the single prediction column a name
score_columns = ['PREDICT']
df_score = pd.DataFrame(data=scores, columns=score_columns)
# Export the result (to_csv also writes the row index by default)
df_score.to_csv('../data/scores/final_score.csv')