In [None]:
import missingno as msno
from funciones.preprocessing import *
from funciones.evaluation import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
import warnings

# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Use fully qualified option names: the short forms 'max_rows' / 'max_columns'
# also match the 'styler.render.max_*' options in newer pandas releases and
# raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# Render floats with two decimals in displayed frames.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# NOTE(review): the CSV path is empty here — fill in the dataset location
# before running this cell.
data = pd.read_csv('')
# Preview the first and last three rows. DataFrame.append was removed in
# pandas 2.0, so the two slices are stacked with pd.concat instead.
pd.concat([data.head(3), data.tail(3)])

In [None]:
# Summary statistics with pandas' default column selection (numeric columns
# when the frame mixes dtypes).
data.describe()

In [None]:
# Summary statistics for every column that is not numeric.
data.describe(exclude='number')

In [None]:
# missingno bar chart to inspect how complete each column of `data` is.
msno.bar(data)

## Construcción del pipeline

In [None]:
# Stage applied to the whole frame before branching: the project-defined
# ColumnsSelector (default arguments) followed by ConvertDtypes.
global_transformations = Pipeline(steps=[
    ('selector', ColumnsSelector()),
    ('convert_dtypes', ConvertDtypes()),
])

# Numerical branch: select with variables='numerical', standardize with
# StandardScaler, then pass the result through GetDataFrame.
numerical_transformations = Pipeline(steps=[
    ('selector', ColumnsSelector(variables='numerical')),
    ('scaler', StandardScaler()),
    ('dataframe', GetDataFrame(variables='numerical')),
])

# Categorical branch: select with variables='categorical'.
# NOTE(review): the step is named 'get_dummies' but instantiates GetDataFrame —
# confirm whether a dummy-encoding transformer was intended here.
categorical_transformations = Pipeline(steps=[
    ('selector', ColumnsSelector(variables='categorical')),
    ('get_dummies', GetDataFrame()),
])

# Full preprocessing: run the global stage, apply the numerical and
# categorical branches side by side with FeatureUnion, then pass the combined
# output through GetDataFrame.
# Each step must be a (name, estimator) pair; the 'dataframe' step previously
# sat inside the 'features' tuple because of a misplaced parenthesis.
preprocessing = Pipeline([('general', global_transformations),
                          ('features', FeatureUnion([
                              ('numerical', numerical_transformations),
                              ('categorical', categorical_transformations)
                          ])),
                          ('dataframe', GetDataFrame())])

### Construcción de los modelos

In [None]:
# Detach the target column 'var_rpta' from `data` (pop removes it in place,
# so re-running this cell without reloading `data` raises KeyError).
label = data.pop('var_rpta')

# What remains in `data` is the feature matrix; the popped column is the target.
X, y = data, label

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report the size of each partition (rows and columns).
print(f'El conjunto de datos de entrenamiento, tiene {X_train.shape[0]} observaciones, y {X_train.shape[1]} variables. \n')
print(f'El conjunto de datos de prueba, tiene {X_test.shape[0]} observaciones, y {X_test.shape[1]} variables.')

#### Regresión Logística

In [None]:
# Logistic regression (LogisticRegressionCV with cv=5) on top of the shared
# preprocessing pipeline.
lr = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('estimator', LogisticRegressionCV(cv=5, random_state=42)),
])
# Fit on the training partition, then predict the held-out rows.
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Persist the fitted pipeline (preprocessing + estimator) to disk.
save_model(lr, filename='../modelos/logistic_regression.pkl')

# Evaluation report of the test-set predictions.
generate_report(y_test, y_pred)

In [None]:
# Evaluate the most recent `y_pred` against the test labels.
# NOTE(review): metrics_summary and confusion_matrix presumably come from the
# `funciones.evaluation` star import — confirm.
metrics_summary(y_test, y_pred)
confusion_matrix(y_test, y_pred)

#### Random Forest

In [None]:
# Random forest (150 trees, depth <= 5) on top of the shared preprocessing
# pipeline, fitted on the training partition.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# for classifiers it meant 'sqrt', so 'sqrt' keeps the same behaviour.
rf = Pipeline([('preprocessing', preprocessing),
               ('estimator', RandomForestClassifier(n_estimators=150, max_depth=5,
                                                    max_features='sqrt', bootstrap=True,
                                                    oob_score=True, random_state=42))]).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Persist the fitted pipeline (preprocessing + estimator) to disk.
save_model(rf, filename='../modelos/random_forest.pkl')

# Evaluation report of the test-set predictions.
generate_report(y_test, y_pred)

In [None]:
# Evaluate the most recent `y_pred` against the test labels.
# NOTE(review): metrics_summary and confusion_matrix presumably come from the
# `funciones.evaluation` star import — confirm.
metrics_summary(y_test, y_pred)
confusion_matrix(y_test, y_pred)

#### Gradient Boosting

In [None]:
# Gradient boosting (150 stages, depth <= 5, 70% row subsampling) on top of the
# shared preprocessing pipeline.
# The pipeline must be fitted before predicting or saving; the original cell
# omitted the .fit call, so predict() failed on an unfitted model.
gb = Pipeline([('preprocessing', preprocessing),
               ('estimator', GradientBoostingClassifier(learning_rate=0.09, n_estimators=150,
                                                        subsample=0.7, max_depth=5, random_state=42))]).fit(X_train, y_train)
y_pred = gb.predict(X_test)

# Persist the fitted pipeline (preprocessing + estimator) to disk.
save_model(gb, filename='../modelos/gradient_boosting.pkl')

# Evaluation report of the test-set predictions.
generate_report(y_test, y_pred)

In [None]:
# Evaluate the most recent `y_pred` against the test labels.
# NOTE(review): metrics_summary and confusion_matrix presumably come from the
# `funciones.evaluation` star import — confirm.
metrics_summary(y_test, y_pred)
confusion_matrix(y_test, y_pred)
