In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload imported modules before each cell
# runs, so edits to the project's src.* modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [59]:
import os
import re
import operator
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from collections import defaultdict
from dotenv import find_dotenv, load_dotenv
from category_encoders.target_encoder import TargetEncoder

from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectFromModel, GenericUnivariateSelect

from src.models.train_model import train_xgb_cv
from src.data.clean_dataset import feats_n_unique
from src.data.clean_dataset import remove_new_values
from src.features.preprocess.transform import log_transform
from src.features.pruning.interaction import model_select_features
from src.features.pruning.interaction import get_correlated_features

# Project root, expressed relative to the directory this notebook runs from.
PROJECT_PATH = "../../"

In [3]:
# Name of the current working directory (last path component only); used
# below to pick the matching sub-folder under data/processed.
FOLDER_NAME = os.path.split(os.getcwd())[1]

In [4]:
# <project root>/data/processed/<name of this notebook's folder>
processed_subdirs = ("data", "processed", FOLDER_NAME)
FOLDER_PATH = os.path.join(PROJECT_PATH, *processed_subdirs)

In [12]:
# List the files in the processed-data folder via the shell.
# NOTE(review): the saved listing below shows only the *_train.csv file, while
# the next code cell also reads a *_test.csv from this folder — the saved
# output may be stale; confirm.
!ls {FOLDER_PATH}

20190730130326993039_baseline_01_train.csv


## Starting from where the baseline ended

In [60]:
# Load the train/test CSVs stored in the processed-data folder.
train_csv = os.path.join(FOLDER_PATH, "20190730130326993039_baseline_01_train.csv")
test_csv = os.path.join(FOLDER_PATH, "20190730130326993039_baseline_01_test.csv")

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [61]:
# Separate the label column from the feature matrix.
train_y = train['TARGET']
train_x = train.drop(columns=['TARGET'])

### Get Feature Importances

In [84]:
# Hold out 30% of the rows for validation.
# The split is seeded so that re-running this cell yields the same rows:
# every derived object below (fitted forest, encoded frames, AUC values) must
# come from the same split as y_train / y_val, otherwise rows and labels
# silently fall out of alignment when cells are re-executed out of order.
x_train, x_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=0.3, random_state=2
)

In [90]:
# Random forest used for the importance ranking and AUC checks below:
# 60 trees, out-of-bag scoring enabled, all cores, fixed seed.
rf = RandomForestClassifier(
    n_estimators=60,
    random_state=2,
    oob_score=True,
    n_jobs=-1,
)

In [85]:
# Fit the forest on the training split of the original (un-encoded) features.
# NOTE(review): the saved output below reports n_estimators=300 while the
# constructor cell above uses n_estimators=60, so this output appears to come
# from an earlier version of that cell — confirm and re-run.
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=True, random_state=2, verbose=0,
                       warm_start=False)

In [86]:
# Validation ROC AUC, scored on the probability column at index 1
# (presumably the positive TARGET class — verify against rf.classes_).
val_scores = rf.predict_proba(x_val)[:, 1]
roc_auc_score(y_val, val_scores)

0.7914517377754657

In [91]:
# Refit the forest on the target-encoded training features.
# NOTE(review): train_encoded is assigned in the "Target Encode" cell further
# down this notebook, so on a fresh top-to-bottom run this line raises
# NameError. The execution counters also show the encoding cell ran at 77 and
# the train/validation split at 84, before this fit at 91 — train_encoded may
# therefore come from a different random split than y_train, which would
# misalign rows and labels and likely explains the ~0.49 validation AUC
# recorded below. Confirm and re-run in order.
rf.fit(train_encoded, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=-1,
                       oob_score=True, random_state=2, verbose=0,
                       warm_start=False)

In [92]:
# ROC AUC on the encoded training rows. These are the same rows passed to
# rf.fit in the cell above, so this measures fit rather than generalization.
train_encoded_scores = rf.predict_proba(train_encoded)[:, 1]
roc_auc_score(y_train, train_encoded_scores)

1.0

In [93]:
# ROC AUC on the encoded validation rows.
# NOTE(review): the recorded value below (~0.487) is below chance, whereas the
# forest on un-encoded features scored ~0.79 — worth checking that
# val_encoded / train_encoded were built from the same split as y_val / y_train.
val_encoded_scores = rf.predict_proba(val_encoded)[:, 1]
roc_auc_score(y_val, val_encoded_scores)

0.4867978578181231

In [82]:
# ROC AUC on the un-encoded validation rows.
# NOTE(review): in file order rf was last fit on train_encoded, yet this scores
# the raw x_val; the execution counter (82) precedes the current split (84), so
# the recorded output presumably predates it — confirm whether this cell is
# still needed.
raw_val_scores = rf.predict_proba(x_val)[:, 1]
roc_auc_score(y_val, raw_val_scores)

0.7958301838410967

In [75]:
# Pair every feature column with the importance the fitted forest assigned it.
importance_pairs = list(zip(train_x.columns, rf.feature_importances_))
feat_imp = pd.DataFrame(importance_pairs, columns=["feature", "importance"])

### Feature importance vs. number of unique values

In [76]:
# Number of distinct values per feature, as a two-column frame.
feat_unique_counts = (
    train_x.nunique()
    .rename_axis('feature')
    .reset_index(name='nunique')
)

# Attach the unique-value count to each feature's importance.
feats_imp_unique = feat_imp.merge(feat_unique_counts, on="feature")

# Show the most important low-cardinality features: fewer than 30 distinct
# values and importance above 0.001, top 30 by importance (then by count).
low_cardinality = feats_imp_unique['nunique'] < 30
informative = feats_imp_unique['importance'] > 0.001
(
    feats_imp_unique.loc[low_cardinality & informative]
    .sort_values(['importance', 'nunique'], ascending=False)
    .head(n=30)
)

Unnamed: 0,feature,importance,nunique
276,num_var22_hace3,0.009146,19
275,num_var22_hace2,0.008523,22
194,var36,0.00667,5
290,num_meses_var39_vig_ult3,0.00601,4
277,num_var22_ult1,0.005253,18
281,num_meses_var5_ult3,0.004968,4
279,num_med_var22_ult3,0.004408,15
148,num_var35,0.003685,13
89,num_var4,0.003036,8
139,num_var30,0.002407,9


In [94]:
# Features to treat as categorical and target-encode; in the table output
# above these are the rows with the fewest distinct values (5, 4 and 4).
categorical_feats = [
    'var36',
    'num_meses_var5_ult3',
    'num_meses_var39_vig_ult3',
]

### Target Encode

In [77]:
# Target-encode the selected categorical features.
# The encoder is fitted on the training split only: fitting on the full
# train_x / train_y would fold the validation rows' targets into the category
# statistics and leak label information into val_encoded, inflating any
# validation score computed from it.
enc = TargetEncoder(cols=categorical_feats)
enc.fit(x_train, y_train)

# Apply the fitted mapping to both splits; the validation split is transformed
# without its target.
train_encoded = enc.transform(x_train, y_train)
val_encoded = enc.transform(x_val)

In [None]:
# Fit a fresh forest on the target-encoded training split and report its
# validation score together with the size of the fitted trees.
#
# The previous version of this cell referenced RandomForestRegressor, X_train,
# X_test, y_test, rfnnodes and rfmaxdepths, none of which are imported or
# assigned in the cells above, so it could not run. It now uses the names this
# notebook defines (train_encoded / val_encoded / y_train / y_val), the
# already-imported RandomForestClassifier, and ROC AUC — the metric used for
# every other model check in this notebook.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2)
rf.fit(train_encoded, y_train)
s_tenc_validation = roc_auc_score(y_val, rf.predict_proba(val_encoded)[:, 1])

# Tree statistics read from each fitted estimator's underlying tree structure.
n_tree_nodes = sum(est.tree_.node_count for est in rf.estimators_)
median_tree_height = np.median([est.tree_.max_depth for est in rf.estimators_])

print(f"{s_tenc_validation:.4f} score {n_tree_nodes:,d} tree nodes and {median_tree_height} median tree height")