## Importing libs and setting plotting parameters

In [None]:
import pickle
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mp
import numpy as np
import pandas as pd

import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from utils.ml_functions import *
from utils.process_data import *

mp.style.use('ggplot')
mp.rcParams['font.family'] = "serif"
mp.rcParams['font.size'] = 20
sns.set(style="darkgrid", font_scale=1.5)

In [None]:
df_train = pd.read_parquet("../data/raw/train.parquet")
df_test = pd.read_parquet("../data/raw/test.parquet")

In [None]:
df_train.info()

In [None]:
df_train.head()

## Changing types of train data

## 0. Data Cleansing

### Following the directions given in the clarification email sent by topminds stone (available in the body text of the file `clarification_email.txt`), the predictions must be made on the 90th day after the loan. So, to avoid target leakage, all rows with information about the days after the 90th day are deleted.

In [None]:
def data_cleanse(df, day_column='dias_pos_desembolso', max_day=89):
    """Drop, in place, every row recorded after the prediction cut-off day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    day_column : str, default 'dias_pos_desembolso'
        Column whose value is compared with ``max_day``.
    max_day : int, default 89
        Largest value of ``day_column`` that is kept; rows with a strictly
        greater value are removed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without the late rows and with a fresh
        0..n-1 index.
    """
    # deleting rows in the raw data
    late_rows = df.index[df[day_column] > max_day]
    df.drop(index=late_rows, inplace=True)
    # The old labels are discarded so the index is contiguous again.
    df.reset_index(drop=True, inplace=True)
    return df

df_train = data_cleanse(df_train)
df_test = data_cleanse(df_test)  
    
# #deleting rows in the raw data
# df.drop(index=df.index[df['dias_pos_desembolso'] > 89], inplace=True)
# df.reset_index(drop=True, inplace=True)

# #deleting rows in dict_loans_timeseries_attributes
# for index, tmp_df_timeseries in dict_dfs_timeseries.items():
#     tmp_df_timeseries.drop(index=tmp_df_timeseries.index[tmp_df_timeseries['dias_pos_desembolso'] > 89], inplace=True)
#     tmp_df_timeseries.reset_index(drop=True, inplace=True)    

In [None]:
#setting proper types
# Target dtypes for the two columns that are cast on both the train and the test frame.
dtypes = {
    'id': int,
    'dias_pos_desembolso': int,
}

df_test = df_test.astype(dtypes) 

df_train = df_train.astype(dtypes) 
# 'y' is cast on the train frame only; this cell does not touch a 'y' column on df_test.
df_train['y'] = df_train['y'].astype(int)

In [None]:
# Extracting a dataframe with just the constant attributes
def from_sorted_df_get_constant_attributes(df, constant_attributes, id_column_name="id"):
    """Build a one-row-per-id frame holding the requested attributes.

    For every distinct value of ``id_column_name`` the LAST row carrying that id
    (in the row order of ``df``) supplies the values, including any missing values
    that row contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with several rows per id. It is not modified.
    constant_attributes : list of str
        Columns to copy into the result.
    id_column_name : str, default "id"
        Column identifying the entity each row belongs to.

    Returns
    -------
    pandas.DataFrame
        Indexed by the integer ids in ascending order, with the columns in
        ``constant_attributes``. Columns keep the dtypes they have in ``df``.
    """
    # keep="last" retains exactly one row per id: the final one in row order.
    last_rows = df.drop_duplicates(subset=id_column_name, keep="last")

    df_constant_attributes = last_rows[constant_attributes].copy()
    # The index is set from the id values directly so that the id column may
    # itself be listed in constant_attributes without being lost.
    df_constant_attributes.index = last_rows[id_column_name].to_numpy().astype(int)

    return df_constant_attributes.sort_index()


# Columns copied into the one-row-per-loan frames; 'y' is last so it can be sliced off below.
constant_attributes = ['desembolso', 'vencimento', 'valor_emprestado', 'pgto_diario_esperado', 'subsegmento', 'y']

df_train_constant = from_sorted_df_get_constant_attributes(df_train, constant_attributes)
# constant_attributes[:-1] leaves out the final entry ('y') for the test frame.
df_test_constant = from_sorted_df_get_constant_attributes(df_test, constant_attributes[:-1])

In [None]:
df_train_constant.head()

In [None]:
# From datetime to days
def from_date_to_days(df):
    """Replace the two date columns with the expected loan duration in days.

    ``desembolso`` and ``vencimento`` are parsed with the ``%Y-%m-%d`` format,
    their difference in whole days is stored in ``duracao_esperada``, and both
    date columns are then dropped. ``df`` is modified in place and also returned.
    """
    date_format = "%Y-%m-%d"
    for date_column in ("desembolso", "vencimento"):
        df[date_column] = pd.to_datetime(df[date_column], format=date_format)

    elapsed = df["vencimento"] - df["desembolso"]
    df["duracao_esperada"] = elapsed.dt.days

    df.drop(columns=["desembolso", "vencimento"], inplace=True)
    return df

df_train_constant = from_date_to_days(df_train_constant)
df_test_constant = from_date_to_days(df_test_constant)

In [None]:
df_train_constant.head()

In [None]:
# Columns removed from the daily frames. All of them except 'dia' are also listed in
# constant_attributes, i.e. they were already copied into the per-loan frames.
drop_columns = ['desembolso', 'vencimento', 'valor_emprestado', 'pgto_diario_esperado', 'dia']

# Both drops are in place, so this cell cannot be run twice on the same frames.
df_train.drop(columns = drop_columns, inplace=True)
df_test.drop(columns = drop_columns, inplace=True)

In [None]:
df_train.head()

## 1. Data Normalization

### The following Data Normalization steps use the time-series attributes (`divida_total`, `divida_principal`, `pagamento_diario`, `amortizacao_principal_diario`, `transacionado`). To make these values comparable across loans of different sizes, they are normalized by `valor_emprestado`.

In [None]:
def data_normalization(df, df_constant):
    """Divide every time-series column by the borrowed amount of the row's loan.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily frame with an ``id`` column and the five columns listed in
        ``timeseries_attributes``. It is NOT modified, so re-running the cell
        cannot normalise the same data twice.
    df_constant : pandas.DataFrame
        One row per loan, indexed by loan id, with a ``valor_emprestado`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose time-series columns are float and expressed as a
        fraction of ``valor_emprestado``. Rows whose id is absent from
        ``df_constant`` are left unchanged.
    """
    timeseries_attributes = ['divida_total', 'divida_principal', 'pagamento_diario', 'amortizacao_principal_diario', 'transacionado']

    df_normalized = df.copy()
    loan_values = pd.to_numeric(df_constant['valor_emprestado'])

    # Boolean row mask (positional), so the result does not depend on df's index labels.
    known_loans = df_normalized['id'].isin(loan_values.index).to_numpy()
    divisors = df_normalized.loc[known_loans, 'id'].map(loan_values).to_numpy(dtype=float)

    timeseries_values = df_normalized[timeseries_attributes].to_numpy(dtype=float)
    # One divisor per row, broadcast over the five time-series columns.
    timeseries_values[known_loans] /= divisors[:, None]

    df_normalized[timeseries_attributes] = timeseries_values

    return df_normalized
    
df_train_norm = data_normalization(df_train, df_train_constant)
df_test_norm = data_normalization(df_test, df_test_constant)

In [None]:
df_test_norm.head()

## 2. Feature Engineering

### 2.1 Creating Attributes

### Here I am creating 8 variables by aggregating the timeseries attributes into a single dataframe:
#### `pagamento_diario_total` is the summation of the daily registered payment
#### `amortizacao_diario_total` is the summation of the daily registered amortization
#### `transacao_diaria_total` is the summation of the daily registered transactions
#### `divida_total_menos_principal_area` is the approximated area of curve made by the data points of divida_total - divida_principal
#### `divida_total_variacao` is the initial loan value minus the value at the day of prediction
#### `angulo_esperado_decaimento_divida` is the angle of the line that fits the expected debt variation
#### `angulo_fittado_decaimento_divida` is the angle of the line that fits the main debt variation (counted from y axis)
#### `score_do_fit`  is the score of the fit of the line that fits the main debt variation (counted from y axis)

In [None]:
from sklearn.linear_model import LinearRegression

# Names of the per-loan features written by aggregate_timeseries. The order matters:
# that function assigns its computed values to these columns positionally.
new_attributes = ['pagamento_diario_total',
                  'amortizacao_diario_total',
                  'transacao_diaria_total',
                  'divida_total_menos_principal_area',
                  'divida_total_variacao',
                  'angulo_esperado_decaimento_divida',
                  'angulo_fittado_decaimento_divida',
                  'score_do_fit']

def aggregate_timeseries(df_total, df_constant):
    """Collapse each loan's daily time series into eight per-loan features.

    Parameters
    ----------
    df_total : pandas.DataFrame
        One row per loan per day, with an ``id`` column and the five columns listed
        in ``timeseries_attributes``. The rows of a loan are used in the order in
        which they appear. Not modified.
    df_constant : pandas.DataFrame
        One row per loan, indexed by loan id, with a ``duracao_esperada`` column.
        Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_constant`` extended with the float columns named in the
        module-level ``new_attributes`` list, filled in this order:
        sum of ``pagamento_diario``; sum of ``amortizacao_principal_diario``;
        sum of ``transacionado``; trapezoidal area under
        ``divida_total - divida_principal``; first minus last ``divida_total``;
        ``arctan(max(divida_principal) * 100 / duracao_esperada) + pi/2``;
        ``arctan(-slope) + pi/2`` where ``slope`` comes from a linear fit of
        ``divida_total * 100`` against the row number; and the R² of that fit.

    Raises
    ------
    ValueError
        If a loan of ``df_constant`` has no rows in ``df_total``.
    """
    timeseries_attributes = ['divida_total', 'divida_principal',
        'pagamento_diario', 'amortizacao_principal_diario', 'transacionado']
    # Column positions inside the per-loan value matrix (same order as the list above).
    TOTAL_DEBT, MAIN_DEBT, PAYMENT, AMORTIZATION, TRANSACTION = range(5)

    df_constant_new = df_constant.copy()

    timeseries_values = df_total[timeseries_attributes].to_numpy(dtype=float)
    # Row positions of every loan, computed once instead of masking the full array per loan.
    rows_by_loan = df_total.groupby('id', sort=False).indices

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever exists.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    linear_regression = LinearRegression()
    aggregated_rows = []

    for loan_index, expected_duration in df_constant_new['duracao_esperada'].items():
        loan_rows = rows_by_loan.get(loan_index)
        if loan_rows is None:
            raise ValueError(f"loan {loan_index!r} has no rows in the time-series frame")

        loan_timeseries = timeseries_values[loan_rows]
        total_debt = loan_timeseries[:, TOTAL_DEBT]
        main_debt = loan_timeseries[:, MAIN_DEBT]

        summed_daily_payment = loan_timeseries[:, PAYMENT].sum()
        summed_daily_amortization = loan_timeseries[:, AMORTIZATION].sum()
        summed_daily_transaction = loan_timeseries[:, TRANSACTION].sum()

        total_minus_main_debt_area = trapezoid(total_debt - main_debt)

        # if positive the debt decrease
        total_debt_variation = total_debt[0] - total_debt[-1]

        expected_slope_of_debt_payment = np.arctan(
            (main_debt.max() * 100) / expected_duration) + np.pi / 2

        y = total_debt * 100
        X = np.arange(y.size).reshape(-1, 1)
        reg = linear_regression.fit(X, y)

        # The drop of the fitted line per step is simply minus its slope. Written this
        # way it stays finite for a loan with a single row, where the earlier
        # "(start - end) / last_x" form evaluated 0 / 0.
        fitted_slope_of_debt_payment = np.arctan(-reg.coef_[0]) + np.pi / 2

        linear_regression_score = reg.score(X, y)

        # Same order as new_attributes.
        aggregated_rows.append([summed_daily_payment,
                                summed_daily_amortization,
                                summed_daily_transaction,
                                total_minus_main_debt_area,
                                total_debt_variation,
                                expected_slope_of_debt_payment,
                                fitted_slope_of_debt_payment,
                                linear_regression_score])

    # Whole float columns are written at once, rather than float values being pushed
    # row by row into integer-initialised columns.
    feature_matrix = np.asarray(aggregated_rows, dtype=float).reshape(-1, len(new_attributes))
    for position, attribute in enumerate(new_attributes):
        df_constant_new[attribute] = feature_matrix[:, position]

    return df_constant_new


df_train_new = aggregate_timeseries(df_train_norm, df_train_constant)
df_test_new = aggregate_timeseries(df_test_norm, df_test_constant)

In [None]:
df_train_new.describe()

In [None]:
# 4 x 2 grid of axes, flattened so that the i-th attribute is drawn on ax[i].
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(16,18))
ax = ax.flatten()

# Re-declares the same eight names, in the same order, as the list defined in the
# feature-engineering cell; the two lists have to be kept in sync by hand.
new_attributes = ['pagamento_diario_total',
                  'amortizacao_diario_total',
                  'transacao_diaria_total',
                  'divida_total_menos_principal_area',
                  'divida_total_variacao',
                  'angulo_esperado_decaimento_divida',
                  'angulo_fittado_decaimento_divida',
                  'score_do_fit']

# One histogram per engineered attribute of the train frame.
for i, attribute in enumerate(new_attributes):
    sns.histplot(ax=ax[i], data=df_train_new, x=attribute)

#### From the graphs above, some of the new attributes clearly have a boolean behaviour, others have a bin-type characteristic, and others are too skewed. They will be transformed accordingly.

In [None]:
df_train_new

### 2.2 Mapping specific subsegments into broader commercial sectors

In [None]:
def subsegments_to_segment(df):
    """Replace the ``subsegmento`` column with a broader categorical ``segmento`` column.

    Every distinct ``subsegmento`` value that is not listed in ``sectors`` is printed,
    so unmapped values are visible. Each row is then mapped through
    ``return_key_if_contains_value`` (defined in ``process_data.py``), missing results
    become ``'N/A'``, and ``subsegmento`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``subsegmento`` column. It is NOT modified, so the cell calling
        this function can be re-run on the same input.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``segmento`` (category dtype) in place of ``subsegmento``.
    """
    sectors = {
        'Alimentacao e Bebida': ['Comércio de Alimentos', 'Alimentação Rápida', 'Bares e Restaurantes', 'Comércio de Bebidas', 'Supermercados'],
        'Moda e Esports': ['Vestuário', 'Calçados', 'Artigos Esportivos', 'Acessórios, Bolsas e Bijuterias'],
        'Industria, Construcao e Veiculos': ['Materiais de Construção', 'Autopeças e Acessórios', 'Comércio de Veículos', 'Gás GLP, Lubrificantes e Outros', 'Reformas e Obras em Geral', 'Postos de Gasolina', 'Estacionamentos e Lava-rápidos', 'Equipamentos de Uso Comercial e Industrial', 'Locação de Veículos'],
        'Servicos': ['Oficinas Automotivas', 'Salão de Beleza', 'Conserto de Produtos e Reparos de Peças', 'Outros Serviços - Outros', 'Delivery e Entrega', 'Telecomunicações', 'Academias e Clubes', 'Clinicas de Estética e Massagem', 'Associação', 'Cias Aéreas', 'Jornais e Revistas - Conteúdo Físico', 'Serviços Corporativos - Outros', 'Hotéis / Resorts / Pousadas / Motéis', 'Festas e Eventos', 'Gráfica, Impressão e Xerox', 'Entretenimento e Turismo', 'Consultorias', 'Logística e Mobilidade - Outros', 'Marketing', 'Serviços Imobiliários', 'Segurança', 'Táxi/Carona', 'Paisagismo e Jardinagem', 'Serviços Financeiros', 'Casa e Decoração - Outros'],
        'Saude': ['Óticas e Óculos', 'Drogarias e Farmácias', 'Outros Serviços de Saúde', 'Odontologia', 'Veterinários', 'Médicina', 'Outros Produtos de Saúde e Beleza', 'Hospitais e Laboratórios'],
        'Comercio': ['Móveis', 'Outros Comércios - Outros', 'Eletrodomésticos', 'Armarinhos e Tecido', 'Tabacaria', 'Cama, Mesa e Banho', 'Cosméticos e Perfumaria', 'Loja de Presentes','Lojas de Departamento', 'Jogos e Brinquedos Físicos', 'Joalherias, Relojoarias e Pratarias', 'Floricultura', 'Petshops', 'Artigos Religiosos e Antiguidades', 'Artigos de Decoração', 'Instrumentos Musicais, CDs, DVDs e Outros'],
        'Educacao': ['Extracurriculares, Autoescola e Outros', 'Ensino Básico', 'Livrarias e Papelarias', 'Ensino Superior e Técnico'],
        'Informatica': ['Eletrônicos', 'Softwares e Eletrônica Integrada']
    }

    # Work on a copy: dropping 'subsegmento' from the caller's frame made a second
    # run of the calling cell fail with a KeyError.
    df_segmented = df.copy()

    # checking if all subsegments have been considered. The code below should only generate an N / A text output.
    known_subsegments = {item for sublist in sectors.values() for item in sublist}
    for segment in df_segmented['subsegmento'].unique():
        if segment not in known_subsegments:
            print(segment)

    # function `return_key_if_contains_value` in process_data.py archive
    df_segmented['segmento'] = df_segmented['subsegmento'].map(lambda segment: return_key_if_contains_value(sectors, segment))
    df_segmented['segmento'] = df_segmented['segmento'].fillna('N/A')
    df_segmented['segmento'] = df_segmented['segmento'].astype('category')

    return df_segmented.drop(columns=['subsegmento'])

df_train_transformed = subsegments_to_segment(df_train_new)
df_test_transformed = subsegments_to_segment(df_test_new)

### 2.3 Binning pagamento_diario_total,  amortizacao_diario_total, transacao_diaria_total, divida_total_variacao, angulo_esperado_decaimento_divida, score_do_fit

In [None]:
df_train_transformed

In [None]:
binned_columns = ['pagamento_diario_total', 'amortizacao_diario_total',
                     'transacao_diaria_total', 'divida_total_variacao', 'angulo_esperado_decaimento_divida', 'score_do_fit']

def binning_features(df):
    """Split six engineered features at their median into a low and a high bin.

    ``pd.qcut`` assigns its labels to the bins in ascending order, so the first
    label must be the ``_baixo`` (low) one and the second the ``_alto`` (high)
    one; the previous label order named the lower half ``_alto``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six columns listed in ``label_prefixes``. Not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each of the six columns is categorical with the
        labels ``<prefix>_baixo`` and ``<prefix>_alto``. The split point is computed
        from the frame that is passed in.
    """
    # column -> prefix of its two bin labels
    label_prefixes = {
        'pagamento_diario_total': 'pagamento_diario',
        'amortizacao_diario_total': 'amortizacao_diario',
        'transacao_diaria_total': 'transacao_diaria',
        'divida_total_variacao': 'divida_total',
        'angulo_esperado_decaimento_divida': 'angulo_esperado',
        'score_do_fit': 'score_do_fit',
    }

    df_constant_new = df.copy()
    for column, prefix in label_prefixes.items():
        df_constant_new[column] = pd.qcut(
            df_constant_new[column], 2, labels=[f'{prefix}_baixo', f'{prefix}_alto'])

    return df_constant_new

# NOTE(review): binning_features calls pd.qcut on the frame it receives, so the train
# and the test frame are each split at their own quantile boundary.
df_train_binned = binning_features(df_train_transformed)
df_test_binned = binning_features(df_test_transformed)

# One row of axes per binned column: raw distribution on the left, bin counts on the right.
fig, ax = plt.subplots(nrows=len(binned_columns), ncols=2, figsize=(16, 16))

for i in range(len(binned_columns)): 
    sns.histplot(ax=ax[i,0], data=df_train_transformed,
                 x=binned_columns[i], hue='y')
    sns.countplot(ax=ax[i,1], data=df_train_binned, x=binned_columns[i], hue='y')
    # Only the left-hand legend is removed.
    ax[i, 0].get_legend().remove()
    
fig.tight_layout(pad=2)

In [None]:
df_train_binned

In [None]:
# `df_constant_transformed` only exists as a local name inside power_transformers, so
# referring to it here raised a NameError on a fresh run; preview the frame produced
# by the subsegment mapping step instead.
df_train_transformed.head()

### 2.2 Power Transform - divida_total_menos_principal_area

In [None]:
from sklearn.preprocessing import PowerTransformer

def power_transformers(df):
    """Swap three skewed columns for their power-transformed versions.

    For ``divida_total_menos_principal_area``, ``valor_emprestado`` and
    ``pgto_diario_esperado`` a new ``<column>_yeojohnson`` column is produced by a
    freshly created ``PowerTransformer()`` (default settings), and the three source
    columns are removed. The input frame is left untouched.

    NOTE(review): every call fits new transformers on the frame it is given, so a
    train and a test frame are transformed with separately estimated parameters.
    """
    skewed_columns = ('divida_total_menos_principal_area', 'valor_emprestado', 'pgto_diario_esperado')

    df_constant_transformed = df.copy()
    for column in skewed_columns:
        # sklearn expects a 2-D (n_samples, 1) array for a single feature.
        column_values = df_constant_transformed[column].values.reshape(-1, 1)
        df_constant_transformed[f'{column}_yeojohnson'] = PowerTransformer().fit_transform(column_values)

    return df_constant_transformed.drop(columns=list(skewed_columns))

df_train_powered = power_transformers(df_train_binned)
# The test features have to come from the test frame; passing df_train_binned here made
# every later "test" step (encoding, scaling, prediction, submission) run on train rows.
df_test_powered = power_transformers(df_test_binned)

In [None]:
fig,ax = plt.subplots(nrows=1, ncols=2, figsize=(16,4))
sns.histplot(ax=ax[0], data=df_train_binned['divida_total_menos_principal_area'])
sns.histplot(ax=ax[1], data=df_train_powered['divida_total_menos_principal_area_yeojohnson'])

### 2.3 Power Transform - valor_emprestado

In [None]:
fig,ax = plt.subplots(nrows=1, ncols=2, figsize=(16,4))
sns.histplot(ax=ax[0], data=df_train_binned['valor_emprestado'])
sns.histplot(ax=ax[1], data=df_train_powered['valor_emprestado_yeojohnson'])

### 2.4 Power Transform - pgto_diario_esperado

In [None]:
fig,ax = plt.subplots(nrows=1, ncols=2, figsize=(16,4))
sns.histplot(ax=ax[0], data=df_train_binned['pgto_diario_esperado'])
sns.histplot(ax=ax[1], data=df_train_powered['pgto_diario_esperado_yeojohnson'])

In [None]:
# Checking relation between attributes
# Only columns whose dtype is int64 or float64 take part in the matrix.
numerical_cols = [cname for cname in df_train_powered.columns if df_train_powered[cname].dtype in ['int64', 'float64']]

# Despite the `cov_` prefix this holds the correlation matrix returned by .corr().
cov_df_loan = df_train_powered[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(ax=ax, data=cov_df_loan, cmap='coolwarm_r')

#### pgto_diario_esperado and valor_emprestado seem to be highly correlated...

## 3. Encoding Categorical Attributes

In [None]:
categorical_cols = [cname for cname in df_train_powered.columns if df_train_powered[cname].dtype.name in ['category']]
categorical_cols

In [None]:
# One-hot encode every categorical column; the indicator columns are named after the
# category labels and appended at the end of each frame.
for categorical_col in categorical_cols:
    df_train_powered = pd.concat([df_train_powered, pd.get_dummies(df_train_powered[categorical_col])], axis=1)
    df_test_powered = pd.concat([df_test_powered, pd.get_dummies(df_test_powered[categorical_col])], axis=1)

    df_train_powered.drop(columns=categorical_col, inplace=True)
    df_test_powered.drop(columns=categorical_col, inplace=True)

# Each frame is encoded on its own, so a category seen in only one of them yields a
# column the other frame lacks. The test frame is later handed to the model as a bare
# array, where a missing or shifted column would go unnoticed. Give it exactly the
# train feature columns (everything except the target), in the same order; indicator
# columns the test frame never produced are filled with 0.
feature_columns = [column for column in df_train_powered.columns if column != 'y']
df_test_powered = df_test_powered.reindex(columns=feature_columns, fill_value=0)

In [None]:
df_train_powered.columns

## 4. Scaling

In [None]:
df_train_powered
df_test_powered

df_train_scalled = df_train_powered.copy()
df_test_scalled = df_test_powered.copy()

# StandardScale angulo_fittado_decaimento_divida, divida_total_menos_principal_area_yeojohnson
scalling_attributes = ['angulo_fittado_decaimento_divida', 'divida_total_menos_principal_area_yeojohnson']

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaling parameters are estimated on the train frame only and then re-used for the
# test frame, so both frames are expressed on the same scale. Fitting a second scaler
# on the test frame would standardise it with different mean/variance values.
for attribute in ['angulo_fittado_decaimento_divida', 'divida_total_menos_principal_area_yeojohnson']:
    scaler = StandardScaler()
    df_train_scalled[attribute] = scaler.fit_transform(
        df_train_scalled[attribute].values.reshape(-1, 1)).ravel()
    df_test_scalled[attribute] = scaler.transform(
        df_test_scalled[attribute].values.reshape(-1, 1)).ravel()

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(18,6))

# Left: the attribute before scaling (df_train_powered); right: after scaling.
# Both panels used to draw the scaled frame, so the comparison showed nothing.
for i, attribute in enumerate(scalling_attributes):
    sns.histplot(ax=ax[i,0], data=df_train_powered[attribute])
    sns.histplot(ax=ax[i,1], data=df_train_scalled[attribute])
    ax[i,0].set_title(f"{attribute} (before scaling)")
    ax[i,1].set_title(f"{attribute} (after scaling)")

fig.tight_layout(pad=1)

## 5. Imputing missing values

In [None]:
df_train_scalled.y = df_train_scalled.y.astype(int)

## 6. Dealing with Unbalanced Data

In [None]:
df_sampled = df_train_scalled.copy()

In [None]:
# Pass the target explicitly as `x`: the first positional argument of countplot is
# `data` in current seaborn releases, not the variable to count.
sns.countplot(x=df_sampled['y'])

plt.title("Unbalanced Data Set");

# Vectorised counts instead of iterating the Series with the builtin sum().
n_defaults = (df_sampled['y'] == 1).sum()
n_not_defaults = (df_sampled['y'] == 0).sum()

print(f"{n_defaults} Defaults, {n_not_defaults} not Default.")

## 7. Sampling the original dataset

#### It is visible that the data set is unbalanced. Some sampling techniques (undersampling and oversampling) can be applied to avoid overfitting in classification models due to an unbalanced dataset. But before applying these techniques, the original dataset must be preserved for testing. So, the models will be fitted with a dataset that was exposed to some sampling technique and, afterwards, they will be tested with the original dataset.

In [None]:
# train_test_split is imported here but not used in this cell.
from sklearn.model_selection import train_test_split, StratifiedKFold

X = df_sampled.drop('y', axis=1)
y = df_sampled['y']

# 3 stratified folds, taken in row order (shuffle=False).
stratified_fold = StratifiedKFold(n_splits=3, random_state=None, shuffle=False)

# Every iteration overwrites the same four names, so after the loop they hold the
# train/test partition of the LAST fold only.
for train_index, test_index in stratified_fold.split(X, y):
    original_X_train, original_X_test = X.iloc[train_index], X.iloc[test_index]
    original_y_train, original_y_test = y.iloc[train_index], y.iloc[test_index]
    
# Convert to array for the ml model
original_X_train = original_X_train.values
original_X_test = original_X_test.values
original_y_train = original_y_train.values
original_y_test = original_y_test.values

In [None]:
df_sampled.info()

### 7.1 UnderSampling

In [None]:
# shuffling the dataset
# DataFrame.sample returns a NEW frame; the result has to be kept, otherwise nothing is
# shuffled and the non-default rows below are simply the first ones in row order.
df_shuffled = df_sampled.sample(frac=1, random_state=1)

default_df = df_shuffled.loc[df_shuffled['y'] == 1]

#number of default loans
default_amount = len(default_df)

# select same number of non defaults (a random subset, since the frame is shuffled)
non_default_df = df_shuffled.loc[df_shuffled['y'] == 0].iloc[:default_amount]

balanced_df = pd.concat([default_df, non_default_df])

# Shuffle dataframe rows
balanced_df = balanced_df.sample(frac=1, random_state=1)

balanced_df.head()

### 8. ML Models for the Undersampled Dataset

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# Features / target of the undersampled frame. `X` and `y` were previously bound to the
# full data set in the stratified-split cell and are rebound here.
X = balanced_df.drop('y', axis=1)
y = balanced_df['y']

# 25% of the balanced rows are held out; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

### Logistic Regression

In [None]:
# Logistic Regression
# Grid over the inverse regularisation strength C; max_iter is fixed at 500.
log_reg_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [500]}

log_grid_search = GridSearchCV(
    LogisticRegression(), log_reg_params,
    refit=True, n_jobs=-1)

log_grid_search.fit(X_train, y_train)
log_reg = log_grid_search.best_estimator_

y_pred = log_grid_search.predict(X_test)
# ROC AUC measures how well continuous scores rank the positives; feeding it the hard
# 0/1 predictions collapses the curve to a single threshold. Use P(y = 1) instead.
y_score = log_grid_search.predict_proba(X_test)[:, 1]
confusion_matrix_plot(y_test, y_pred)

print(f"Recall Score: {recall_score(y_test, y_pred)}")
print(f"Auc Score: {roc_auc_score(y_test, y_score)}")


plt.tight_layout()

In [None]:
log_reg.predict_proba(X_test)

## Random Forest

In [None]:
%%time
# Hyper-parameter grid explored for the random forest.
rf_params = {
    "n_estimators": [50, 70, 90, 110], 
    "max_depth": [4, 6, 8, 10, 12], 
    "max_features": [0.1, 0.2, 0.3],
}

rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state = 1), 
    rf_params, n_jobs=-1
)

rf_grid_search.fit(X_train, y_train)
rf_best = rf_grid_search.best_estimator_

y_pred = rf_best.predict(X_test)
# ROC AUC needs continuous scores, not hard labels: use the predicted P(y = 1).
y_score = rf_best.predict_proba(X_test)[:, 1]
confusion_matrix_plot(y_test, y_pred)

print(f"Best Parameters: {rf_best}")
print()
print(f"Recall Score: {recall_score(y_test, y_pred)}")
print(f"Auc Score: {roc_auc_score(y_test, y_score)}")

In [None]:
# NOTE(review): this scores the `original_X_train` fold; `original_X_test` is created
# earlier but never used. The undersampled training rows were drawn from the same frame,
# so part of this data may have been seen during fitting — confirm the intended set.
y_pred = rf_best.predict(original_X_train)
# ROC AUC needs continuous scores, not hard labels: use the predicted P(y = 1).
y_score = rf_best.predict_proba(original_X_train)[:, 1]
confusion_matrix_plot(original_y_train, y_pred)

print(f"Recall Score: {recall_score(original_y_train, y_pred)}")
print(f"Auc Score: {roc_auc_score(original_y_train, y_score)}")

### USING ORIGINAL TEST DATA WITH RANDOM FOREST

In [None]:
# SimpleImputer is not imported by any import statement visible in this notebook
# (it could only have arrived through one of the `utils` star imports).
from sklearn.impute import SimpleImputer

indexes = df_test_scalled.index

# Present the test features in exactly the column layout the model was fitted on
# (the columns of X_train); a column the test frame lacks becomes NaN and is imputed.
feature_columns = list(X_train.columns)

# The fill values are learned from the training features, so the test data does not
# influence its own preprocessing.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train)
x_test = imp.transform(df_test_scalled.reindex(columns=feature_columns))

In [None]:
y_pred = rf_best.predict(x_test)

In [None]:
y_prod = rf_best.predict_proba(x_test)

In [None]:
submission = pd.DataFrame(columns=['id', 'y', 'y_prod'])

In [None]:
submission['id'] = indexes
submission['y'] = y_pred
# predict_proba returns one column per class, which cannot be stored in a single
# DataFrame column. Keep the column holding the probability of class 1.
positive_class_column = list(rf_best.classes_).index(1)
submission['y_prod'] = y_prod[:, positive_class_column]

In [None]:
submission.to_parquet("../submission.parquet")