In [None]:
!pip install pycaret --ignore-installed llvmlite

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pycaret
from pycaret.classification import create_model
from pycaret.classification import blend_models
from pycaret.classification import finalize_model
from pycaret.classification import predict_model 

import os
#llvmlite

In [None]:
# Input files of the Kaggle "Tabular Playground Series - Nov 2021" competition.
file_sub = '/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv'
file_train = '/kaggle/input/tabular-playground-series-nov-2021/train.csv'
file_test = '/kaggle/input/tabular-playground-series-nov-2021/test.csv'
# Backward-compatible alias: the original (misspelled) name is still what the
# data-loading cell reads, so it must keep resolving to the training file.
fale_train = file_train
target = 'target'      # name of the label column
column_id = 'id'       # name of the row-identifier column
gera_graficos = True   # "generate plots" flag; no visible cell reads it

In [None]:
def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype that holds [lo, hi], or None if none does."""
    if pd.isna(lo) or pd.isna(hi):
        return None
    # Work with plain Python numbers so comparisons against 64-bit limits
    # cannot overflow a fixed-width NumPy scalar.
    lo = lo.item() if hasattr(lo, "item") else lo
    hi = hi.item() if hasattr(hi, "item") else hi
    if lo >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        bounds = np.iinfo(candidate)
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    For every numeric (non-complex) column:

    * missing values are replaced by ``column minimum - 1`` (a sentinel below
      every real value), because NumPy integer dtypes cannot hold NaN;
    * if every value is a finite whole number, the column is cast to the
      narrowest signed/unsigned integer dtype that holds its range;
    * otherwise the column is cast to ``float32``.

    Non-numeric columns are left untouched. Memory usage before and after is
    printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``props`` object, with downcast columns.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")

    for col in props.columns:
        values = props[col]
        if (not pd.api.types.is_numeric_dtype(values)
                or pd.api.types.is_complex_dtype(values)):
            continue  # strings, datetimes, categories, ... are not downcast

        # Fill missing values with a sentinel below the observed minimum.
        # The result is assigned back explicitly rather than using a chained
        # ``fillna(inplace=True)``, which may act on a temporary copy.
        if values.isna().any():
            col_min = values.min()
            if pd.notna(col_min):  # an all-missing column has nothing to fill with
                values = values.fillna(col_min - 1)

        # A float column counts as integer-valued only if EVERY element is a
        # finite whole number. Checking element-wise matters: summing the
        # fractional parts lets e.g. +0.5 and -0.5 cancel each other out.
        if pd.api.types.is_float_dtype(values):
            is_int = (bool(np.isfinite(values).all())
                      and bool((values == np.trunc(values)).all()))
        else:
            is_int = True  # integer and boolean columns

        if is_int:
            # Range is taken AFTER the fill so the sentinel (min - 1) is
            # included; otherwise a column whose minimum was 0 would be cast to
            # an unsigned type and the -1 sentinel would wrap around.
            new_dtype = _smallest_int_dtype(values.min(), values.max())
        else:
            new_dtype = np.float32

        if new_dtype is not None:
            values = values.astype(new_dtype)
        props[col] = values

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props
def sample_submission(model, X_test):
    """Write ``submission.csv`` from the positive-class probabilities of ``model``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``.
    X_test
        Feature matrix passed to ``model.predict_proba``; its rows are assumed
        to be in the same order as the sample-submission file (TODO confirm
        at the call site).

    Raises
    ------
    ValueError
        If ``predict_proba`` does not return exactly two columns.

    Reads the module-level ``file_sub``, ``target`` and ``column_id`` settings.
    """
    resultado = np.asarray(model.predict_proba(X_test))
    if resultado.ndim != 2 or resultado.shape[1] != 2:
        raise ValueError(
            "predict_proba must return two columns (binary classification)")
    solution = pd.read_csv(file_sub)
    # Column 1 is the second probability column, the one the label column receives.
    solution[target] = resultado[:, 1]
    # Use the configured id column name instead of a hard-coded 'id'.
    solution[[column_id, target]].to_csv('submission.csv', index=False)

In [None]:
# Read the training CSV and immediately downcast its numeric columns.
data = reduce_mem_usage(pd.read_csv(fale_train))

In [None]:
# Remove the row-identifier column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: a second execution would
# otherwise raise KeyError because the column is already gone.
data = data.drop(columns=column_id, errors='ignore')

In [None]:
# Feature columns: every column except the label. Selecting by name (instead
# of popping the last entry) does not depend on the label being the final column.
columns = [name for name in data.columns if name != target]

In [None]:
# Show the list of feature column names that will be passed to PyCaret.
print(columns)

In [None]:
from pycaret.classification import setup

# Initialise the PyCaret classification experiment. One keyword per line so
# each preprocessing switch can be read and toggled on its own.
experimento = setup(
    data=data,
    target=target,
    session_id=1,
    normalize=True,
    feature_selection=True,
    polynomial_features=True,
    ignore_low_variance=True,
    numeric_features=columns,
    train_size=0.4,
    silent=True,
    use_gpu=True,
    fold=3,
    fold_shuffle=True,
    create_clusters=False,
    remove_outliers=True,
    outliers_threshold=0.01,
    cluster_iter=4,
)

In [None]:
from pycaret.classification import compare_models

# Compare the candidate estimators ranked by AUC on 2 folds and keep the top 5;
# the four listed model IDs are left out of the comparison.
modelos = compare_models(
    sort='AUC',
    fold=2,
    n_select=5,
    exclude=['gbc', 'ada', 'svm', 'ridge'],
)

In [None]:
# Accumulates the tuned estimators; later cells append to it and blend its contents.
# NOTE(review): re-running an appending cell adds a duplicate entry — re-run this cell first.
best_models = []

In [None]:
# Create the baseline 'lr' model with cross-validation disabled.
lr = create_model('lr', cross_validation=False)

In [None]:
# Create the baseline 'catboost' model with cross-validation disabled.
catboost = create_model('catboost', cross_validation=False)

In [None]:
# Create the baseline 'lightgbm' model with cross-validation disabled.
lightgbm = create_model('lightgbm', cross_validation=False)

In [None]:
# Create the baseline 'xgboost' model with cross-validation disabled.
xgboost = create_model('xgboost', cross_validation=False)

In [None]:
# Tune the 'lr' model for AUC over 50 search iterations (default search grid),
# rebind `lr` to the tuned estimator and queue it for blending.
lr = pycaret.classification.tune_model(lr, optimize='AUC',n_iter=50)
best_models.append(lr)

In [None]:
# Hyperparameter search space handed to the tuner for the CatBoost model.
args_catboost = {
    'learning_rate': np.arange(0.01, 0.1, 0.005).tolist(),
    'n_estimators': list(range(100, 1300)),
    'max_depth': list(range(3, 10)),
    'bootstrap_type': ['Poisson'],
    'subsample': np.arange(0.5, 1, 0.05).tolist(),
    'min_child_samples': list(range(10, 200)),
}

# Tune for AUC over 50 iterations drawn from the grid above, then queue the
# tuned estimator for blending.
catboost = pycaret.classification.tune_model(
    catboost,
    optimize='AUC',
    n_iter=50,
    custom_grid=args_catboost,
)
best_models.append(catboost)

In [None]:
# Hyperparameter search space handed to the tuner for the LightGBM model.
args_lightgbm = {
    'learning_rate': np.arange(0.01, 0.1, 0.005).tolist(),
    'n_estimators': list(range(100, 1300)),
    'max_depth': list(range(3, 10)),
    'subsample': np.arange(0.5, 1, 0.05).tolist(),
    'num_leaves': list(range(31, 100)),
    'min_child_samples': list(range(10, 200)),
    'colsample_bytree': np.arange(0.5, 1, 0.05).tolist(),
}

# Tune for AUC over 50 iterations drawn from the grid above, then queue the
# tuned estimator for blending.
lightgbm = pycaret.classification.tune_model(
    lightgbm,
    optimize='AUC',
    n_iter=50,
    custom_grid=args_lightgbm,
)
best_models.append(lightgbm)

In [None]:
# Hyperparameter search space handed to the tuner for the XGBoost model.
args_xgboost = {
    'learning_rate': np.arange(0.01, 0.1, 0.005).tolist(),
    'n_estimators': list(range(200, 1300)),
    # Fixed key: it was spelled 'max_depht', which is not an XGBoost parameter,
    # so the tree depth was never actually part of the search.
    'max_depth': list(range(3, 8)),
    'subsample': np.arange(0.5, 1.01, 0.05).tolist(),
    'colsample_bynode': np.arange(0.5, 1.01, 0.05).tolist(),
    'num_parallel_tree': list(range(1, 5)),
}

# Tune for AUC over 50 iterations drawn from the grid above, then queue the
# tuned estimator for blending.
xgboost = pycaret.classification.tune_model(
    xgboost,
    optimize='AUC',
    n_iter=50,
    custom_grid=args_xgboost,
)
best_models.append(xgboost)

In [None]:
# Inspect the hyperparameters of the estimator currently bound to `catboost`.
print(catboost.get_params())

In [None]:
# Inspect the hyperparameters of the estimator currently bound to `xgboost`.
print(xgboost.get_params())

In [None]:
# Inspect the hyperparameters of the estimator currently bound to `lightgbm`.
print(lightgbm.get_params())

In [None]:
# Combine every estimator collected in `best_models` into one blended model, optimising AUC.
blender = blend_models(best_models, optimize='AUC')

In [None]:
# Finalize the blended model before scoring the test set.
# presumably this refits on the full dataset, hold-out included — confirm against the PyCaret docs
final_blender = finalize_model(blender)

In [None]:
# Load the test set, downcast its numeric columns, and drop the id column —
# the same preparation applied to the training frame.
data_test = reduce_mem_usage(pd.read_csv(file_test)).drop(columns=column_id)

In [None]:
# Score the test set with the finalized blender, passing 0.5 as the probability threshold.
# Later cells read the `Label` and `Score` columns of the returned frame.
pred_data = predict_model(final_blender, data = data_test, probability_threshold=0.5)

In [None]:
# Load the sample-submission file; it is used as the template for the output CSV.
data_sub = pd.read_csv(file_sub)

In [None]:
def processa_proba(line):
    """Return ``line.Score`` when ``line.Label == 1``, otherwise ``1 - line.Score``.

    ``line`` is any object with ``Label`` and ``Score`` attributes (e.g. one row
    of the prediction frame).
    NOTE(review): this presumes ``Score`` is the probability of the predicted
    label, so the result is the probability of class 1 — confirm for the
    PyCaret version in use.
    """
    return line.Score if line.Label == 1 else 1 - line.Score

In [None]:
# Keep Score where the predicted label is 1, otherwise use 1 - Score.
# Vectorised equivalent of applying `processa_proba` to every row: the same
# values, without a Python-level call per test row.
pred_data['n_score'] = pred_data['Score'].where(
    pred_data['Label'] == 1,
    1 - pred_data['Score'],
)

In [None]:
# Copy the computed scores into the submission template and write it out.
# NOTE(review): the assignment aligns on the index, so it relies on `pred_data`
# and `data_sub` sharing the same row index — verify that no rows were dropped.
data_sub[target] = pred_data.n_score
data_sub.to_csv('submission.csv', index=False)