In [1]:

from arquivo_preprocessado import preprocessing
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings ("ignore")


In [2]:
X_train , X_test , y_train, y_test = preprocessing()

In [3]:
X_train.head()

Unnamed: 0,Estimated_Insects_Count,Number_Doses_Week,Number_Weeks_Used,Number_Weeks_Quit,Crop_Type_0,Crop_Type_1,Soil_Type_0,Soil_Type_1,Pesticide_Use_Category_1,Pesticide_Use_Category_2,Pesticide_Use_Category_3,Season_1,Season_2,Season_3
68355,625,5,29.0,2,False,True,True,False,False,True,False,True,False,False
79353,342,20,27.0,4,True,False,True,False,False,True,False,True,False,False
70521,2016,20,35.0,7,True,False,False,True,False,True,False,True,False,False
34921,1786,60,17.0,29,True,False,False,True,False,True,False,False,True,False
49121,531,30,15.0,13,True,False,True,False,False,True,False,True,False,False


In [4]:
def normalizacao(X_train, X_test):
    scaler = StandardScaler()
    scaler_fit = scaler.fit(X_train)
    X_train_norm = scaler_fit.transform(X_train)
    X_test_norm = scaler_fit.transform(X_test)
    return X_train_norm , X_test_norm, scaler_fit

In [5]:
X_train_norm ,X_test_norm, scaler = normalizacao(X_train, X_test)

In [6]:
X_train_norm

array([[-0.91364482, -1.34094047,  0.03172147, ...,  1.51384787,
        -0.99183125, -0.50048421],
       [-1.24661287, -0.3747277 , -0.12964793, ...,  1.51384787,
        -0.99183125, -0.50048421],
       [ 0.72295785, -0.3747277 ,  0.51582967, ...,  1.51384787,
        -0.99183125, -0.50048421],
       ...,
       [ 0.3252787 , -0.3747277 , -1.01717964, ..., -0.66056836,
        -0.99183125,  1.99806502],
       [-0.72068807,  0.91355599, -0.37170204, ..., -0.66056836,
         1.00823602, -0.50048421],
       [ 0.3252787 ,  0.91355599, -0.04896323, ..., -0.66056836,
         1.00823602, -0.50048421]])

In [12]:
def selecao(X_train , X_test , Y_train , Y_test): 
    X_rank = np.concatenate([X_train, X_test])
    Y_rank = np.concatenate([Y_train, Y_test])
    for model in [RandomForestClassifier,GradientBoostingClassifier,CatBoostClassifier]:
      cls = model()
      kfold = KFold(n_splits=5,shuffle=True, random_state=7 )
      s = cross_val_score(cls, X_rank, Y_rank, scoring="accuracy", cv=kfold)
      print(f"{model.__name__:22} Score: "f"{s.mean():.3f} STD: {s.std():.2f}")

In [13]:
#X_train_dfnorm = pd.DataFrame(X_train_norm,columns=X_train.columns)
print(" Seleção de modelos:")
selecao (X_train_norm, X_test_norm, y_train, y_test)

 Seleção de modelos:
RandomForestClassifier Score: 0.823 STD: 0.00
GradientBoostingClassifier Score: 0.846 STD: 0.00
Learning rate set to 0.097814
0:	learn: 0.9820539	total: 33.3ms	remaining: 33.3s
1:	learn: 0.8939087	total: 66.4ms	remaining: 33.1s
2:	learn: 0.8240310	total: 97.8ms	remaining: 32.5s
3:	learn: 0.7666857	total: 126ms	remaining: 31.5s
4:	learn: 0.7197000	total: 153ms	remaining: 30.4s
5:	learn: 0.6805368	total: 182ms	remaining: 30.2s
6:	learn: 0.6474173	total: 216ms	remaining: 30.6s
7:	learn: 0.6193430	total: 245ms	remaining: 30.3s
8:	learn: 0.5953737	total: 270ms	remaining: 29.7s
9:	learn: 0.5747440	total: 297ms	remaining: 29.4s
10:	learn: 0.5569482	total: 324ms	remaining: 29.1s
11:	learn: 0.5415919	total: 349ms	remaining: 28.7s
12:	learn: 0.5279362	total: 375ms	remaining: 28.5s
13:	learn: 0.5162603	total: 402ms	remaining: 28.3s
14:	learn: 0.5060282	total: 432ms	remaining: 28.4s
15:	learn: 0.4968307	total: 467ms	remaining: 28.7s
16:	learn: 0.4883851	total: 497ms	remaining:

In [37]:
def modelo (x_train , y_train):
   param_grid = {
    'random_state': [7],
    'depth': [10],
    'learning_rate': [0.1],
    'leaf_estimation_iterations': [10],
    'iterations': [500],
   }
   cat = CatBoostClassifier()
   grid_search = GridSearchCV(estimator = cat, param_grid = param_grid, 
                           cv = 5, n_jobs = -1, verbose = 3,return_train_score=True, scoring='accuracy')
   grid_search.fit(x_train, y_train)
   return grid_search

In [25]:
y_train.shape

(63886, 1)

In [24]:
X_train_norm.shape

(63886, 14)

In [38]:
modelo = modelo (X_train_norm,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 

In [31]:
print(" Resultados Grid Search " )
print("\n O melhor estimador:\n", modelo.best_estimator_)
print("\n A melhor pontuação:\n", modelo.best_score_)
print("\n Os melhores parâmetros:\n", modelo.best_params_)

 Resultados Grid Search 

 O melhor estimador:
 <catboost.core.CatBoostClassifier object at 0x0000027F82761150>

 A melhor pontuação:
 0.8460852428022536

 Os melhores parâmetros:
 {'depth': 10, 'iterations': 100, 'learning_rate': 0.1, 'random_state': 7}
