In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df1 = pd.read_csv('data/isvalytas.csv')
df = df1.sample(1000000, random_state=1)

In [3]:
exit_status_map = {'completed': 1, 'stopped': 0}
df['recepto_statusas'] = df['recepto_statusas'].map(exit_status_map)

In [4]:
y = df.pop('recepto_statusas')
X = df

In [5]:
seed = 50  # so that the result is reproducible

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.333, random_state = seed)

In [6]:
categorical_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and 
                        X_train[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

In [7]:
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('lab_enc', TargetEncoder())
    ])


# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [15]:
model = RandomForestClassifier(max_depth=70, max_features=20, max_leaf_nodes=9, min_samples_split=3,
                               n_estimators=50, random_state=0)

In [16]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = pipeline.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 0.09763063063063063


In [17]:
val_accuracy = accuracy_score(y_test, preds)
val_roc_auc = roc_auc_score(y_test, preds)
val_confusion_matrix = confusion_matrix(y_test, preds)
print(f'Accuracy Score: {val_accuracy}')
print(f'ROC AUC Score: {val_roc_auc}')
print(f'Confusion Matrix: \n{val_confusion_matrix}')

Accuracy Score: 0.9023693693693694
ROC AUC Score: 0.5785232840849619
Confusion Matrix: 
[[  6019  31453]
 [  1058 294470]]


In [13]:
param_grid = {'model__n_estimators': [30,45,50],
          "model__max_depth": [45,50,70,90],
          "model__max_features": [10,20,30],
          "model__min_samples_split": [1,2,3,5,10],
          "model__min_samples_leaf": [1,3,5,10]}

In [14]:
random_search = RandomizedSearchCV(pipeline, param_grid)
random_search.fit(X_train, y_train)
print(random_search.best_estimator_)

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/justas/PycharmProjects/projektas/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/justas/PycharmProjects/projektas/venv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/justas/PycharmProjects/projektas/venv/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/Users/justas/PycharmProjects/projektas/

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['Unnamed: 0.1',
                                                   'recepto_ketv',
                                                   'dozuociu_sk',
                                                   'dozuociu_sk_vnt_id',
                                                   'vart_trukme_d',
                                                   'recepto_galiojimas_d',
                                                   'kompens_poz',
                                                   'Unnamed: 0']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                               