In [18]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Loading and Cleaning Data
colnames = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
            'marital-status', 'occupation', 'relationship', 'race', 'sex', 
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
df = pd.read_csv('adult.csv', names = colnames)
df = df.replace([' ?', ' <=50K.', ' >50K.', ' <=50K', ' >50K'], [np.NaN, 1, 0, 1, 0])
df = df.dropna()
df = df.replace([])

X = df.drop(['income'], axis=1)
y = df['income']
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,1
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,1
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,1
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,1


# PERCENT POSITIVE

In [139]:
percent_pos = ((df['income']==0).sum() / len(df['income'])) * 100
percent_pos

24.78439697492371

In [20]:
# Transform X values into One Hot Encoding for categorical variables and Standardizing for numerical variables
cat = list(X.select_dtypes(['object']).columns)
cont = list(X.select_dtypes(['int64']).columns)

cont_transform = Pipeline(steps=[('scaler', StandardScaler())])

cat_transform = Pipeline(steps=[('categories', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('cont', cont_transform, cont),
                                               ('cat', cat_transform, cat)])
X = pd.DataFrame(preprocessor.fit_transform(X))
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0.034201,-1.062295,1.128753,0.142888,-0.21878,-0.078120,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.866417,-1.007438,1.128753,-0.146733,-0.21878,-2.326738,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.041455,0.245284,-0.438122,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.093385,0.425853,-1.221559,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.798015,1.407393,1.128753,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,-0.419735,0.525154,1.128753,-0.146733,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45218,0.034201,0.243135,1.128753,-0.146733,-0.21878,-0.411249,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45219,-0.041455,1.753613,1.128753,-0.146733,-0.21878,0.754701,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45220,0.412481,-1.001947,1.128753,0.579985,-0.21878,-0.078120,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# LOGISTIC REGRESSION

In [120]:
num_trials = 5
train_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    pipe = Pipeline(steps=[('classifier', LogisticRegression())])

    parameters = [{'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1)],
                   'classifier__solver': ['saga'],
                   'classifier__penalty': ['l1'],
                   'classifier__C': np.logspace(-8,4,13)},
                  {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1)],
                   'classifier__solver': ['sag', 'saga'],
                   'classifier__penalty': ['none']},
                  {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1)],
                   'classifier__solver': ['sag', 'saga'],
                   'classifier__penalty': ['l2'],
                   'classifier__C': np.logspace(-8,4,13)}]

    clf = GridSearchCV(pipe, parameters, cv=KFold(n_splits=5), 
                       scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False, verbose=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    hyperparams = clf.fit(X_train, y_train)
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)
    
    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    accuracy_model = LogisticRegression(penalty = best_accuracy['classifier__penalty'],
                                        C = best_accuracy['classifier__C'],
                                        solver = best_accuracy['classifier__solver'],
                                        max_iter = 5000,
                                        n_jobs = -1)
    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = LogisticRegression(penalty = best_precision['classifier__penalty'],
                                         C = best_precision['classifier__C'],
                                         solver = best_precision['classifier__solver'],
                                         max_iter = 5000,
                                         n_jobs = -1)
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = LogisticRegression(penalty = best_roc_auc['classifier__penalty'],
                                   C = best_roc_auc['classifier__C'],
                                   solver = best_roc_auc['classifier__solver'],
                                   max_iter = 5000,
                                   n_jobs = -1)
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = LogisticRegression(penalty = best_f1['classifier__penalty'],
                                  C = best_f1['classifier__C'],
                                  solver = best_f1['classifier__solver'],
                                  max_iter = 5000,
                                  n_jobs = -1)
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    train_metrics = train_metrics.append({'Accuracy': acc_train_score, 'Precision': prec_train_score, 
                                          'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics = test_metrics.append({'Accuracy': acc_test_score, 'Precision': prec_test_score, 
                                          'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)

Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 205 out of 205 | elapsed: 14.5min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 205 out of 205 | elapsed: 14.1min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 205 out of 205 | elapsed: 14.6min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 205 out of 205 | elapsed: 12.7min finished


Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 205 out of 205 | elapsed: 13.0min finished


In [123]:
train_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.8494,0.8548,0.8544,0.8494
1,0.855,0.855,0.8512,0.8512
2,0.866,0.866,0.8626,0.8626
3,0.857,0.8566,0.857,0.857
4,0.85,0.851,0.8502,0.85


In [124]:
print(test_metrics['Accuracy'].mean())
test_metrics

0.8487345233951569


Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.848242,0.84951,0.849635,0.848242
1,0.849088,0.849088,0.84869,0.848715
2,0.847571,0.84777,0.847347,0.847347
3,0.849088,0.849137,0.849088,0.849088
4,0.849684,0.84961,0.849684,0.849684


# RANDOM FOREST CLASSIFIER

In [149]:
num_trials = 5
train_metrics_rf = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics_rf = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    randomForest = RandomForestClassifier()

    param_grid = {
        'n_estimators': [1024],
        'criterion': ['gini', 'entropy'],
        'max_features': [1,2,4,6,8,12,16,20]}

    clf = GridSearchCV(estimator=randomForest, param_grid=param_grid, cv=KFold(n_splits=5), 
                       scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    hyperparams = clf.fit(X_train, y_train)
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)
    
    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    accuracy_model = RandomForestClassifier(n_estimators = best_accuracy['n_estimators'],
                                            criterion = best_accuracy['criterion'],
                                            max_features = best_accuracy['max_features'])
    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = RandomForestClassifier(n_estimators = best_precision['n_estimators'],
                                             criterion = best_precision['criterion'],
                                             max_features = best_precision['max_features'])
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = RandomForestClassifier(n_estimators = best_roc_auc['n_estimators'],
                                       criterion = best_roc_auc['criterion'],
                                       max_features = best_roc_auc['max_features'])
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = RandomForestClassifier(n_estimators = best_f1['n_estimators'],
                                      criterion = best_f1['criterion'],
                                      max_features = best_f1['max_features'])
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    train_metrics_rf = train_metrics_rf.append({'Accuracy': acc_train_score, 'Precision': prec_train_score,
                                                'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics_rf = test_metrics_rf.append({'Accuracy': acc_test_score, 'Precision': prec_test_score,
                                              'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)

In [150]:
train_metrics_rf

Unnamed: 0,Accuracy,Precision,AUC,F1
0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0


In [151]:
test_metrics_rf

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.846949,0.846005,0.845905,0.84685
1,0.848541,0.848466,0.849063,0.849386
2,0.84685,0.846527,0.846253,0.845607
3,0.848391,0.848491,0.849262,0.84951
4,0.846228,0.847894,0.848168,0.846577


# DECISION TREE CLASSIFIER

In [145]:
num_trials = 5
train_metrics_dt = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics_dt = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    pipe = Pipeline(steps=[('classifier', DecisionTreeClassifier())])

    parameters = [{'classifier': [DecisionTreeClassifier(class_weight='balanced')],
                   'classifier__criterion': ['gini', 'entropy'],
                   'classifier__splitter': ['best'],
                   'classifier__min_samples_leaf': [1,2,4,6,8,10,12,14,16,18,20]}]

    clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=KFold(n_splits=5), 
                       scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False, verbose=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    hyperparams = clf.fit(X_train, y_train)
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)

    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    print(best_accuracy)
    
    accuracy_model = DecisionTreeClassifier(criterion = best_accuracy['classifier__criterion'],
                                            splitter = best_accuracy['classifier__splitter'],
                                            min_samples_leaf = best_accuracy['classifier__min_samples_leaf'],
                                            class_weight = 'balanced')

    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    #accuracy_model.fit(X_test, y_test)
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = DecisionTreeClassifier(criterion = best_precision['classifier__criterion'],
                                             splitter = best_precision['classifier__splitter'],
                                             min_samples_leaf = best_precision['classifier__min_samples_leaf'],
                                             class_weight = 'balanced')
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    #precision_model.fit(X_test, y_test)
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = DecisionTreeClassifier(criterion = best_roc_auc['classifier__criterion'],
                                       splitter = best_roc_auc['classifier__splitter'],
                                       min_samples_leaf = best_roc_auc['classifier__min_samples_leaf'],
                                       class_weight = 'balanced')
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    #roc_model.fit(X_test, y_test)
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = DecisionTreeClassifier(criterion = best_f1['classifier__criterion'],
                                      splitter = best_f1['classifier__splitter'],
                                      min_samples_leaf = best_f1['classifier__min_samples_leaf'],
                                      class_weight = 'balanced')
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    #f1_model.fit(X_test, y_test)
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    train_metrics_dt = train_metrics_dt.append({'Accuracy': acc_train_score, 'Precision': prec_train_score,
                                                'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics_dt = test_metrics_dt.append({'Accuracy': acc_test_score, 'Precision': prec_test_score,
                                              'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)

Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:    3.8s finished


{'classifier': DecisionTreeClassifier(class_weight='balanced'), 'classifier__criterion': 'gini', 'classifier__min_samples_leaf': 1, 'classifier__splitter': 'best'}
Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:    4.7s finished


{'classifier': DecisionTreeClassifier(class_weight='balanced'), 'classifier__criterion': 'entropy', 'classifier__min_samples_leaf': 1, 'classifier__splitter': 'best'}
Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:    4.2s finished


{'classifier': DecisionTreeClassifier(class_weight='balanced'), 'classifier__criterion': 'entropy', 'classifier__min_samples_leaf': 1, 'classifier__splitter': 'best'}
Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:    4.0s finished


{'classifier': DecisionTreeClassifier(class_weight='balanced'), 'classifier__criterion': 'entropy', 'classifier__min_samples_leaf': 1, 'classifier__splitter': 'best'}
Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 110 out of 110 | elapsed:    3.9s finished


{'classifier': DecisionTreeClassifier(class_weight='balanced'), 'classifier__criterion': 'entropy', 'classifier__min_samples_leaf': 1, 'classifier__splitter': 'best'}


In [146]:
print(train_metrics_dt['Accuracy'].mean())
train_metrics_dt

1.0


Unnamed: 0,Accuracy,Precision,AUC,F1
0,1.0,0.819,0.8276,1.0
1,1.0,0.8232,0.8232,1.0
2,1.0,0.8252,0.8152,1.0
3,1.0,0.829,0.8268,1.0
4,1.0,0.8128,0.8186,1.0


In [147]:
print(test_metrics_dt['Accuracy'].mean())
test_metrics_dt

0.804335935557655


Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.807369,0.781886,0.789866,0.806996
1,0.798841,0.791557,0.791557,0.798419
2,0.807096,0.783228,0.773731,0.807145
3,0.803118,0.794814,0.79285,0.80262
4,0.805256,0.775645,0.777957,0.805306


In [143]:
solution

Unnamed: 0,classifier,classifier__criterion,classifier__min_samples_leaf,classifier__splitter,Accuracy,Precision,ROC AUC,F1
0,DecisionTreeClassifier(class_weight='balanced'),gini,2,best,0.7936,0.887637,0.770319,0.858301
1,DecisionTreeClassifier(class_weight='balanced'),gini,4,best,0.7778,0.89939,0.810382,0.842967
2,DecisionTreeClassifier(class_weight='balanced'),gini,6,best,0.7796,0.909297,0.837205,0.842774
3,DecisionTreeClassifier(class_weight='balanced'),gini,8,best,0.779,0.912253,0.84852,0.841704
4,DecisionTreeClassifier(class_weight='balanced'),gini,10,best,0.7782,0.916565,0.855991,0.840325
5,DecisionTreeClassifier(class_weight='balanced'),gini,12,best,0.777,0.917406,0.860456,0.839145
6,DecisionTreeClassifier(class_weight='balanced'),gini,14,best,0.778,0.916138,0.86427,0.840174
7,DecisionTreeClassifier(class_weight='balanced'),gini,16,best,0.785,0.917404,0.870177,0.846042
8,DecisionTreeClassifier(class_weight='balanced'),gini,18,best,0.7868,0.918679,0.876871,0.847365
9,DecisionTreeClassifier(class_weight='balanced'),gini,20,best,0.791,0.919137,0.878659,0.850654
