In [73]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

from sklearn.utils import resample
import warnings
#import sklearn.exceptions
#warnings.filterwarnings("error", category=sklearn.exceptions.UndefinedMetricWarning)

In [79]:
colnames = ['letter', 'x-box','y-box', 'width', 'height', 'pixels', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx']
df = pd.read_csv('letter-recognition.csv', names= colnames)
df['letter'] = df['letter'].apply(lambda x: 1 if x == 'O' else -1)

X = df.drop(['letter'], axis=1)
y = df['letter']
df['letter'].value_counts()

-1    19247
 1      753
Name: letter, dtype: int64

In [80]:
percent_pos = (df['letter']==1).sum()/len(df['letter'])*100
percent_pos

3.765

In [81]:
# Transform X values into One Hot Encoding for categorical variables and Standardizing for numerical variables
cont = list(X.select_dtypes(['int64']).columns)

cont_transform = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('cont', cont_transform, cont)])
X = pd.DataFrame(preprocessor.fit_transform(X))
X.shape

(20000, 16)

In [86]:
%%debug
num_trials = 5
train_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    pipe = Pipeline(steps=[('classifier', LogisticRegression())])

    parameters = [{'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1, class_weight='balanced')],
                   'classifier__solver': ['saga'],
                   'classifier__penalty': ['l1'],
                   'classifier__C': np.logspace(-8,4,13)},
                  {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1, class_weight='balanced')],
                   'classifier__solver': ['sag', 'saga'],
                   'classifier__penalty': ['none']},
                  {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1, class_weight='balanced')],
                  'classifier__solver': ['sag', 'saga'],
                  'classifier__penalty': ['l2'],
                  'classifier__C': np.logspace(-8,4,13)}]

    clf = GridSearchCV(pipe, parameters, cv=KFold(n_splits=5), 
                       scoring='precision', refit=True, verbose=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    
    # Oversampling minority classes
    # https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18
    data = pd.concat([X_train, y_train], axis=1)
    #print(data)

    neg = data[data['letter'] == -1]
    pos = data[data['letter'] == 1]
    
    pos_balanced = resample(pos, replace=True, n_samples=len(neg), random_state=20)
    balanced = pd.concat([neg, pos_balanced])
    #print(balanced)
    
    X_train = balanced.drop(['letter'], axis=1)
    y_train = balanced['letter']
    print(y_train.value_counts())
    
    hyperparams = clf.fit(X_train, y_train)

    '''
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)
    
    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    accuracy_model = LogisticRegression(penalty = best_accuracy['classifier__penalty'],
                                        C = best_accuracy['classifier__C'],
                                        solver = best_accuracy['classifier__solver'],
                                        max_iter = 5000,
                                        n_jobs = -1)
    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    print(classification_report(y_test, y_acc_test))
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = LogisticRegression(penalty = best_precision['classifier__penalty'],
                                         C = best_precision['classifier__C'],
                                         solver = best_precision['classifier__solver'],
                                         max_iter = 5000,
                                         n_jobs = -1)
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    print(classification_report(y_test, y_prec_test))
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = LogisticRegression(penalty = best_roc_auc['classifier__penalty'],
                                   C = best_roc_auc['classifier__C'],
                                   solver = best_roc_auc['classifier__solver'],
                                   max_iter = 5000,
                                   n_jobs = -1)
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    print(classification_report(y_test, y_roc_test))
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = LogisticRegression(penalty = best_f1['classifier__penalty'],
                                  C = best_f1['classifier__C'],
                                  solver = best_f1['classifier__solver'],
                                  max_iter = 5000,
                                  n_jobs = -1)
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    print(classification_report(y_test, y_f1_test))
    
    train_metrics = train_metrics.append({'Accuracy': acc_train_score, 'Precision': prec_train_score, 
                                          'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics = test_metrics.append({'Accuracy': acc_test_score, 'Precision': prec_test_score, 
                                          'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)
    '''

NOTE: Enter 'c' at the ipdb>  prompt to continue execution.
> [0;32m<string>[0m(2)[0;36m<module>[0;34m()[0m

ipdb> c
-1    4800
 1    4800
Name: letter, dtype: int64
Fitting 5 folds for each of 41 candidates, totalling 205 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[0;31m---------------------------------------------------------------------------[0m
[0;32m~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py[0m in [0;36minner_f[0;34m(*args, **kwargs)[0m
[1;32m     71[0m         [0mkwargs[0m[0;34m.[0m[0mupdate[0m[0;34m([0m[0;34m{[0m[0mk[0m[0;34m:[0m [0marg[0m [0;32mfor[0m [0mk[0m[0;34m,[0m [0marg[0m [0;32min[0m [0mzip[0m[0;34m([0m[0msig[0m[0;34m.[0m[0mparameters[0m[0;34m,[0m [0margs[0m[0;34m)[0m[0;34m}[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 72[0;31m         [0;32mreturn[0m [0mf[0m[0;34m([0m[0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     73[0m     [0;32mreturn[0m [0minner_f[0m[0;34m[0m[0;34m[0m[0m
[1;32m     74[0m [0;34m[0m[0m

[0;32m~/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py[0m in [0;36mfit[0;34m(self, X, y, groups, **fit_params)[0m
[1;32m    734[0m                 [

In [10]:
train_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1


In [7]:
test_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.7864,0.7864,0.780867,0.7864
1,0.7796,0.7782,0.7796,0.7796
2,0.78,0.780067,0.78,0.78
3,0.776533,0.779667,0.7766,0.7766
4,0.7812,0.7804,0.777267,0.7812


In [34]:
bla = pd.read_csv("adult_test.csv")
bla = bla.drop(bla.iloc[:, :1], axis = 1)
bla

Unnamed: 0,LG: Accuracy,LG: Precision,LG: AUC,LG: F1,RF: Accuracy,RF: Precision,RF: AUC,RF: F1,DT: Accuracy,DT: Precision,DT: AUC,DT: F1
0,0.848093,0.793049,0.848093,0.845607,0.846005,0.8468,0.846353,0.846303,0.802645,0.803938,0.795535,0.796206
1,0.845607,0.79633,0.8469,0.8468,0.845831,0.846253,0.846925,0.846949,0.804386,0.803267,0.792303,0.790339
2,0.847919,0.791035,0.847919,0.847919,0.84864,0.848491,0.848814,0.848391,0.815772,0.81525,0.793422,0.793521
3,0.84603,0.787355,0.846104,0.844538,0.849411,0.850306,0.849908,0.849933,0.797399,0.799264,0.7928,0.7928
4,0.845408,0.792029,0.846925,0.844737,0.849709,0.84951,0.84864,0.849958,0.802223,0.802969,0.775247,0.775247
