In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
colnames = ['letter', 'x-box','y-box', 'width', 'height', 'pixels', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx']
df = pd.read_csv('letter-recognition.csv', names= colnames)
df['letter'] = df['letter'].apply(lambda x: 1 if x == 'O' else -1)

X = df.drop(['letter'], axis=1)
y = df['letter']
df['letter'].value_counts()

-1    19247
 1      753
Name: letter, dtype: int64

In [3]:
# Transform X values into One Hot Encoding for categorical variables and Standardizing for numerical variables
cont = list(X.select_dtypes(['int64']).columns)

cont_transform = Pipeline(steps=[('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('cont', cont_transform, cont)])
X = pd.DataFrame(preprocessor.fit_transform(X))
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-1.057698,0.291877,-1.053277,-0.164704,-1.144013,0.544130,2.365097,-1.714360,0.344994,-0.917071,1.347774,0.034125,-1.305948,-0.219082,-1.438153,0.122911
1,0.510385,1.502358,-1.053277,0.719730,-0.687476,1.531305,-1.075326,0.137561,-0.495072,1.895968,-1.312807,0.514764,-0.448492,-0.219082,0.120081,1.359441
2,-0.012309,1.199738,0.435910,1.161947,1.138672,1.531305,-0.645273,-0.973591,0.344994,0.690380,-1.312807,-0.446513,-0.019764,-0.865626,-0.269477,0.741176
3,1.555774,1.199738,0.435910,0.277513,-0.230939,-0.936631,0.644886,-0.232823,0.344994,-1.720796,-0.932724,0.995402,1.266419,1.074008,-0.659036,0.122911
4,-1.057698,-1.826464,-1.053277,-1.933571,-1.144013,0.544130,-0.645273,0.507945,0.344994,-0.917071,-0.552641,0.514764,-0.877220,-0.865626,0.509640,1.359441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-1.057698,-1.523844,-1.053277,-1.049137,-0.687476,0.050543,-0.215220,0.878329,0.344994,-0.917071,-0.172558,-1.888428,-0.448492,-0.219082,-0.269477,-0.495354
19996,1.555774,0.897117,1.428701,1.161947,0.225598,-1.430218,0.214833,0.507945,1.605094,1.494105,0.967691,2.437316,-0.448492,0.427463,-0.269477,-0.495354
19997,1.033079,0.594497,0.435910,0.719730,0.682135,-0.443044,1.504991,-0.603207,0.765028,1.092242,0.967691,-1.407789,-0.448492,2.367097,-0.659036,-2.350149
19998,-1.057698,-1.221224,-0.556881,-1.491354,-1.144013,0.544130,-0.215220,-0.973591,0.344994,0.690380,-0.172558,0.034125,-0.877220,0.427463,0.509640,0.122911


In [12]:
# Logistic Regression
pipe = Pipeline(steps=[('classifier', LogisticRegression())])

parameters = [{'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1, class_weight='balanced')],
               'classifier__solver': ['saga'],
               'classifier__penalty': ['l1'],
               'classifier__C': np.logspace(-8,4,13)},
              {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1, class_weight='balanced')],
               'classifier__solver': ['sag', 'saga'],
               'classifier__penalty': ['none']},
              {'classifier': [LogisticRegression(max_iter=5000, n_jobs=-1, class_weight='balanced')],
               'classifier__solver': ['sag', 'saga'],
               'classifier__penalty': ['l2'],
               'classifier__C': np.logspace(-8,4,13)}]

clf = GridSearchCV(pipe, parameters, cv=StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'precision', 'roc_auc', 'f1'], refit=False)

In [13]:
num_trials = 5
train_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])
test_metrics = pd.DataFrame(columns=['Accuracy','Precision','AUC','F1'])

for i in range(num_trials):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=5000, shuffle=True)
    hyperparams = clf.fit(X_train, y_train)
    results = hyperparams.cv_results_['params']
    solution = pd.DataFrame(results)
    
    # ACCURACY
    solution['Accuracy'] = hyperparams.cv_results_['mean_test_accuracy']
    best_accuracy = results[np.argmin(hyperparams.cv_results_['rank_test_accuracy'])]
    accuracy_model = LogisticRegression(penalty = best_accuracy['classifier__penalty'],
                                        C = best_accuracy['classifier__C'],
                                        solver = best_accuracy['classifier__solver'],
                                        max_iter = 5000,
                                        n_jobs = -1, 
                                        class_weight='balanced')
    accuracy_model.fit(X_train, y_train)
    y_acc_train = accuracy_model.predict(X_train)
    acc_train_score = accuracy_score(y_train, y_acc_train)
    
    accuracy_model.fit(X_test, y_test)
    y_acc_test = accuracy_model.predict(X_test)
    acc_test_score = accuracy_score(y_test, y_acc_test)
    
    # PRECISION
    solution['Precision'] = hyperparams.cv_results_['mean_test_precision']
    best_precision = results[np.argmin(hyperparams.cv_results_['rank_test_precision'])]
    precision_model = LogisticRegression(penalty = best_precision['classifier__penalty'],
                                         C = best_precision['classifier__C'],
                                         solver = best_precision['classifier__solver'],
                                         max_iter = 5000,
                                         n_jobs = -1,
                                         class_weight='balanced')
    precision_model.fit(X_train, y_train)
    y_prec_train = precision_model.predict(X_train)
    prec_train_score = accuracy_score(y_train, y_prec_train)
    
    precision_model.fit(X_test, y_test)
    y_prec_test = precision_model.predict(X_test)
    prec_test_score = accuracy_score(y_test, y_prec_test)
    
    # ROC AUC
    solution['ROC AUC'] = hyperparams.cv_results_['mean_test_roc_auc']
    best_roc_auc = results[np.argmin(hyperparams.cv_results_['rank_test_roc_auc'])]
    roc_model = LogisticRegression(penalty = best_roc_auc['classifier__penalty'],
                                   C = best_roc_auc['classifier__C'],
                                   solver = best_roc_auc['classifier__solver'],
                                   max_iter = 5000,
                                   n_jobs = -1, 
                                   class_weight='balanced')
    roc_model.fit(X_train, y_train)
    y_roc_train = roc_model.predict(X_train)
    roc_train_score = accuracy_score(y_train, y_roc_train)
    
    roc_model.fit(X_test, y_test)
    y_roc_test = roc_model.predict(X_test)
    roc_test_score = accuracy_score(y_test, y_roc_test)
    
    # F1
    solution['F1'] = hyperparams.cv_results_['mean_test_f1']
    best_f1 = results[np.argmin(hyperparams.cv_results_['rank_test_f1'])]
    f1_model = LogisticRegression(penalty = best_f1['classifier__penalty'],
                                  C = best_f1['classifier__C'],
                                  solver = best_f1['classifier__solver'],
                                  max_iter = 5000,
                                  n_jobs = -1, 
                                  class_weight='balanced')
    f1_model.fit(X_train, y_train)
    y_f1_train = f1_model.predict(X_train)
    f1_train_score = accuracy_score(y_train, y_f1_train)
    
    f1_model.fit(X_test, y_test)
    y_f1_test = f1_model.predict(X_test)
    f1_test_score = accuracy_score(y_test, y_f1_test)
    
    train_metrics = train_metrics.append({'Accuracy': acc_train_score, 'Precision': prec_train_score, 
                                          'AUC': roc_train_score, 'F1': f1_train_score}, ignore_index=True)
    
    test_metrics = test_metrics.append({'Accuracy': acc_test_score, 'Precision': prec_test_score, 
                                          'AUC': roc_test_score, 'F1': f1_test_score}, ignore_index=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [14]:
train_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.7222,0.7836,0.7828,0.7836
1,0.7928,0.7926,0.7858,0.7928
2,0.7152,0.7852,0.7792,0.785
3,0.0406,0.7832,0.7766,0.7832
4,0.7188,0.7928,0.79,0.7928


In [15]:
test_metrics

Unnamed: 0,Accuracy,Precision,AUC,F1
0,0.717867,0.782733,0.781733,0.782733
1,0.7786,0.7786,0.778267,0.778533
2,0.7202,0.7812,0.7814,0.7812
3,0.963333,0.784667,0.783467,0.784667
4,0.719,0.778333,0.777467,0.778333
