# LogisticRegression
This notebook will contain my attempts at running logistic regrssion on the different datasets.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from sklearn.metrics import accuracy_score, auc, roc_auc_score, make_scorer
from sklearn.linear_model import LogisticRegression, RidgeClassifier

## eBoss Data

Reading in the eboss data

In [3]:
raw_data_eBoss = pd.read_csv("../../../Data/Astronomy.csv")

Taking out the id column

In [9]:
trainable_data = raw_data_eBoss.iloc[:, 1:].copy(deep=True)

Seperating the X, and Y variables, and doing needed preprocessing.

In [10]:
X = trainable_data.iloc[:, :-1].copy(deep=True)
Y = trainable_data.iloc[:, -1:].copy(deep=True)
X = X.apply(pd.to_numeric, args={'errors': 'coerce'})
X = X.fillna('0')
Y = Y.Hits.map({'bad': 0, 'good': 1})

ss = MinMaxScaler()
X_scaled = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = X_scaled

In [11]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params.
params_grid = {"C": [10, 100]}

# Running the cross-validation with 10 folds using Logistic Regression Algorithm.
grid_clf = GridSearchCV(LogisticRegression(),
                        cv=KFold(n_splits=10, shuffle=True),
                        param_grid=params_grid,
                        scoring=scoring_dict,
                        refit='accuracy', 
                        return_train_score=False).fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric['params'] = grid_clf.cv_results_['params']
relevant_metric

{'mean_test_precision (no)': array([0.8727758 , 0.86032028]),
 'mean_test_precision (yes)': array([0.94076012, 0.94157107]),
 'mean_test_recall (no)': array([0.79280913, 0.78810371]),
 'mean_test_recall (yes)': array([0.96782768, 0.96804336]),
 'mean_test_accuracy': array([0.92882562, 0.92882562]),
 'mean_test_auc': array([0.54982206, 0.54982206]),
 'mean_test_roc_auc': array([0.8803184 , 0.87807354]),
 'mean_test_f1': array([0.95364293, 0.9541764 ]),
 'params': [{'C': 10}, {'C': 100}]}

In [12]:
grid_clf.best_score_, grid_clf.best_params_

(0.9288256227758007, {'C': 10})

# Manga Data:

In [5]:
raw_data_Manga = pd.read_csv("../../../Data/Astronomy20000_Original.csv")

Seperating the X, and Y variables, and doing needed preprocessing.

In [7]:
X = raw_data_Manga.iloc[:, 1:-1].copy(deep=True)
Y = raw_data_Manga.iloc[:, [-1]].copy(deep=True)
X = X.iloc[:, np.r_[2:14, 26:len(X.columns)]].copy(deep=True)
X = X.fillna('0')
Y = Y.Hits.map({'bad': 0, 'good': 1})

ss = MinMaxScaler()
X_scaled = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = X_scaled

In [8]:
X.head()

Unnamed: 0,o2sn,emsn1,emsn2,emsn3,emsn4,emsn5,emsn6,emsn7,emsn8,emsn9,...,HIT_FWHM_data10,G_FAIL_data10,EMLINE_data11,HIT_PAR1_data11,HIT_PAR2_data11,HIT_PAR3_data11,HIT_PAR4_data11,HIT_CHI2_data11,HIT_FWHM_data11,G_FAIL_data11
0,0.032818,0.0,0.01726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.836414,0.774998,0.000662,0.112789,4.936936e-05,0.607714,0.0
1,0.02648,0.0,0.003182,0.0,0.0,0.000328,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.702873,0.778544,0.000305,0.112776,0.0001424371,0.46832,0.0
2,0.019147,0.065948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.90225,0.77944,0.003599,0.112765,9.868064e-06,0.330403,0.0
3,0.038789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.938823,0.786979,0.004008,0.112762,2.829325e-08,0.308263,0.0
4,0.050166,0.0,0.030492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.850297,0.753776,0.002602,0.112798,0.0002814745,0.70981,0.0


Running CV with LR

In [15]:
# Defining all the scoring metrics that we need to keep track of.
scoring_dict = {
    'precision (no)': make_scorer(precision_score, pos_label=0),
    'precision (yes)': make_scorer(precision_score, pos_label=1),
    'recall (no)': make_scorer(recall_score, pos_label=0),
    'recall (yes)': make_scorer(recall_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(auc, reorder=True),
    'roc_auc': make_scorer(roc_auc_score),
    'f1': make_scorer(f1_score),
}

# Setting up the params.
params_grid = {"C": [.01, .1, 1, 10, 100]}

# Running the cross-validation with 10 folds using Logistic Regression Algorithm.
grid_clf = GridSearchCV(LogisticRegression(),
                        cv=KFold(n_splits=10, shuffle=True),
                        param_grid=params_grid,
                        scoring=scoring_dict,
                        refit='accuracy', 
                        return_train_score=False).fit(X,Y)
grid_clf.fit(X,Y)

# Extracting relevant metrics. More detailed metrics are available in the cv_results_ object.
relevant_metric = {'mean_test_{}'.format(k): grid_clf.cv_results_['mean_test_{}'.format(k)] for k in scoring_dict.keys()}
relevant_metric

{'mean_test_precision (no)': array([0.82702501, 0.83968753, 0.84337318, 0.84792721, 0.85175248]),
 'mean_test_precision (yes)': array([0.77557912, 0.86558419, 0.8890305 , 0.8883092 , 0.88263552]),
 'mean_test_recall (no)': array([0.96196754, 0.97738423, 0.98129677, 0.98033174, 0.97846586]),
 'mean_test_recall (yes)': array([0.39580484, 0.43957109, 0.45255762, 0.47187259, 0.48851241]),
 'mean_test_accuracy': array([0.82050789, 0.84306464, 0.84926895, 0.85335706, 0.8560985 ]),
 'mean_test_auc': array([0.5, 0.5, 0.5, 0.5, 0.5]),
 'mean_test_roc_auc': array([0.67888619, 0.70847766, 0.71692719, 0.72610217, 0.73348914]),
 'mean_test_f1': array([0.52387532, 0.5829154 , 0.59963889, 0.61611787, 0.62871265])}

In [16]:
grid_clf.best_score_, grid_clf.best_params_

(0.856098499422855, {'C': 100})