In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import cv2

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt
import matplotlib.style  as style

from tqdm  import tqdm
from sklearn.metrics     import accuracy_score, roc_auc_score
from sklearn.linear_model      import LogisticRegression
from sklearn.model_selection   import train_test_split

In [None]:
!pip install scikit-learn  -U

In [None]:
# Load the image-level training table; later cells read its "id" and "label" columns.
df = pd.read_csv('../input/siim-covid19-detection/train_image_level.csv')

In [None]:
# (rows, columns) of the table as loaded.
df.shape

In [None]:
# Peek at the first rows of the raw table.
df.head()

In [None]:
# Binary class per row: "none" when the label string contains "none", otherwise "opacity".
df["study_class"] = df["label"].map(lambda label: "none" if "none" in label else "opacity")

In [None]:
# Drop the 2nd-4th columns by position.
# NOTE(review): positional + in-place, so re-running this cell drops further
# columns; restart the kernel and run from the top instead of re-executing it.
df.drop(columns=df.columns[1:4], inplace=True)

In [None]:
# Show the table that remains after the column drop.
df

In [None]:
# Distinct values of the derived class column.
df["study_class"].unique()

In [None]:
# Integer code for each class name, assigned in listing order.
study_class_to_num = {name: code for code, name in enumerate(["none", "opacity"])}

In [None]:
# Display the class-name -> integer mapping.
study_class_to_num

In [None]:
# Replace each class name with its integer code (an unknown name raises KeyError).
df["study_class"] = [study_class_to_num[name] for name in df["study_class"]]

In [None]:
# Check that study_class now holds the integer codes.
df.head()

In [None]:
# List one sample image file to confirm the directory and filename pattern used below.
!ls -l  '../input/siims-c19-64x64-image-study-png/image/000a312787f2_image.png'

In [None]:
# Path of each row's PNG, built from its "id".
df["file_path"] = [
    f'../input/siims-c19-64x64-image-study-png/image/{image_id}.png'
    for image_id in df["id"]
]

In [None]:
# Check the newly added file_path column.
df.head()

In [None]:
# Hold out 33% of the rows as a test set, stratified on study_class so both
# splits keep the same class proportions; seeded for a repeatable split.
train_df, test_df, train_y, test_y = train_test_split(
    df,
    df['study_class'],
    test_size=0.33,
    random_state=451,
    stratify=df['study_class'],
)

In [None]:
# Peek at the first rows of the training split.
train_df.head()

In [None]:
# # Split once more, so that we may produce a validation set
# #labels = train_df.pop('target')
# train_df, valid_df, train_y, Valid_y = train_test_split(train_df,
#                                                         train_df["study_class"],
#                                                         stratify     = train_df["study_class"],
#                                                         test_size    = 0.2,
#                                                         random_state = 451)

# # Reassemble labels
# # train_df['target'] = train_y
# # probe_df['target'] = probe_y

In [None]:
def plot_multiple_images(image_dataframe, rows = 4, columns = 4, figsize = (16, 20), preprocessing=None):
    '''
    Plot a grid of images listed in a dataframe.

    Reads up to ``rows * columns`` images from the ``file_path`` column (in row
    order), optionally passes each one through ``preprocessing``, and draws them
    in grayscale on a ``rows`` x ``columns`` grid. If the dataframe has fewer
    rows than grid cells, the remaining cells are simply left empty.

    Parameters
    ----------
    image_dataframe : pandas.DataFrame
        Must contain a ``file_path`` column of image paths.
    rows, columns : int
        Grid dimensions.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    preprocessing : callable, optional
        Applied to each image array before it is drawn.
    '''
    # Positional slice: independent of the dataframe's index and safe when the
    # frame holds fewer images than the grid has cells.
    image_paths = image_dataframe['file_path'].tolist()[:rows * columns]
    fig = plt.figure(figsize=figsize)

    for position, image_path in enumerate(image_paths, start=1):
        img = plt.imread(image_path)

        if preprocessing:
            img = preprocessing(img)

        # Draw on the subplot's own axes instead of pyplot's implicit "current axes".
        axis = fig.add_subplot(rows, columns, position)
        axis.set_title("Xray " + str(position))
        axis.imshow(img, alpha=1, cmap='gray')

    plt.show()

In [None]:
# Preview training images using the default 4 x 4 grid.
plot_multiple_images(train_df)

In [None]:
def load_image(image_path, image_dims = (128,128), grayscale=True, flatten=True, interpolation = cv2.INTER_AREA):
    '''
    Load an image from disk, optionally keeping one channel and flattening it.

    image_path    : path handed to cv2.imread.
    image_dims    : currently unused - resizing is disabled, so the image keeps
                    its on-disk dimensions. Kept so existing callers still work.
    grayscale     : if True, keep only the first channel of the decoded image.
    flatten       : if True, return a 1-D array instead of the image grid.
    interpolation : currently unused (only relevant to the disabled resize).

    Returns the image as a numpy array.
    Raises OSError if cv2.imread cannot read image_path.
    '''
    image = cv2.imread(image_path)

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising; fail here with the offending path rather than with an opaque
    # "NoneType is not subscriptable" further down.
    if image is None:
        raise OSError(f"cv2.imread could not read image: {image_path}")

    #image = cv2.resize(image, image_dims, interpolation = interpolation)

    if grayscale:
        image = image[:,:,0]

    if flatten:
        image = image.flatten()

    return image

In [None]:
def create_flattened_dataframe(df, interpolation = cv2.INTER_AREA):
    '''
    Build a pixel-feature table: one row per image, one column per pixel.

    Every path in df['file_path'] is loaded with load_image (using its default
    grayscale + flatten settings) and the resulting 1-D arrays are stacked in
    df's row order.

    df            : dataframe with a 'file_path' column.
    interpolation : forwarded to load_image.

    Returns a DataFrame with a 0..n-1 index and the pixel dtype produced by
    load_image (empty DataFrame when df has no rows). All images must flatten
    to the same length, otherwise np.stack raises ValueError.
    '''
    pixel_rows = [
        load_image(im_path, interpolation = interpolation)
        for im_path in tqdm(df['file_path'].tolist())
    ]

    # Nothing to stack: keep returning an empty frame for empty input.
    if not pixel_rows:
        return pd.DataFrame()

    # Build the frame once from a 2-D array. Growing it with DataFrame.append
    # re-copied the whole table on every iteration, and that method no longer
    # exists in pandas >= 2.0.
    return pd.DataFrame(np.stack(pixel_rows))

In [None]:
# Turn each split's images into a table of flattened pixel values (the model's features).
flat_train_df = create_flattened_dataframe(train_df)
#flat_valid_df = create_flattened_dataframe(valid_df)
flat_test_df = create_flattened_dataframe(test_df)

In [None]:
# Summary of the flattened training features (size, dtypes, memory use).
flat_train_df.info()

In [None]:
# parameters = {
#     'cls__estimator__penalty': ['l2'],
#     'cls__estimator__C': [1, 5, 7, 10],
#     'cls__estimator__max_iter': [50, 100, 300, 500],
#     'cls__estimator__solver' : ['lbfgs'],
# }

In [None]:
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# # define model
# lr = LogisticRegression(class_weight='balanced', n_jobs=32)
# #classifier = OneVsRestClassifier(lr, n_jobs=32)
# pipeline = Pipeline([
#     ('cls', LogisticRegression(class_weight='balanced', n_jobs=32), n_jobs=32),
# ])

# Hyper-parameter search space for the saga solver (l1 = lasso, l2 = ridge).
# - "No class weighting" / "no penalty" are spelled with the Python object None:
#   the string 'none' is not a valid class_weight, and penalty='none' was
#   removed from scikit-learn (penalty=None needs scikit-learn >= 1.2).
# - The unpenalised model ignores C, so it gets its own sub-grid instead of
#   being refitted once per C value.
grid = [
    {
        "C": [0.1, 1, 10, 100],
        "class_weight": [None],
        "penalty": ["l1", "l2"],
        "solver": ["saga"],
    },
    {
        "class_weight": [None],
        "penalty": [None],
        "solver": ["saga"],
    },
]

# saga shuffles the data, so seed it (same seed as the train/test split) to
# make the search repeatable.
lr = LogisticRegression(n_jobs=32, random_state=451)
lr_cv = GridSearchCV(lr, grid, cv = 3, verbose=3)

In [None]:
%time
# skf = StratifiedKFold(n_splits=2)
#grid_search_tune = GridSearchCV(pipeline, parameters, cv=skf.split(flat_train_df, train_df["study_class"]),  verbose=3)
lr_cv.fit(flat_train_df, train_df["study_class"])

print("Best Score: ", lr_cv.best_score_)
print("Best Params: ", lr_cv.best_params_)

In [None]:
# # Create Logistic Regression
# #logit_model = LogisticRegression(random_state=451, solver='lbfgs', n_jobs=-1)

# #from sklearn.multiclass import OneVsRestClassifier
# #from sklearn.linear_model import LogisticRegressionCV

# lr = LogisticRegression(class_weight='none',
#                         C=1,
#                         penalty='l2',
#                         max_iter=500,
#                         solver='saga',
#                         n_jobs=32)
# lr.fit(flat_train_df, train_df["study_class"])

In [None]:
# logit_preds_val  = lr.predict_proba(flat_test_df)
# evaluate_predictions(logit_preds_val[:,1], eval_df = test_df)
# lr.score(flat_test_df, test_df["study_class"])

In [None]:
def evaluate_predictions(preds, eval_df = test_df):
    '''
    Evaluate Predictions Function
    Prints and returns the accuracy and ROC AUC of a set of predicted scores.

    preds   : array-like of predicted scores, one per row of eval_df; a score
              >= 0.5 counts as a positive prediction for the accuracy.
    eval_df : dataframe whose 'study_class' column holds the true labels.
              The default is the test_df that exists when this cell is run.

    Returns (accuracy, auc).
    '''
    # Accept lists/Series as well as arrays for the element-wise threshold below.
    preds  = np.asarray(preds)
    y_true = eval_df['study_class'].astype('uint8')

    auroc = roc_auc_score(y_true, preds)
    accur = accuracy_score(y_true, preds >= 0.5)

    # Each label is printed with its own metric (they were previously swapped).
    print('Accuracy: ' + str(accur))
    print('AUC: ' + str(auroc))

    return accur, auroc

In [None]:
# # Evaluate Model Results - Validation Set
# logit_preds_val  = logit_model.predict_proba(flat_valid_df)
# #evaluate_predictions(logit_preds_val[:,1], eval_df = valid_df)
# logit_model.score(flat_valid_df, valid_df["study_class"])

In [None]:
# Evaluate model results on the held-out test set
# (predict_proba column 1 is passed on as the score for class 1).
logit_preds_val  = lr_cv.predict_proba(flat_test_df)
evaluate_predictions(logit_preds_val[:,1], eval_df = test_df)
lr_cv.score(flat_test_df, test_df["study_class"])

In [None]:
from sklearn import metrics

# Create the axes up front so the figure size and title can be set
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Confusion Matrix')

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces it.
disp = metrics.ConfusionMatrixDisplay.from_estimator(lr_cv, flat_test_df, test_df["study_class"], ax = ax)
disp.confusion_matrix