In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns

# Question 1 

For this question, you only need to do programming for Part 1.1 - 1.6. You will only be asked to provide answers in Part 1.7. 

In [44]:
PATH_TO_Q1_DATA = 'data/HW3_Q1_DATA_10000.csv' #TODO: Change if your path to data is different
df_original = pd.read_csv(PATH_TO_Q1_DATA)


### Data Exploration

In [45]:
df_original.shape

(9999, 19)

In [46]:
df_original.head()

Unnamed: 0,Sex,Intubated,Pneumonia,Age,Pregnant,Diabetes,COPD,Asthma,Immunocompromised,Hypertension,Other_Disease,Cardiovascular_disease,Obesity,Renal_disease,Smoker,Exposure_to_others_with_COVID,Has_COVID,ICU,Died
0,F,N,N,54,N,N,N,N,N,N,N,N,Y,N,N,,Y,N,N
1,M,N,Y,30,,N,N,N,N,N,N,N,N,N,N,,Y,N,N
2,F,N,N,60,N,Y,N,N,N,Y,N,Y,N,N,N,,Y,N,Y
3,M,N,Y,47,,Y,N,N,N,N,N,N,N,N,N,,Y,Y,Y
4,M,N,N,63,,N,N,N,N,Y,N,N,N,N,N,,Y,N,N


### Encode data without modifying missing values

In [47]:
# convert categorical data to numerical values
# drop one of each of the binary categories because it is not necessary (all info is encoded by one column)
# add column for missing data
# ?? drop missing values instead
df=pd.get_dummies(df_original, drop_first=True, dummy_na=True)

In [48]:
df.head()

Unnamed: 0,Age,Sex_M,Sex_nan,Intubated_Y,Intubated_nan,Pneumonia_Y,Pneumonia_nan,Pregnant_Y,Pregnant_nan,Diabetes_Y,...,Renal_disease_nan,Smoker_Y,Smoker_nan,Exposure_to_others_with_COVID_Y,Exposure_to_others_with_COVID_nan,Has_COVID_nan,ICU_Y,ICU_nan,Died_Y,Died_nan
0,54,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,30,1,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,60,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
3,47,1,0,0,0,1,0,0,1,1,...,0,0,0,0,1,0,1,0,1,0
4,63,1,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [49]:
df.sum(axis='rows')

Age                                  551802
Sex_M                                  6267
Sex_nan                                   0
Intubated_Y                             985
Intubated_nan                            11
Pneumonia_Y                            6644
Pneumonia_nan                             0
Pregnant_Y                               72
Pregnant_nan                           6280
Diabetes_Y                             3104
Diabetes_nan                             63
COPD_Y                                  376
COPD_nan                                 50
Asthma_Y                                207
Asthma_nan                               49
Immunocompromised_Y                     271
Immunocompromised_nan                    53
Hypertension_Y                         3455
Hypertension_nan                         52
Other_Disease_Y                         451
Other_Disease_nan                        78
Cardiovascular_disease_Y                415
Cardiovascular_disease_nan      

In [50]:
# separate into Features X and Target y
y = df.loc[:]['Died_Y']
print(y.shape)
X = df.iloc[:,:-2]  # remove last two columns
print(X.shape)

(9999,)
(9999, 34)


### DataFrame to store test metrics

In [221]:
# dataframe to collect results of all tests
df_results = pd.DataFrame(
    columns=['Test',
             'Description',
             'Accuracy',
             'Sensitivity',
             'Specificity',
             'Pos_Predictive_Val',
             'Neg_Predictive_Val',
             'F1_Score',
             'Matthew_Corr_Coef',
             'AUC'
            ]).astype(
    dtype= {'Test':'object',
            'Description':'object',
            'Accuracy':'float64',
            'Sensitivity':'float64',
             'Specificity':'float64',
             'Pos_Predictive_Val':'float64',
             'Neg_Predictive_Val':'float64',
             'F1_Score':'float64',
             'Matthew_Corr_Coef':'float64',
             'AUC':'float64'
           })

## Part 1.1 Select features and train classifiers

### Part 1.1 Work 

In [51]:
# TODO: 
#   Step 1: Select and apply a filter-based or wrapper-based feature selection method to the data.
#   Step 2: Train a classifier using the selected features. Use 10-fold cross validation.

# Tip: 
#   1. You may find the filter-based or wrapper-based methods you used in HW2 useful. 

#   2. Scikit-learn implement many classifiers, see the comparisons of their 
#   performance and introductions here:
#   https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

#   3. Scikit-learn also implement classifiers with built-in cross validations,
#   for example: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html

#### Define wrapper-based feature selection method and classifier

In [52]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

def perform_feature_selection_wrapper(X,y):
    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc, step=1, cv=5,scoring='accuracy',n_jobs=3)
    return rfecv.fit(X, y)
  
def get_feature_list_wrapper(X, support):
    features = list()
    for i,val in enumerate(support):
        if val:
            features.append(X.columns[i]) # feature was selected by wrapper method
        
    return features

In [56]:
from sklearn.linear_model import RidgeClassifierCV

def build_X_from_features(X, list_features):
    return X.loc[:][list_features]

def train_classifier(X,y,k):
    clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1],cv=k)
    return clf.fit(X, y)

#### Perform feature selection and classification

In [54]:
%%time 
# runtime - 3.4 s (1000 records, 10 fold, 3 cores)
# runtime - 6 min (10000 records, 10 fold, 3 cores)

# feature selection
rfecv = perform_feature_selection_wrapper(X,y)
features_wrapper = get_feature_list_wrapper(X,rfecv.support_)

CPU times: user 1min 48s, sys: 1.27 s, total: 1min 49s
Wall time: 5min 29s


In [58]:
%%time

# classification
X_wrapper = build_X_from_features(X,features_wrapper)

clf = train_classifier(X_wrapper,y,10)

CPU times: user 1.11 s, sys: 39.8 ms, total: 1.15 s
Wall time: 304 ms


#### Calculate metrics
- Accuracy
- Sensitivity & Specificity
- The positive and negative predictive values 
- F1-score
- The Matthews Correlation Coefficient
- AUC (Area under the ROC curve)

In [117]:
from sklearn.metrics import confusion_matrix

def get_confusion_matrix(y_true,y_predict):
    '''
    Calculates the confusion matrix for a given X,y and classifier.
    '''
    # confusion matrix
    cm = confusion_matrix(y_true, y_predict)
    tn, fp, fn, tp = cm.ravel()
    
    print('TN:',tn)
    print('FP:',fp)
    print('FN:',fn)
    print('TP:',tp)
    print('TOTAL:',tp + tn + fp + fn)
    
    return cm

In [208]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

def calc_classifier_metrics(cm, y_true, y_predict):
    
    tn, fp, fn, tp = cm.ravel()
    
    # accuracy
    acc = (tp + tn) / (tp + tn + fp + fn)
    
    # sensitivity
    sen = tp / (tp + fn)
    
    # specificity
    spec = tn / (tn + fp)
    
    # precision
    prec = tn / (tn + fp)
    
    # positive predictive value
    ppv = tp / (fp + tp) # ?? check correctness
    
    # negative predictive value
    npv = tn / (fn + tn)
    
    # f1 score
    f1 = 2 * (prec * sen) / (prec + sen) 
    
    # matthew correlation coefficient
    mcc = matthews_corrcoef(y_true,y_predict)
    
    # area Under the receiver (AUC)
    auc = roc_auc_score(y_true, y_predict)
    
    # return numpy array  
    metrics = [acc,sen,spec,ppv,npv,f1,mcc,auc]
    return metrics

In [209]:
y_predict = clf.predict(X_wrapper)
cm = get_confusion_matrix(y,y_predict)

TN: 5556
FP: 817
FN: 2013
TP: 1613
TOTAL: 9999


In [210]:
wrapper_results = calc_classifier_metrics(cm,y,y_predict)
wrapper_results

[0.716971697169717,
 0.4448428019856591,
 0.8718029185626863,
 0.6637860082304526,
 0.734046769718589,
 0.589095831961823,
 0.3549254099545765,
 0.6583228602741727]

#### Add metrics to results DataFrame

In [225]:
def add_row(df, test, description, metrics):
    
    data = [test,description] + metrics
    print(data)  
    new_row = pd.Series(data, index = df.columns)
    print(new_row)
    return df.append(new_row,ignore_index=True)

In [226]:
df_results_2 = add_row(df_results,'A','B',wrapper_results )

['A', 'B', 0.716971697169717, 0.4448428019856591, 0.8718029185626863, 0.6637860082304526, 0.734046769718589, 0.589095831961823, 0.3549254099545765, 0.6583228602741727]
Test                         A
Description                  B
Accuracy              0.716972
Sensitivity           0.444843
Specificity           0.871803
Pos_Predictive_Val    0.663786
Neg_Predictive_Val    0.734047
F1_Score              0.589096
Matthew_Corr_Coef     0.354925
AUC                   0.658323
dtype: object


In [227]:
df_results_2

Unnamed: 0,Test,Description,Accuracy,Sensitivity,Specificity,Pos_Predictive_Val,Neg_Predictive_Val,F1_Score,Matthew_Corr_Coef,AUC
0,A,B,0.716972,0.444843,0.871803,0.663786,0.734047,0.589096,0.354925,0.658323


In [228]:
df_results_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Test                1 non-null      object 
 1   Description         1 non-null      object 
 2   Accuracy            1 non-null      float64
 3   Sensitivity         1 non-null      float64
 4   Specificity         1 non-null      float64
 5   Pos_Predictive_Val  1 non-null      float64
 6   Neg_Predictive_Val  1 non-null      float64
 7   F1_Score            1 non-null      float64
 8   Matthew_Corr_Coef   1 non-null      float64
 9   AUC                 1 non-null      float64
dtypes: float64(8), object(2)
memory usage: 208.0+ bytes


### Part 1.2 Work 

In [None]:
# TODO: 
#   Step 1: Select a learning algorithm that performs embedded feature selection. 
#   Step 2: Train a classifier using the selected features. Use 10-fold cross validation.

# Tip: 
#   1. Scikit-learn implement many classifiers, see the comparisons of their 
#   performance and introductions here:
#   https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

#   2. Scikit-learn also implement classifiers with built-in cross validations,
#   for example: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html

#### Perform feature selection using embedded method

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

def train_embedded_model(X,y):
    sel = SelectFromModel(RandomForestClassifier())
    return sel.fit(X, y)

In [None]:
X_train_embedded,y_train_embedded,X_test_embedded,y_test_embedded = train_test_split(data,test_size=0.3)

#### Train classifier using selected features from embedded method

### Part 1.3 Work 

In [None]:
# TODO: 
#   Step 1: Select and apply a data imputation method to handle the missing data.
#   Step 2: Apply the  wrapper based feature selection method you used in part 1.1.
#   Step 3: Train a classifier using the selected features. Use the same classifier you used in part 1.1. Use 10-fold cross validation.

# Tip: 
#   1. Sciki-learn implements different imputation methods. Take a look at https://scikit-learn.org/stable/modules/impute.html
#   and use the one you think most appropriate.

### Part 1.4 Work 

In [None]:
# TODO: 
#   Step 1: Apply a data imputation method to eliminate any missing values in the data. Use the same method you used in part 1.3. 
#   Step 2: Train a classifier. Use the same classifier you used in part 1.2. Use 10-fold cross validation.

### Part 1.5 Work 

In [None]:
# TODO: 
#   Step 1: Apply a data imputation method to eliminate any missing values in the data. Use the same method you used in parts 1.3 & 1.4. 
#   Step 2: Select a learning algorithm that performs cost-sensitive learning. 
#   Step 3: Adjust the costs until you find a classifier that maximizes the F1-score, subject to the constraint that it achieves 95% sensitivity for the label ‘Y’.  Use 10-fold cross validation.

# Tip: 
#   1. F1-score: https://en.wikipedia.org/wiki/F1_score
#   2. Scikit-learn supports extending the classifiers to cost-sensitive learning.
#   Take a look at this tutorial: https://machinelearningmastery.com/cost-sensitive-learning-for-imbalanced-classification/

### Part 1.6 Work 

In [None]:
# TODO: 
#   Step 1: Implement a function calculating the weighted average F1-score, following 
#       the steps in the homework problem statement.
#   Step 2: Find a classifier that achieves a weighted average F1-score of at least 0.74 using 10-fold cross validation.

# Tip: 
#   1. F1-score: https://en.wikipedia.org/wiki/F1_score
#   2. Scikit-learn supports extending the classifiers to cost-sensitive learning.
#   Take a look at this tutorial: https://machinelearningmastery.com/cost-sensitive-learning-for-imbalanced-classification/

### Part 1.7 Work

In [None]:
# Tip:
#   1. Scikit-learn implements different evaluation metrics for classifications,
#   see: https://scikit-learn.org/stable/modules/model_evaluation.html

### Part 1.7 Answers

1. Create a ROC plot with the results from parts 1.1 to 1.6. 
2. Create a table with the following performance metrics for the results from parts 1.1 to 1.6:
    * Accuracy
    * Sensitivity & Specificity
    * The positive and negative predictive values
    * F1-score
    * The Matthews Correlation Coefficient
    * AUC (Area under the ROC curve)