In [1]:
import pandas as pd

import numpy as np

import scipy.stats as s

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.impute import SimpleImputer

from Gaussian_Naive_Bayes import gaussian_nb

from imblearn.over_sampling import SMOTE

from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score,precision_score,recall_score

In [2]:
data = pd.read_csv("./aps_failure_training_set.csv")

In [3]:
class gaussian_nb_scania(gaussian_nb):
    
    def __init__(xerox_copy, data, non_missing_threshold, split_ratio, apply_pca_or_not, n_principal_components, data_check):
        
        data.replace(to_replace='na', value=np.nan, inplace = True)
        
        data.dropna(axis = 1, inplace = True, thresh=int(non_missing_threshold *data.shape[0]))
        
        xerox_copy.data_labels= data['class']
        
        imputer = SimpleImputer()

        data_array = imputer.fit_transform(X=data.iloc[:,1:])
        
        data_columns = data.columns
        
        data = pd.DataFrame(data = data_array, columns = data_columns[1:])
        
        if data_check == True:
            
            xerox_copy.data_unique_value_check =  xerox_copy.data_drop_column(data)
        
        np_array_list = list()

        
        for column in data.columns:
    
            data[column] = pd.qcut(x=data[column],q=10,duplicates='drop').cat.codes
        
            np_array_list.append(np.eye(10,10)[data[column]])
            
        data_array = np.concatenate(np_array_list, axis=1)
        
        data = pd.DataFrame(data=data_array)
        
        data['class'] = xerox_copy.data_labels
        
        xerox_copy.data = data
        
        super().__init__(features=data.iloc[:,0:data.shape[1]-1], labels=data['class'], data_split_ratio=split_ratio,
                         apply_pca=True, n_components= n_principal_components)
        
    def data_drop_column(xerox_copy, data):
            
        for column in data.columns:
                
            if len(data[column].unique()) == 1:
                    
                data.drop(columns = [column], inplace=True)
                    
            else:
                    
                continue

In [4]:
naive_bayes_configs = dict()

for non_na_thresh in np.arange(0.7,1,0.1):
    
    for n_comp in np.arange(20,170,50):
        
        naive_bayes_configs[(non_na_thresh, n_comp)] = gaussian_nb_scania(data, non_na_thresh, (0.8,0.2,0.0), True, n_comp, True)

In [5]:
naive_bayes_configs

{(0.7, 20): <__main__.gaussian_nb_scania at 0x1e6f9895c18>,
 (0.7, 70): <__main__.gaussian_nb_scania at 0x1e6f6dddc88>,
 (0.7, 120): <__main__.gaussian_nb_scania at 0x1e6f0e01978>,
 (0.7999999999999999, 20): <__main__.gaussian_nb_scania at 0x1e688d65668>,
 (0.7999999999999999, 70): <__main__.gaussian_nb_scania at 0x1e6fa8392e8>,
 (0.7999999999999999, 120): <__main__.gaussian_nb_scania at 0x1e6f975f128>,
 (0.8999999999999999, 20): <__main__.gaussian_nb_scania at 0x1e6fd0acb38>,
 (0.8999999999999999, 70): <__main__.gaussian_nb_scania at 0x1e6834a40f0>,
 (0.8999999999999999, 120): <__main__.gaussian_nb_scania at 0x1e681c0f080>,
 (0.9999999999999999, 20): <__main__.gaussian_nb_scania at 0x1e6f9f50eb8>,
 (0.9999999999999999, 70): <__main__.gaussian_nb_scania at 0x1e6f44ca4e0>,
 (0.9999999999999999, 120): <__main__.gaussian_nb_scania at 0x1e681334860>}

In [6]:
naive_bayes =list()

cv_data_list = list()


for obj in naive_bayes_configs.values():
    
    X_resampled, y_resampled = SMOTE(sampling_strategy='minority').fit_sample(X = obj.X_new , y=data['class'])
    
    data_resampled = pd.DataFrame(data = X_resampled)
    
    data_resampled['class'] = data['class']
    
    train_data, cv_data, test_data = obj.data_splitting(data_resampled)
    
    cv_data_list.append(cv_data)
     
    naive_bayes.append(GaussianNB().fit(X=np.array(train_data.iloc[:,0:train_data.shape[1]-1]), y = train_data['label']))

In [7]:
metrics = dict()

for obj,cv_data,config in tuple(zip(naive_bayes,cv_data_list,naive_bayes_configs.keys())):
    
    predicted_category = obj.predict(X=np.array(cv_data.iloc[:,0:cv_data.shape[1]-1]))
    
    acc = accuracy_score(y_true=np.array(cv_data['label']),y_pred=predicted_category)
    
    precision = precision_score(y_true=np.array(cv_data['label']),y_pred=predicted_category, pos_label='pos',zero_division=1)
    
    recall = recall_score(y_true=np.array(cv_data['label']),y_pred=predicted_category, pos_label='pos',zero_division=1)
    
    metrics[config] = {'accuracy':acc,'precision':precision,'recall':recall}

In [8]:
metrics

{(0.7, 20): {'accuracy': 0.93175, 'precision': 0.0, 'recall': 1.0},
 (0.7, 70): {'accuracy': 0.9478333333333333, 'precision': 0.0, 'recall': 1.0},
 (0.7, 120): {'accuracy': 0.9490833333333333, 'precision': 0.0, 'recall': 1.0},
 (0.7999999999999999, 20): {'accuracy': 0.93675,
  'precision': 0.0,
  'recall': 1.0},
 (0.7999999999999999, 70): {'accuracy': 0.9498333333333333,
  'precision': 0.0,
  'recall': 1.0},
 (0.7999999999999999, 120): {'accuracy': 0.95475,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 20): {'accuracy': 0.9345833333333333,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 70): {'accuracy': 0.9474166666666667,
  'precision': 0.0,
  'recall': 1.0},
 (0.8999999999999999, 120): {'accuracy': 0.95125,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999, 20): {'accuracy': 0.9145833333333333,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999, 70): {'accuracy': 0.9145833333333333,
  'precision': 0.0,
  'recall': 1.0},
 (0.9999999999999999

# Test Data

In [9]:
test_data = pd.read_csv("./aps_failure_test_set.csv")

In [13]:
test_obj = gaussian_nb_scania(test_data, 0.9, (1,0.0,0.0), True, 120, True)

In [14]:
test_metrics = dict()

predicted_category = naive_bayes[5].predict(X=np.array(test_obj.X_new))

acc = accuracy_score(y_true=np.array(test_data['class']),y_pred=predicted_category)

precision = precision_score(y_true=np.array(test_data['class']),y_pred=predicted_category,
                                 pos_label='pos',zero_division=1)

recall = recall_score(y_true=np.array(test_data['class']),y_pred=predicted_category,
                                 pos_label='pos',zero_division=1)

test_metrics['0.8, 120'] = {'accuracy':acc,'precision':precision,'recall':recall}


In [15]:
test_metrics

{'0.8, 120': {'accuracy': 0.956625,
  'precision': 0.32098765432098764,
  'recall': 0.7626666666666667}}