In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import time
import pickle
import time
from sklearn.neighbors import LocalOutlierFactor

## `final_function` – returns the model prediction for raw input rows

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at artifacts produced by a trusted training run.
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


def final_function(X):
    """Run the full inference pipeline on raw rows and return the classifier output.

    Steps, in order:
      1. Load the fitted artifacts from the working directory
         (``mm_normalizer.pkl``, ``imputer_knn.pkl``, ``lof.pkl``,
         ``top_15.csv`` and ``models//xgb.pkl``).
      2. Replace the literal string ``'na'`` with NaN and drop ``bq_000`` / ``br_000``.
      3. Normalize, impute, then drop the second list of features.
      4. Add the engineered columns ``null_count``, ``lof`` and, for every
         name in the ``Features`` column of ``top_15.csv``, ``<name>_25`` and
         ``<name>_75``.
      5. Call ``predict`` on the loaded classifier.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature rows without the target column. Must contain every column
        listed in ``features_to_remove_1`` and ``features_to_remove_2`` as well
        as the features named in ``top_15.csv``; otherwise pandas raises
        ``KeyError``. The caller's frame is not modified.

    Returns
    -------
    The value returned by ``clf_best.predict`` for the engineered frame
    (one prediction per input row; callers in this notebook read element 0).

    Raises
    ------
    FileNotFoundError
        If any of the artifact files is missing.
    KeyError
        If an expected column is absent from ``X``.
    """
    # Fitted preprocessing objects and the final model. Each file handle is
    # closed as soon as its object has been unpickled.
    mm          = _load_pickle('mm_normalizer.pkl')
    KNN_imputer = _load_pickle('imputer_knn.pkl')
    lof         = _load_pickle('lof.pkl')
    top_15      = pd.read_csv('top_15.csv')
    clf_best    = _load_pickle('models//xgb.pkl')

    features_to_remove_1 = ['bq_000','br_000']
    features_to_remove_2 = ['ah_000','am_0','an_000','ao_000','aq_000','ba_001','ba_002','ba_003','ba_004','ba_005','ba_006',
                            'bb_000','bg_000','bh_000','bi_000','bj_000','bl_000','bm_000','bn_000','bo_000','bp_000','bt_000',
                            'bu_000','bv_000','by_000','cc_000','cf_000','ci_000','cn_004','cn_005','co_000','cq_000','cs_005',
                            'dc_000','dn_000','dp_000','dt_000','ed_000','ee_000','ee_001','ee_002','ee_003','ee_004']

    # The raw data encodes missing values as the string 'na'.
    X = X.replace('na', np.nan)
    X = X.drop(features_to_remove_1, axis=1)

    # NORMALIZING FOLLOWED BY MISSING VALUE IMPUTATION FOLLOWED BY REMOVING HIGHLY CORRELATED FEATURES
    X_norm = pd.DataFrame(mm.transform(X), columns=X.columns)
    X_imputed = pd.DataFrame(KNN_imputer.transform(X_norm), columns=X_norm.columns)
    X_imputed = X_imputed.drop(features_to_remove_2, axis=1)

    # FEATURE ENGINEERING
    # 1. Missing values per row, counted on the pre-imputation frame (which
    #    still holds the features_to_remove_2 columns). Converted to a plain
    #    array so the assignment is positional and ignores X's index.
    X_imputed['null_count'] = X.isnull().sum(axis=1).to_numpy()

    # 2. Output of the loaded LOF model; it is given the frame *including*
    #    the null_count column added above.
    X_imputed['lof'] = lof.predict(X_imputed)

    # 3. Offset of each top feature from its 25th / 75th percentile.
    # NOTE(review): the percentiles are taken over the rows passed to this
    # call, not over stored training statistics. For a single-row input both
    # percentiles equal the row's own value, so every *_25 / *_75 column is 0.
    # Confirm this matches how the features were built when the model was fit.
    top_features = list(top_15['Features'])
    quantiles = {
        feature: np.percentile(X_imputed[feature], [25, 75])
        for feature in top_features
    }
    for feature in top_features:
        lower_q, upper_q = quantiles[feature]
        X_imputed[feature + '_25'] = X_imputed[feature] - lower_q
        X_imputed[feature + '_75'] = X_imputed[feature] - upper_q

    ## BEST CLASSIFIER
    return clf_best.predict(X_imputed)

## Performance Evaluation

### Importing Datapoints

In [3]:
# Read the two single-row sample files; the first CSV column becomes the index.
positive_datapoint, negative_datapoint = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('positive_datapoint.csv', 'negative_datapoint.csv')
)

In [5]:
# Show the positive sample as loaded from positive_datapoint.csv.
positive_datapoint

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,1,72504,na,1594,1052,0,0,0,244,178226,...,1432098,372252,527514,358274,332818,284178,3742,0,0,0


In [6]:
# Show the negative sample as loaded from negative_datapoint.csv.
negative_datapoint

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,0,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0


In [7]:
# Remove the target column so only feature columns are passed to final_function.
# errors='ignore' keeps this cell safe to re-run: it overwrites its own inputs,
# so on a second execution the 'class' column is already gone.
positive_datapoint = positive_datapoint.drop(columns=['class'], errors='ignore')
negative_datapoint = negative_datapoint.drop(columns=['class'], errors='ignore')

### Checking Prediction for positive Datapoint

In [10]:
# Time one end-to-end prediction for the positive sample.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
prediction = final_function(positive_datapoint)
end = time.perf_counter()  # stop before printing so only the prediction is timed

if prediction[0]==0:
    print('Failure is not for the APS system')
else:
    print('There is a failure in the APS system')

print(f"Runtime of the program is {end - start}")

There is a failure in the APS system
Runtime of the program is 0.41842079162597656


### Checking Prediction for negative Datapoint

In [11]:
# Time one end-to-end prediction for the negative sample.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
prediction = final_function(negative_datapoint)
end = time.perf_counter()  # stop before printing so only the prediction is timed

if prediction[0]==0:
    print('Failure is not for the APS system')
else:
    print('There is a failure in the APS system')

print(f"Runtime of the program is {end - start}")

Failure is not for the APS system
Runtime of the program is 0.38103389739990234
