In [26]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
import seaborn as sbn
import pickle
import sys

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from itertools import permutations, combinations
from statsmodels.discrete.discrete_model import Logit

## Loading datafiles for classification model training and testing

In [27]:
# CSV files holding the samples used to fit and to evaluate the classifiers.
datafile_training, datafile_test = 'data_training.csv', 'data_test_.csv'

# Read both files with pandas' default CSV settings (training first, then test).
data_training, data_test = (
    pd.read_csv(csv_path) for csv_path in (datafile_training, datafile_test)
)

In [1]:
# Preview the first two rows of the training frame.
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is 1, so it appears to have been run before the loading cell on a fresh
# kernel -- re-run the notebook top to bottom to refresh the output.
data_training.head(2)

NameError: name 'data_training' is not defined

In [29]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training frame.
data_training.describe()

Unnamed: 0,obs12,prd12,stat12,pval12,obs13,prd13,stat13,pval13,obs14,prd14,...,obs46,prd46,stat46,pval46,obs56,prd56,stat56,pval56,leak,leak_num
count,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,...,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0,59000.0
mean,0.10986,-13.881032,0.892881,0.005166076,0.038555,-2640.91383,0.98756,0.004164638,0.063524,-3.58792,...,-0.020262,-294.486814,0.992114,0.004390056,0.02855,-591.112129,0.992111,0.004271135,0.991525,59.415254
std,0.255843,17.702161,0.199254,0.05587282,0.254073,395.755388,0.098186,0.05236559,0.252763,11.327108,...,0.256843,337.134192,0.085318,0.05420572,0.250598,1831.454569,0.085247,0.05301318,0.091667,34.197952
min,-0.963119,-70.08716,0.0314,0.0,-1.00702,-3519.277949,0.031667,0.0,-0.996001,-56.313202,...,-1.141242,-1632.841359,0.030867,0.0,-0.998205,-12950.62691,0.0276,0.0,0.0,0.0
25%,-0.063521,-18.413135,0.923188,0.0,-0.132144,-2716.718203,1.0,0.0,-0.107776,-3.581137,...,-0.195376,-285.601385,1.0,0.0,-0.139834,-249.961328,1.0,0.0,1.0,30.0
50%,0.10893,-15.298309,0.986667,1.110223e-16,0.037056,-2659.826761,1.0,0.0,0.063136,-1.388345,...,-0.019899,-179.809505,1.0,0.0,0.029224,-115.842987,1.0,0.0,1.0,59.5
75%,0.284371,-12.216419,1.0,5.551115e-16,0.211015,-2605.983649,1.0,3.330669e-16,0.235016,1.1868,...,0.155099,-133.288746,1.0,3.330669e-16,0.197411,-67.366249,1.0,3.330669e-16,1.0,89.0
max,1.116784,86.151853,1.0,0.9979465,1.130042,0.17638,1.0,0.9976783,1.131419,11.459782,...,1.010221,0.109683,1.0,0.9984091,1.163324,0.15909,1.0,0.999763,1.0,118.0


In [30]:
# Selecting features such as means, ks_stat or ks_pval

def feature_selection(data_df,features,output_col):
    """Return the feature columns matching ``features`` followed by the output column(s).

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame to select columns from.
    features : str or iterable of str
        Name fragments; a column is selected when any fragment is a substring
        of its name. Columns are ordered by fragment, then by their order in
        ``data_df``.
    output_col : str or iterable of str
        Label column name(s), always placed after the feature columns.

    Returns
    -------
    pandas.DataFrame
        ``data_df`` restricted to the selected columns. Every selected column
        appears once and the output column(s) are always last.

    Raises
    ------
    KeyError
        If an output column is not present in ``data_df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(features, str):
        features = [features]
    if isinstance(output_col, str):
        output_col = [output_col]
    output_col = list(output_col)

    columns = []
    # Label columns are pre-marked as seen so that a fragment matching a label
    # name can never copy the label into the feature part of the frame.
    seen = set(output_col)
    for feature in features:
        for col in data_df.columns:
            # Non-string column names cannot contain a name fragment.
            if isinstance(col, str) and feature in col and col not in seen:
                seen.add(col)
                columns.append(col)

    columns.extend(output_col)
    data_df_select = data_df[columns]
    return data_df_select

## Logistic Regression Model

In [31]:
def logistic_reg(output_type,
                 train_df_full,
                 test_df_full,
                 features): # 'obs','prd','stat','pval'
    """Fit a logistic-regression classifier and evaluate it on the test frame.

    Parameters
    ----------
    output_type : str
        'binary' uses the 'leak' column as the target with a default
        ``LogisticRegression``; any other value uses 'leak_num' with a
        class-balanced multinomial model.
    train_df_full, test_df_full : pandas.DataFrame
        Frames containing the feature columns and the target column.
    features : list of str
        Column-name fragments passed to ``feature_selection``
        (e.g. 'obs', 'prd', 'stat', 'pval').

    Returns
    -------
    conf_mat : numpy.ndarray
        Confusion matrix of test targets against predictions.
    beta : numpy.ndarray
        Fitted coefficients (``model.coef_``).
    report_df : pandas.DataFrame
        ``classification_report`` as a frame, one row per class/summary entry.

    Raises
    ------
    ValueError
        If ``features`` selects no feature column.
    """
    output_col = ['leak'] if output_type == 'binary' else ['leak_num']

    train_df = feature_selection(train_df_full,features,output_col)
    test_df = feature_selection(test_df_full,features,output_col)

    # feature_selection appends exactly one target column after the features,
    # so the target is the LAST column and everything before it is a feature.
    # Slicing off two columns (as was done previously) dropped a real feature
    # and, for 'binary', used a feature column as the target.
    train_arr = np.array(train_df)
    test_arr = np.array(test_df)
    xdata, ydata = train_arr[:,:-1], train_arr[:,-1]
    xdata_test, ydata_test = test_arr[:,:-1], test_arr[:,-1]

    if xdata.shape[1] == 0:
        raise ValueError('no feature columns matched features=%r' % (features,))

    # Scaling statistics come from the training data only and are reused on the test data.
    scaler = StandardScaler()
    xtrain_norm = scaler.fit_transform(xdata)
    xtest_norm = scaler.transform(xdata_test)

    if output_type=='binary':
        model=LogisticRegression()
    else:
        # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
        # argument -- confirm against the installed version before upgrading.
        model=LogisticRegression(multi_class='multinomial',class_weight='balanced',max_iter=5000)

    model.fit(xtrain_norm,ydata)
    beta = model.coef_
    ypred = model.predict(xtest_norm)
    report = classification_report(ydata_test,ypred,output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    conf_mat = confusion_matrix(ydata_test,ypred)

    return conf_mat, beta,report_df

In [32]:
def randomforest(output_type,
                 train_df_full,
                 test_df_full,
                 features, # 'obs','prd','stat','pval'
                 random_state=None):
    """Fit a random-forest classifier and evaluate it on the test frame.

    Parameters
    ----------
    output_type : str
        'binary' uses the 'leak' column as the target; any other value uses
        'leak_num'.
    train_df_full, test_df_full : pandas.DataFrame
        Frames containing the feature columns and the target column.
    features : list of str
        Column-name fragments passed to ``feature_selection``
        (e.g. 'obs', 'prd', 'stat', 'pval').
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. The default ``None``
        keeps the previous unseeded behaviour; pass an int for repeatable
        results.

    Returns
    -------
    conf_mat : numpy.ndarray
        Confusion matrix of test targets against predictions.
    report_df : pandas.DataFrame
        ``classification_report`` as a frame, one row per class/summary entry.

    Raises
    ------
    ValueError
        If ``features`` selects no feature column.
    """
    output_col = ['leak'] if output_type == 'binary' else ['leak_num']

    train_df = feature_selection(train_df_full,features,output_col)
    test_df = feature_selection(test_df_full,features,output_col)

    # feature_selection appends exactly one target column after the features,
    # so the target is the LAST column and everything before it is a feature.
    # Slicing off two columns (as was done previously) dropped a real feature
    # and, for 'binary', used a feature column as the target.
    train_arr = np.array(train_df)
    test_arr = np.array(test_df)
    xdata, ydata = train_arr[:,:-1], train_arr[:,-1]
    xdata_test, ydata_test = test_arr[:,:-1], test_arr[:,-1]

    if xdata.shape[1] == 0:
        raise ValueError('no feature columns matched features=%r' % (features,))

    # Scaling statistics come from the training data only and are reused on the test data.
    scaler = StandardScaler()
    xtrain_norm = scaler.fit_transform(xdata)
    xtest_norm = scaler.transform(xdata_test)

    model=RandomForestClassifier(random_state=random_state)

    model.fit(xtrain_norm,ydata)
    ypred = model.predict(xtest_norm)
    report = classification_report(ydata_test,ypred,output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    conf_mat = confusion_matrix(ydata_test,ypred)

    return conf_mat, report_df

## Results for the prediction-error-based approach

* Test set: 'no leak' data based on normal/expected demand, plus leak data

In [33]:
# Multi-class logistic regression restricted to the 'obs' and 'prd' feature columns.
lrcmat_mean, lrbeta_mean, lrreport_mean = logistic_reg(
    output_type='multi',
    train_df_full=data_training,
    test_df_full=data_test,
    features=['obs', 'prd'],
)

In [9]:
# Raise NumPy's summarisation threshold to the largest int so arrays print in full, without '...'.
np.set_printoptions(threshold=sys.maxsize)

In [10]:
# String labels '0' followed by '7' through '97'.
['0', *(str(label) for label in range(7, 98))]

['0',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97']

In [34]:
# Axis labels for the confusion matrix: '0'-'9' and '11'-'118' (label 10 is skipped).
# NOTE(review): the list is hard-coded; presumably class 10 does not occur in
# the data -- confirm it matches the classes behind lrcmat_mean.
conf_mat_labels = [str(label) for label in range(119) if label != 10]
lrcmat_pd = pd.DataFrame(data=lrcmat_mean, columns=conf_mat_labels, index=conf_mat_labels)

In [35]:
# Save the labelled logistic-regression confusion matrix as a CSV file (relative path).
lrcmat_pd.to_csv('log_reg_conf_mat.csv')

In [36]:
# Save the logistic-regression classification report as a CSV file (relative path).
lrreport_mean.to_csv('log_reg_acc_report.csv')

In [30]:
# Multi-class random forest restricted to the 'obs' and 'prd' feature columns.
cmat_mean, report_mean = randomforest(
    output_type='multi',
    train_df_full=data_training,
    test_df_full=data_test,
    features=['obs', 'prd'],
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Display the random-forest classification report (precision, recall, f1-score, support per class).
report_mean

Unnamed: 0,precision,recall,f1-score,support
0.0,1.0,0.92,0.958333,50.0
1.0,0.0,0.0,0.0,50.0
2.0,0.0,0.0,0.0,50.0
3.0,0.0,0.0,0.0,50.0
4.0,0.0,0.0,0.0,50.0
5.0,0.0,0.0,0.0,50.0
6.0,0.0,0.0,0.0,50.0
7.0,0.0,0.0,0.0,50.0
8.0,0.0,0.0,0.0,50.0
9.0,0.0,0.0,0.0,50.0
