In [None]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq #reading parquet files
import matplotlib.pyplot as plt
import os
import seaborn as sns
import scipy
from statsmodels.robust import mad
import collections
from scipy.signal import *
import random
import gc



#used for plotting
import plotly.graph_objects as go
#used for feature engineering(signal processing tools)
from scipy.fftpack import fft
from scipy.signal import welch
#from siml.sk_utils import *
#from siml.signal_analysis_utils import *

from tqdm import tqdm


## Sklearn Libraries
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, classification_report, recall_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import SGDRegressor
from prettytable import PrettyTable


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training metadata table and preview its first six rows.
metadata_train = pd.read_csv('../input/vsb-power-line-fault-detection/metadata_train.csv')
metadata_train.head(6)

In [None]:
# Show the (rows, columns) dimensions of the metadata table.
metadata_train.shape

In [None]:
# Check that every consecutive group of three metadata rows carries the
# phases 0, 1, 2 in that order.
# NOTE(review): assumes each id_measurement occupies three consecutive rows —
# confirm against the metadata file.
phase_values = metadata_train['phase'].values
phases_ok = True
for start in range(0, len(phase_values), 3):
    # Slice rows start..start+2 so that each group is inspected, not just the first one.
    if list(phase_values[start:start + 3]) != [0, 1, 2]:
        print("error")
        phases_ok = False
        break

# Only report success when no group failed the check.
if phases_ok:
    print('There are 3 phases 0, 1, 2 for each id_measurement')

In [None]:
# Pairwise correlation between the metadata columns.
metadata_train.corr()

In [None]:
# Report how many distinct values each metadata column holds.
for column_name in metadata_train.columns:
    distinct_count = len(metadata_train[column_name].unique())
    print("Number of unique values in ", column_name, "is", distinct_count)

In [None]:
# Bar chart of the target classes, each bar labelled with its count and share.
plt.figure(figsize=(16, 5))
splot = sns.countplot(x='target', data=metadata_train)
total_signals = metadata_train['target'].shape[0]
for ind, p in enumerate(splot.patches):
    # The bar index is used directly as the target value it represents.
    class_count = metadata_train[metadata_train['target'] == ind].shape[0]
    percent = np.round((class_count / total_signals) * 100, 2)
    label_position = (p.get_x() + p.get_width() / 3, p.get_height())
    splot.annotate(str(class_count) + f" ({percent}%)", label_position)
plt.title("Distribution of target classes")
plt.show()

In [None]:
# Bar chart of signal counts per phase, split by target class.
plt.figure(figsize=(16, 5))
splot = sns.countplot(x="phase", data=metadata_train, hue="target")
# Number of signals recorded on phase 0; used as the denominator for every bar
total_phases = metadata_train[metadata_train['phase'] == 0].shape[0]
num_points = []
for ind, p in enumerate(splot.patches):
    # Bar indices 0-2 are treated as target 0 and indices 3-5 as target 1,
    # with the phase cycling 0, 1, 2 inside each group.
    tar, phase = divmod(ind, 3)
    selected = (metadata_train['target'] == tar) & (metadata_train['phase'] == phase)
    # Number of signals with this (target, phase) combination
    num_points.append(metadata_train.loc[selected]['phase'].shape[0])
    # Share of that count relative to the phase-0 total
    num_phase_percent = np.round((num_points[-1] / total_phases) * 100, 2)
    bar_label = str(num_points[-1]) + f" ({num_phase_percent}%)"
    splot.annotate(bar_label, (p.get_x(), p.get_height()))
plt.title("Distribution of classes with respect to each phase")
plt.show()

In [None]:
# Check whether the three rows of each measurement group share one target label.
for group_start in range(0, len(metadata_train), 3):
    reference_target = metadata_train.loc[group_start]['target']
    mismatch_found = False
    for offset in (1, 2):
        other_target = metadata_train.loc[group_start + offset]['target']
        if reference_target != other_target:
            # Show the offending group of three rows
            print(metadata_train.loc[group_start:group_start + 2])
            mismatch_found = True
    if mismatch_found:
        print("The same id_measurement does not means the same target value!!! ")
        break
        

In [None]:
# Read only the columns named '0'..'99' of the training parquet file into a DataFrame.
df1 = pq.read_pandas('../input/vsb-power-line-fault-detection/train.parquet', columns=[str(i) for i in range(100)]).to_pandas()

In [None]:
# Plot the signals of the first 12 metadata rows, three traces per panel.
sig_list = metadata_train.head(12).values
plt.figure(figsize=(20,8))
n_rows = 4
n_cols = 1
for ind, val in enumerate(sig_list):
    # Rows 0-2 share panel 1, rows 3-5 panel 2, and so on
    panel_number = ind // 3 + 1
    trace_label = f"signal_id-{val[0]}, phase-{val[2]}, id_measurement-{val[1]}"
    plt.subplot(n_rows, n_cols, panel_number)
    plt.plot(df1[str(val[0])].values, label=trace_label)
    plt.legend()
    plt.title(f'Power Line Signal with target={val[3]}')

    plt.xlabel('Samples')
    plt.ylabel('Amplitude')
plt.show()

**Feature Extraction**

In [None]:
def describe_freq(signal):
    """Summarise the amplitude distribution of a 1-D signal.

    Parameters
    ----------
    signal : array-like
        One-dimensional sequence of numeric samples.

    Returns
    -------
    list
        ``[mean, std, max, min, median, skew, kurtosis, q10, q25, q75, q90,
        mode, iqr, rms]`` in that order (the last entry is the root mean
        square of the samples).

    Raises
    ------
    ValueError
        If ``signal`` is empty (raised by the NumPy reductions).
    """
    samples = np.asarray(signal)
    mean = np.mean(samples)
    std = np.std(samples)
    maxv = np.amax(samples)
    minv = np.amin(samples)
    median = np.median(samples)
    skew = scipy.stats.skew(samples)
    kurt = scipy.stats.kurtosis(samples)
    q0 = np.quantile(samples, 0.10)
    q1 = np.quantile(samples, 0.25)
    q3 = np.quantile(samples, 0.75)
    q4 = np.quantile(samples, 0.90)
    # Most frequent value. np.unique returns sorted values and argmax picks the
    # first maximum, so ties resolve to the smallest value. This avoids
    # indexing into scipy.stats.mode's result, whose shape depends on the
    # SciPy version.
    unique_values, counts = np.unique(samples, return_counts=True)
    mode = unique_values[np.argmax(counts)]
    iqr = scipy.stats.iqr(samples)
    # Square in float64: narrow integer dtypes (e.g. int8) would wrap around.
    rmse = np.sqrt(np.mean(samples.astype(np.float64) ** 2))

    return [mean, std, maxv, minv, median, skew, kurt, q0, q1, q3, q4, mode, iqr, rmse]

def describe_entropy(signal):
    """Return the Shannon entropy of the empirical value distribution of ``signal``.

    Each distinct value's relative frequency is computed and the resulting
    probability vector is passed to ``scipy.stats.entropy``.
    """
    sample_count = len(signal)
    probabilities = []
    # Iterate values from most to least frequent
    for _, occurrences in collections.Counter(signal).most_common():
        probabilities.append(occurrences / sample_count)
    return scipy.stats.entropy(probabilities)

def describe_crossings(signal):
    """Count how often ``signal`` crosses zero and how often it crosses its mean.

    A crossing is a change in the boolean ``sample > level`` between two
    consecutive samples. Returns ``[zero_crossings, mean_crossings]``.
    """
    samples = np.array(signal)
    above_zero = samples > 0
    above_mean = samples > np.nanmean(signal)
    # np.diff on booleans marks the positions where the state flips
    zero_flip_positions = np.nonzero(np.diff(above_zero))[0]
    mean_flip_positions = np.nonzero(np.diff(above_mean))[0]
    return [len(zero_flip_positions), len(mean_flip_positions)]


In [None]:
def welch_max_power_and_frequency(signal):
    """Summarise the Welch power spectral density of ``signal``.

    Returns ``[peak_power, peak_frequency, strong_count, mean_power,
    total_power, std_power, median_power]`` where ``strong_count`` is the
    number of PSD bins above the fixed threshold 2.5. ``welch`` is called
    with its default settings.
    """
    frequencies, power = welch(signal)
    peak_bin = np.argmax(power)
    summary = [power[peak_bin], frequencies[peak_bin]]
    summary.append(np.sum(power > 2.5))
    summary.extend([np.mean(power), np.sum(power), np.std(power), np.median(power)])
    return summary

In [None]:
# Welch PSD of the signal stored in column '0', drawn as a line chart.
f_values, psd_values = welch(df1['0'].values, fs=50)
psd_trace = go.Scatter(x=f_values, y=psd_values, mode='lines')
label_font = dict(family="Courier New, monospace", size=18, color="#7f7f7f")
fig = go.Figure(data=psd_trace)
fig.update_layout(title="frquency spectrum",
                  xaxis_title="Frequency",
                  yaxis_title="PSD",
                  font=label_font)
fig.show()

In [None]:
def get_features(signal):
    """Build the flat feature vector for one signal.

    The vector is the entropy, followed by the outputs of ``describe_freq``,
    ``describe_crossings`` and ``welch_max_power_and_frequency``, in that order.
    """
    feature_vector = [describe_entropy(signal)]
    feature_vector += describe_freq(signal)
    feature_vector += describe_crossings(signal)
    feature_vector += welch_max_power_and_frequency(signal)
    return feature_vector

In [None]:
# Discrete Fourier transform of the signal stored in column '0'; show its length.
fourier_values = fft(df1['0'].values)
fourier_values.shape

In [None]:
# N is the number of points in the input signal
N=800000
# T is used below as the sample spacing of the frequency axis.
# NOTE(review): 1/50 looks like the period of a 50 Hz cycle rather than the
# spacing between samples — confirm the intended sampling interval.
T = 1/50
# Keep the first N//2 FFT bins so the amplitudes line up with the ascending
# f_values axis below (and with extract_fourier_features, which uses the same
# slice); 2/N rescales the magnitudes to single-sided amplitudes.
fourier_values_filtered = 2.0/N * np.abs(fourier_values[:N//2])
# f_values contains the frequency of each retained bin
f_values = np.linspace(0.0, 1.0/(2.0*T), N//2)



In [None]:
# Line chart of the scaled FFT magnitudes against the frequency axis.
spectrum_trace = go.Scatter(x=f_values, y=fourier_values_filtered, mode='lines')
label_font = dict(family="Courier New, monospace", size=18, color="#7f7f7f")
fig = go.Figure(data=spectrum_trace)
fig.update_layout(title="frquency spectrum",
                  xaxis_title="Frequency",
                  yaxis_title="Amplitude",
                  font=label_font)
fig.show()

In [None]:
def extract_fourier_features(signal,N=800000,T=1/50):
    '''
    Transform a time-domain signal to the frequency domain and return the
    magnitudes of its first N//2 FFT bins, each scaled by 2/N.

    N is the number of samples used for slicing and scaling. T is accepted
    but not used by this function.
    '''
    first_half = fft(signal)[0:N//2]
    scale = 2.0/N
    return scale * np.abs(first_half)


def filter_features_fourier(features,mph,no_features=8):
    '''
    Return the heights of the first ``no_features`` peaks of a spectrum.

    features    - 1-D array of spectral magnitudes
    mph         - minimum peak height; only peaks strictly greater than it are kept
    no_features - length of the returned vector

    Peaks are returned in index order. When fewer than ``no_features`` peaks
    exist the result is padded with zeros; otherwise it is truncated.

    Peak detection uses scipy.signal.find_peaks.
    NOTE(review): this stands in for a `detect_peaks` helper that is not
    defined or imported anywhere in this notebook; results may differ slightly
    from that helper on plateaus — confirm if exact parity matters.
    '''
    # Function-scope import so the function does not rely on a star-import.
    from scipy.signal import find_peaks

    features = np.asarray(features)
    indices_peaks, _ = find_peaks(features, height=mph)
    # find_peaks keeps heights >= mph; enforce the strict "greater than" contract.
    indices_peaks = indices_peaks[features[indices_peaks] > mph]
    values = features[indices_peaks]
    if len(values) < no_features:
        return np.append(values, [0]*(no_features-len(values)))
    return values[:no_features]

In [None]:
#total_features = []
   
#for i in tqdm(range(0,8712)):
 #   fourier_features = np.empty((3,8))
 #   features = []    
 #   signal =  pq.read_pandas('../input/vsb-power-line-fault-detection/train.parquet',columns=[str(i)]).to_pandas()[str(i)].values

    #minimum peak height which can be used to filter fourier features
  #  mph = signal.min() + (signal.max() - np.abs(signal.min()))/10
    
    #fourier features
   # fourier_features_ = extract_fourier_features(signal)
   # fourier_features = filter_features_fourier(fourier_features_,mph)
    
    #features.extend(fourier_features)

In [None]:
#sig_features = np.array(total_features)

#cols=[]
#for i in range(0,8):
 # cols.append('feat'+str(i))
#fourier_df = pd.DataFrame(sig_features, columns =cols)  
#fourier_df.to_csv('../input/fourier-featcsv/'+"fourier_feat.csv", sep=",")

In [None]:
#feature_columns= [ 'signal_id', 'entropy', 'mean', 'std', 'maxv', 'minv', 'median', 'skew', 'kurt', 
 #                        'q0','q1', 'q3','q4', 'mode', 'iqr', 'rmse', 'len_zero_crossing', 'len_mean_crossing',
  #                     'max_amp', 'max_freq', 'strong_amp_count', 'avg_amp', 'sum_amp', 'std_amp', 'median_amp', 'fault']

#data_type = "data"
#df_feature_matrix= pd.DataFrame([], columns= feature_columns)
#for signal_id in metadata_train.signal_id:
 #   signal_0 = data[str(signal_id)]
  #  signal_features= get_features(signal_0)
   # df_features = pd.DataFrame([[signal_id] + signal_features + [metadata_train.target[metadata_train.signal_id == signal_id].values[0]]], columns=feature_columns)

    #df_feature_matrix = df_feature_matrix.append(df_features, ignore_index=True)  # Append Feature Matrix Data Frame

# Store feature matrix to CSV (commented out for this notebook, but left in for example.)
#df_feature_matrix.to_csv('../input/data-get-feautures/'+data_type+"_get_features.csv", sep=",")

In [None]:
# Load the precomputed per-signal features and Fourier peak features, then
# join them column-wise with the fault label.
df_feature = pd.read_csv('../input/data-get-feautures/data_get_features.csv')
fourier_df = pd.read_csv('../input/fourier-featcsv/fourier_feat.csv')

# Names of the statistical / spectral feature columns to keep
features = [
    'entropy', 'mean', 'std', 'maxv', 'minv', 'median', 'skew', 'kurt',
    'q0', 'q1', 'q3', 'q4', 'mode', 'iqr', 'rmse',
    'len_zero_crossing', 'len_mean_crossing',
    'max_amp', 'max_freq', 'strong_amp_count', 'avg_amp', 'sum_amp', 'std_amp', 'median_amp',
]
# 'feat0' .. 'feat7'
fourier_cols = [f'feat{k}' for k in range(8)]
target = ["fault"]

df = df_feature[features]
fouri_df = fourier_df[fourier_cols]
# Concatenation is positional: both files are taken to list signals in the same order
data_features = pd.concat([df, fouri_df], axis=1)
data_features["fault"] = df_feature[target]
data_features

In [None]:
# Show the (rows, columns) dimensions of the combined feature table.
data_features.shape

In [None]:
# Heatmap of the pairwise correlations between all features and the label.
# plt and sns come from the notebook's import cell.
blues = ["#66D7EB", "#51ACC5", "#3E849E", "#2C5F78", "#1C3D52", "#0E1E2B"]
cor = data_features.corr()
f, ax = plt.subplots(figsize=(14, 8), dpi=120, facecolor='w', edgecolor='k')
sns.heatmap(cor, cmap=blues, ax=ax)

In [None]:
# Pairwise scatter plots of 'std', 'feat0' and 'entropy', coloured by the
# fault label, with KDE curves on the diagonal.
sns.pairplot(data_features[["std", 'feat0','entropy',  "fault"]], hue="fault",  height=3, diag_kind="kde", diag_kws=dict(shade=True, bw=.05, vertical=False) )


**Machine Learning Algorithms**

In [None]:
# Seed for the shuffled stratified folds used by Create_ensemble
random_state = 1
class Create_ensemble(object):
    """Cross-validated fitting and evaluation of several base classifiers.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds.
    base_models : list
        Unfitted scikit-learn compatible classifiers exposing ``fit``,
        ``predict`` and ``predict_proba``.
    """

    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def predict(self, X, y, T):
        """Fit every base model on every fold and collect predictions.

        Parameters
        ----------
        X, y : array-like
            Training features and labels.
        T : array-like
            Hold-out features predicted by every fitted fold model.

        Returns
        -------
        train_proba : ndarray, shape (len(X), n_classes)
            Out-of-fold class probabilities of the LAST base model.
        test_proba : ndarray, shape (len(T), n_classes)
            Fold-averaged class probabilities of the LAST base model on ``T``.
        train_pred : ndarray, shape (len(X), n_models)
            Out-of-fold class predictions, one column per base model.
        test_pred : ndarray, shape (len(T), n_models * n_splits)
            Predictions on ``T``; columns are ordered model-major, fold-minor.
        clf_ : list
            The fitted estimators, one independent clone per (model, fold),
            in the same order as the columns of ``test_pred``.
        Frame : pandas.DataFrame
            One row per (model, fold) with columns
            ``Model, CV, recall, f1, mcc``.
        """
        # Function-scope import: clone is not part of the notebook's import cell.
        from sklearn.base import clone

        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        no_class = len(np.unique(y))

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True,
                                     random_state=random_state).split(X, y))

        train_proba = np.zeros((X.shape[0], no_class))
        test_proba = np.zeros((T.shape[0], no_class))

        train_pred = np.zeros((X.shape[0], len(self.base_models)))
        test_pred = np.zeros((T.shape[0], len(self.base_models) * self.n_splits))
        clf_ = []
        score_rows = []

        test_col = 0
        for i, base_clf in enumerate(self.base_models):
            # Accumulate this model's test probabilities separately so the
            # fold average is not mixed with earlier models' sums.
            model_test_proba = np.zeros((T.shape[0], no_class))

            for j, (train_idx, valid_idx) in enumerate(folds):
                X_train, Y_train = X[train_idx], y[train_idx]
                X_valid, Y_valid = X[valid_idx], y[valid_idx]

                # Fit a fresh copy per fold; refitting one shared object would
                # make every stored estimator identical to the last fit.
                clf = clone(base_clf)
                clf.fit(X_train, Y_train)

                valid_pred = clf.predict(X_valid)
                recall = recall_score(Y_valid, valid_pred, average='macro')
                f1 = f1_score(Y_valid, valid_pred, average='macro')
                mcc = matthews_corrcoef(Y_valid, valid_pred)

                train_pred[valid_idx, i] = valid_pred
                test_pred[:, test_col] = clf.predict(T)
                test_col += 1

                ## Probabilities
                train_proba[valid_idx, :] = clf.predict_proba(X_valid)
                model_test_proba += clf.predict_proba(T)

                score_rows.append({'Model': i, 'CV': j, 'recall': recall, 'f1': f1, 'mcc': mcc})
                clf_.append(clf)

            # Matches train_proba, which also ends up holding the last model's values.
            test_proba = model_test_proba / self.n_splits

        # Build the score table once instead of appending row by row.
        Frame = pd.DataFrame(score_rows, columns=['Model', 'CV', 'recall', 'f1', 'mcc'])

        return train_proba, test_proba, train_pred, test_pred, clf_, Frame

In [None]:
# Unbalanced models: randomized hyper-parameter searches without class weighting.

# Options shared by every search; the CV splitter and scorer are left at
# RandomizedSearchCV's defaults.
search_options = dict(random_state=42, return_train_score=True, n_jobs=-1)

# Candidate values reused across the tree-based search spaces
tree_counts = [10, 50, 100, 500, 1000]
tree_depths = [2, 3, 4, 5, 6]
regularisation_strengths = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]
boosting_space = dict(n_estimators=tree_counts,
                      max_depth=tree_depths,
                      learning_rate=[1e-2, 1e-1, 0.5, 0.9],
                      reg_alpha=regularisation_strengths,
                      reg_lambda=regularisation_strengths)

# LogisticRegression (elastic-net penalty)
params_log = dict(C=[10**x for x in range(-4,5)],
                  l1_ratio=[0, 0.1, 0.3, 0.5, 0.7, 0.9, 1])
log_clf = LogisticRegression(penalty='elasticnet', solver='saga', random_state=42)
rand_log_clf = RandomizedSearchCV(log_clf, param_distributions=params_log, **search_options)

# Random Forest
params_rf = dict(n_estimators=tree_counts,
                 max_depth=tree_depths,
                 min_samples_split=[0.02, 0.04, 0.08, 0.16, 0.32, 0.50])
rf_clf = RandomForestClassifier(random_state=42)
rand_rf_clf = RandomizedSearchCV(rf_clf, param_distributions=params_rf, **search_options)

# LightGBM
params_lgb = dict(boosting_space)
lgb_clf = lgb.LGBMClassifier(random_state=42, importance_type='gain')
rand_lgb_clf = RandomizedSearchCV(lgb_clf, param_distributions=params_lgb, **search_options)

# XGBoost
params_xgb = dict(boosting_space)
xgb_clf = xgb.XGBClassifier(random_state=42)
rand_xgb_clf = RandomizedSearchCV(xgb_clf, param_distributions=params_xgb, **search_options)

In [None]:
# Balanced models: the same randomized searches, with class weighting enabled.
# Each search uses this cell's own `_b` grid so the cell does not depend on
# the unbalanced cell's dictionaries.

# LogisticRegression
params_log_b = {'C': [10**x for x in range(-4,5)],
          'l1_ratio': [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
log_clf_b = LogisticRegression(penalty='elasticnet', solver='saga',class_weight='balanced', random_state=42)

# The CV splitter and scorer are left at RandomizedSearchCV's defaults
rand_log_clf_b = RandomizedSearchCV(log_clf_b, param_distributions=params_log_b, random_state=42, return_train_score=True,
                              n_jobs=-1)

# Random Forest
params_rf_b = {'n_estimators': [10, 50, 100, 500, 1000], 
          'max_depth': [2, 3, 4, 5, 6],
          'min_samples_split': [0.02, 0.04, 0.08, 0.16, 0.32, 0.50]}
rf_clf_b = RandomForestClassifier(random_state=42, class_weight='balanced')
rand_rf_clf_b = RandomizedSearchCV(rf_clf_b, param_distributions=params_rf_b, random_state=42, return_train_score=True,
                              n_jobs=-1)

# LightGBM
params_lgb_b = {'n_estimators': [10, 50, 100, 500, 1000], 
          'max_depth': [2, 3, 4, 5, 6],
          'learning_rate': [1e-2, 1e-1, 0.5, 0.9],
          'reg_alpha': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2], 
          'reg_lambda': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}
lgb_clf_b = lgb.LGBMClassifier(random_state=42, class_weight='balanced', 
                             importance_type='gain')
rand_lgb_clf_b = RandomizedSearchCV(lgb_clf_b, param_distributions=params_lgb_b, random_state=42, return_train_score=True,
                              n_jobs=-1)

# XGBoost
params_xgb_b = {'n_estimators': [10, 50, 100, 500, 1000], 
          'max_depth': [2, 3, 4, 5, 6],
          'learning_rate': [1e-2, 1e-1, 0.5, 0.9],
          'reg_alpha': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2], 
          'reg_lambda': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]}
# XGBClassifier has no `class_weight` option; imbalance is handled through
# `scale_pos_weight` = (#negative / #positive).
# NOTE(review): the ratio is taken from the full feature table, before the
# train/test split — confirm that this is acceptable.
fault_labels = data_features['fault']
n_negative = int((fault_labels == 0).sum())
n_positive = int((fault_labels == 1).sum())
xgb_pos_weight = n_negative / n_positive if n_positive else 1.0
xgb_clf_b = xgb.XGBClassifier(random_state=42, scale_pos_weight=xgb_pos_weight)
rand_xgb_clf_b = RandomizedSearchCV(xgb_clf_b, param_distributions=params_xgb_b, random_state=42, return_train_score=True,
                              n_jobs=-1)


In [None]:
# Collect the eight searches (unbalanced first, balanced second) into one ensemble.
base_models = [
    rand_log_clf, rand_rf_clf, rand_lgb_clf, rand_xgb_clf,
    rand_log_clf_b, rand_rf_clf_b, rand_lgb_clf_b, rand_xgb_clf_b,
]
n_splits = 5
lgb_stack = Create_ensemble(n_splits=n_splits, base_models=base_models)

# Replace missing values with the sentinel 999 (modifies data_features in place)
data_features.fillna(999, inplace=True)

# Separate features from the label and hold out 33% of the rows for testing
X = data_features.drop(columns=['fault'])
Y = data_features['fault'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
x_train.shape


In [None]:
# Run the cross-validated ensemble on the training split and predict the held-out split.
train_proba, test_proba, train_pred, test_pred, clf , table= lgb_stack.predict(x_train, y_train, x_test)

In [None]:
# Replace the numeric model index in the score table with a readable name
# ("Ub_" = unbalanced, "B_" = balanced), following the order of base_models.
model_labels = [
    'Ub_LogisticRegression', 'Ub_RandomForest', 'Ub_LightGBM', 'Ub_XGBoost',
    'B_LogisticRegression', 'B_RandomForest', 'B_LightGBM', 'B_XGBoost',
]
for model_index, model_label in enumerate(model_labels):
    table.loc[table['Model'] == float(model_index), 'Model'] = model_label
table

In [None]:
# Wrap the out-of-fold predictions in a DataFrame with one named column per base model.
cols=['Ub_LogisticRegression', 'Ub_RandomForest', 'Ub_LightGBM', 'Ub_XGBoost', 'B_LogisticRegression', 'B_RandomForest', 'B_LightGBM', 'B_XGBoost']
train_pred_df = pd.DataFrame(train_pred, columns =cols)  
train_pred_df

In [None]:
# Out-of-fold metrics of the balanced XGBoost column against the training labels.
b_xgb_oof_pred = train_pred_df['B_XGBoost']
print('\nPerformance Metrics after Weighted XGBoost Cross Validation')
print('1. The F-1 score of the model {}\n'.format(
    f1_score(y_train, b_xgb_oof_pred, average='macro')))
print('2. The recall score of the model {}\n'.format(
    recall_score(y_train, b_xgb_oof_pred, average='macro')))
print('3. The Matthews Correlation Coefficient: {}\n'.format(
    matthews_corrcoef(y_train, b_xgb_oof_pred)))
print('4. Classification report \n {} \n'.format(
    classification_report(y_train, b_xgb_oof_pred)))
print('5. Confusion matrix \n {} \n'.format(
    confusion_matrix(y_train, b_xgb_oof_pred)))

In [None]:
# Horizontal bar chart of feature importances.
# clf[39] is the last estimator returned by the 8-model x 5-fold ensemble run
# (the final fold of the last base model).
imp = clf[39].best_estimator_.feature_importances_
# Take the names from the training matrix so every model input (including the
# Fourier 'feat*' columns) is labelled; the shorter `features` list would
# silently drop some of them when zipped.
feature_names = list(x_train.columns)
if len(imp) != len(feature_names):
    raise ValueError("Got {} importances for {} feature names".format(len(imp), len(feature_names)))
# Sort ascending by importance; new names keep `imp` and `features` intact
# so the cell can be re-run.
sorted_imp, sorted_names = zip(*sorted(zip(imp, feature_names)))
blues = ["#66D7EB", "#51ACC5", "#3E849E", "#2C5F78", "#1C3D52", "#0E1E2B"]
f, ax = plt.subplots(figsize=(14, 6), dpi= 120, facecolor='w', edgecolor='k')
plt.barh(range(len(sorted_names)), sorted_imp, color=blues[1], align="center")
plt.yticks(range(len(sorted_names)), sorted_names)
plt.xlabel("Importance of Features")
plt.ylabel("Features")
plt.title("Importance of Each Feature in Classifier Model")

In [None]:
# Histograms of the out-of-fold predicted probability of each class, side by side.
blues = ["#66D7EB", "#51ACC5", "#3E849E", "#2C5F78", "#1C3D52", "#0E1E2B"]
f, ax = plt.subplots(figsize=(14, 6), dpi=120, facecolor='w', edgecolor='k')
nclasses = 2
titles = ["Probabilities for No Partial Discharge Fault Present", "Probabilities for Partial Discharge Fault Present"]
for i, panel_title in zip(range(nclasses), titles):
    plt.subplot(1, nclasses, i + 1)
    class_probabilities = train_proba[:, i]
    plt.hist(class_probabilities, bins=50, histtype='bar', rwidth=0.95, color=blues[1])
    plt.xlim(0, 1)
    plt.title(panel_title)
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Median of the ROC decision thresholds obtained from each probability column.
y = label_binarize(y_train, classes=[0, 1])
binary_labels = y[:, 0]
# roc_curve returns (fpr, tpr, thresholds); only the thresholds are kept
th1, th2 = [roc_curve(binary_labels, train_proba[:, k])[2] for k in (0, 1)]
print('\nMedian Detection Thresholds for Fault Detection')  # use for setting reprediction thresholds
print(np.median(th1))
print(np.median(th2))

In [None]:
# Table of the hold-out predictions: one 'y_pred<k>' column per (model, fold)
# fit — 40 columns in total — plus the true labels in 'y_test'.
columns = ['y_pred' + str(i) for i in range(40)]
e_dataframe = pd.DataFrame(test_pred, columns=columns)
e_dataframe['y_test'] = y_test.astype(float)
e_dataframe

In [None]:

# Hold-out metrics for prediction column 15 of e_dataframe. With five folds
# per model and model-major column order, that is the first fold of the
# fourth base model (the unbalanced XGBoost search).
# scikit-learn metrics take (y_true, y_pred): the true labels in the last
# column must come first, otherwise recall, the report and the confusion
# matrix are computed with the roles reversed.
y_true_test = e_dataframe.iloc[:, -1]
y_pred_test = e_dataframe.iloc[:, 15]
print('\nPerformance Metrics after Weighted Unbalanced XGBoost Cross Validation')
print('1. The F-1 score of the model {}\n'.format(f1_score(y_true_test, y_pred_test, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_true_test, y_pred_test, average='macro')))
print('3. The Matthews Correlation Coefficient: {}\n'.format(matthews_corrcoef(y_true_test, y_pred_test)))
print('4. Classification report \n {} \n'.format(classification_report(y_true_test, y_pred_test)))
print('5. Confusion matrix \n {} \n'.format(confusion_matrix(y_true_test, y_pred_test)))


In [None]:

# A single LightGBM classifier with fixed hyper-parameters, wrapped in the
# same cross-validation helper (replaces the earlier eight-model ensemble).
lgbm_settings = dict(
    boosting_type='gbdt',
    objective='binary',
    num_leaves=50,
    learning_rate=0.1,
    n_estimators=500,
    max_depth=6,
    bagging_fraction=0.9,
    feature_fraction=0.9,
    reg_lambda=0.2,
)
model = lgb.LGBMClassifier(**lgbm_settings)

base_models = [model]
n_splits = 5
lgb_stack = Create_ensemble(n_splits=n_splits, base_models=base_models)

In [None]:
# Replace missing values with the sentinel 999 (modifies data_features in place)
data_features.fillna(999, inplace=True)

# Separate features from the label and hold out 33% of the rows for testing
X = data_features.drop(columns=['fault'])
Y = data_features['fault'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
x_train.shape

In [None]:
# Run the cross-validated fit again; this rebinds all six result variables.
train_proba, test_proba, train_pred, test_pred, clf , table= lgb_stack.predict(x_train, y_train, x_test)

In [None]:
# Display the per-fold score table returned by the previous run.
table

In [None]:
# Out-of-fold metrics of the single LightGBM model against the training labels.
# train_pred has one column per base model; with a single model, take column 0
# so the metrics receive a 1-D prediction vector.
oof_pred = train_pred[:, 0]
print('\nPerformance Metrics after LightGBM Cross Validation')
print('1. The F-1 score of the model {}\n'.format(f1_score(y_train, oof_pred, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_train, oof_pred, average='macro')))
print('3. The Matthews Correlation Coefficient: {}\n'.format(matthews_corrcoef(y_train, oof_pred)))
print('4. Classification report \n {} \n'.format(classification_report(y_train, oof_pred)))
print('5. Confusion matrix \n {} \n'.format(confusion_matrix(y_train, oof_pred)))

In [None]:
df_test = pd.DataFrame()
# Combine the per-fold predictions into one label per test row: the median
# across the fold columns (axis=1). Without the axis, np.median collapses the
# whole matrix into a single scalar.
# With an even number of columns a tie gives 0.5, which astype(int) turns into 0.
fault_vote = np.median(test_pred, axis=1).astype(int)
df_test['y_test'] =y_test.astype(int)
df_test["fault"] = fault_vote
# Make Submission File
submission_filename = "../input/submission/prediction_submission_cv.csv"

# Commented out in this notebook but left in as an example to create a submission file
#f_o = open(submission_filename, "w+")
#f_o.write("signal_id,target\n")
#for idx in range(len(df_test)):

 #   signal_id = df_feature["signal_id"][idx]
 #   fault = df_test["fault"][idx]
 #   f_o.write(str(signal_id)+","+str(fault)+"\n")
#f_o.close()

In [None]:
# Load the previously written submission file and display it (rebinds df_test).
df_test=pd.read_csv('../input/submission/prediction_submission_cv.csv')
df_test