In [0]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [0]:
###########################
# Load Data from CSV File
# https://archive.ics.uci.edu/ml/datasets/Electrical+Grid+Stability+Simulated+Data+
###########################
url = 'https://raw.githubusercontent.com/shaukat-abidi/teaching_dataset/master/Data_for_UCI_named.csv'
df_csv = pd.read_csv(url)

In [3]:
df_csv.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable


In [4]:
print(df_csv.shape)
print('-'*14)
print(df_csv.columns.to_list())

(10000, 14)
--------------
['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2', 'g3', 'g4', 'stab', 'stabf']


In [0]:
def ret_int(str_label):
  if str_label == 'unstable':
    return 0
  else:
    return 1

In [0]:
# Adding new column (stabf_integer)
df_csv['stabf_integer'] = df_csv['stabf'].apply(lambda str_lab: ret_int(str_lab))

In [7]:
df_csv.head(1)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,stabf_integer
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable,0


In [0]:
# Dataframe
# col_list = ['tau1', 'tau2', 'p1']
# df_csv[col_list]

In [0]:
################################
# Separate Data and Labels
################################

data_cols = ['tau1', 'tau2', 'tau3', 'tau4', 'p1', 'p2', 'p3', 'p4', 'g1', 'g2', 'g3', 'g4', 'stab']
lab_col = ['stabf_integer']

df_X = df_csv[data_cols]
df_Y = df_csv[lab_col]

In [9]:
df_X.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957


In [10]:
df_Y.head(2)

Unnamed: 0,stabf_integer
0,0
1,1


In [0]:
# Dataframe to Numpy Arrays
X = df_X.to_numpy(copy=True)
Y = df_Y.to_numpy(copy=True)

In [12]:
print(type(df_X))
print(type(X))
print(type(Y))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [13]:
print(X.shape)
print(Y.shape)

(10000, 13)
(10000, 1)


In [14]:
print(X[0:2,:])

[[ 2.95906002e+00  3.07988520e+00  8.38102539e+00  9.78075443e+00
   3.76308477e+00 -7.82603631e-01 -1.25739483e+00 -1.72308631e+00
   6.50456461e-01  8.59578106e-01  8.87444921e-01  9.58033988e-01
   5.53474892e-02]
 [ 9.30409723e+00  4.90252411e+00  3.04754073e+00  1.36935736e+00
   5.06781210e+00 -1.94005843e+00 -1.87274169e+00 -1.25501199e+00
   4.13440568e-01  8.62414076e-01  5.62139051e-01  7.81759911e-01
  -5.95746433e-03]]


In [15]:
df_csv.head(2)

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf,stabf_integer
0,2.95906,3.079885,8.381025,9.780754,3.763085,-0.782604,-1.257395,-1.723086,0.650456,0.859578,0.887445,0.958034,0.055347,unstable,0
1,9.304097,4.902524,3.047541,1.369357,5.067812,-1.940058,-1.872742,-1.255012,0.413441,0.862414,0.562139,0.78176,-0.005957,stable,1


In [16]:
print(Y[0:2,:])

[[0]
 [1]]


In [17]:
tot_x, tot_cols = df_X.shape
print(tot_x)
print(0.7 * tot_x, 0.3 * tot_x)

# X[0:7000,:] --- > Training X
# Y[0:7000,:] --- > Training Y

# X[7000:,:] --- > Testing X
# Y[7000:,:] ---- > Testing Y

10000
7000.0 3000.0


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3)
# My recommendation

#
# Do IT YOURSELF USING SLICES
#

In [19]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7000, 13) (7000, 1)
(3000, 13) (3000, 1)


In [0]:
#X_train[0:2,:]
#y_train[0:2,:]


In [0]:
# Applying Naive Bayes Model on our Train SET (X_train, y_train)
clf_nb = GaussianNB()

In [21]:
clf_nb.fit(X_train, y_train.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
#y_train.shape
#y_train.ravel().shape
#dir(clf_nb)
print(clf_nb.class_count_)

[4447. 2553.]


In [0]:
predictions = clf_nb.predict(X_test)

In [24]:
#X_test[0,:]
print(type(predictions), predictions.shape)

<class 'numpy.ndarray'> (3000,)


In [25]:
index = 2588
print(y_test[index], predictions[index])

[0] 0


In [26]:
accuracy_score(y_test, predictions)

0.9793333333333333

In [31]:
# Prediction on Single Datapoint
index = 0
x_single = X_test[index,:]
print(x_single)
print(y_test[index,:])

[ 6.99533860e-01  8.88416103e+00  5.75583229e+00  2.17221217e+00
  2.46918580e+00 -5.21008864e-01 -1.10994309e+00 -8.38233852e-01
  3.48320882e-01  9.63648952e-01  1.63579565e-01  3.07815758e-01
 -5.37343405e-03]
[1]


In [37]:
#x_single.reshape(-1,1).shape
prediction_xsingle = clf_nb.predict(x_single.reshape(1,-1))
print(X_test[0,:])

[ 6.99533860e-01  8.88416103e+00  5.75583229e+00  2.17221217e+00
  2.46918580e+00 -5.21008864e-01 -1.10994309e+00 -8.38233852e-01
  3.48320882e-01  9.63648952e-01  1.63579565e-01  3.07815758e-01
 -5.37343405e-03]


In [40]:
print(prediction_xsingle, y_test[index])

[1] [1]


In [41]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print((tn, fp, fn, tp))

(1898, 35, 27, 1040)


In [43]:
# Print precision, recall and F1-score
precision_recall_fscore_support(y_test, predictions, pos_label=1, average='binary')

(0.9674418604651163, 0.9746954076850984, 0.9710550887021475, None)