# Experiment 6: Classification

In [25]:
import pickle

import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000

from sklearn import model_selection
from sklearn import tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("../../data/energy/df.csv", )
df.head()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,energy,label,group
0,1054,32658,-16.064,-76.599,112.441,0.0,0.955,-0.296,35,28449.0,13.415,1,1
1,1054,32671,-16.208,-76.707,112.611,-0.719,0.415,0.558,24,28450.0,13.415,1,1
2,1053,32639,-16.064,-76.624,122.011,0.0,0.83,0.558,12,28450.0,13.415,1,1
3,1054,32670,-16.064,-76.624,112.611,0.0,0.83,0.558,54,28450.0,13.415,1,1
4,1054,32664,-15.968,-76.625,112.559,0.478,0.827,0.296,47,28451.0,13.415,1,1


In [5]:
df.describe()

Unnamed: 0,dom_id,pmt_id,pos_x,pos_y,pos_z,dir_x,dir_y,dir_z,tot,time,energy,label,group
count,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0,489906.0
mean,1048.735653,32496.026419,0.34167,-1.514543,119.205209,0.000912,-0.002101,-0.182805,26.056821,48938920.0,37.133466,1.0,3262.098839
std,592.666757,18372.66124,50.964521,62.401296,47.553276,0.582604,0.583274,0.535609,9.273521,29056000.0,24.312587,0.0,1937.060856
min,1.0,1.0,-94.627,-115.6,37.7,-0.955,-0.955,-1.0,1.0,28449.0,10.002,1.0,1.0
25%,537.0,16622.0,-44.772,-58.113,74.211,-0.478,-0.478,-0.556,23.0,23937850.0,16.872,1.0,1595.0
50%,1056.0,32727.0,1.424,-4.581,121.789,0.0,-0.0,-0.296,26.0,48101400.0,29.505,1.0,3206.0
75%,1562.0,48398.75,40.478,48.48,160.241,0.478,0.478,0.296,28.0,74346300.0,52.643,1.0,4956.0
max,2070.0,64170.0,96.243,105.024,196.611,0.955,0.955,0.558,209.0,99986010.0,99.947,1.0,6665.0


# HoldOut Set

In [6]:
np.random.seed(20)

remove_n = 20

drop_indices = np.random.choice(df.index, remove_n, replace=False)

hold_df = df.iloc[drop_indices]
df = df.drop(drop_indices)

# Define target and predictors

In [52]:
y = df['energy']

X = df[['pos_x', 'pos_y', 'pos_z', 'time']]

In [53]:
le = preprocessing.LabelEncoder()
le.fit_transform(y)

array([1212, 1212, 1212, ..., 4468, 4468, 4468])

In [55]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# Split Dataset 

Dataset needs to be split into:
1. Main
    1. Train
    2. Test

# Train/Test Split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34, random_state=40)

print(" X TRAIN SHAPE: {} \n X TEST SHAPE: {} \n".format(X_train.shape, X_test.shape))
print(" Y TRAIN SHAPE: {} \n Y TEST SHAPE: {}".format(y_train.shape, y_test.shape))

 X TRAIN SHAPE: (323324, 4) 
 X TEST SHAPE: (166562, 4) 

 Y TRAIN SHAPE: (323324,) 
 Y TEST SHAPE: (166562,)


# Model 1: Decision Tree

In [22]:
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [23]:
clf.predict(X_train)

array([ 434, 1156, 4853, ..., 4308, 1642, 4858])

In [27]:
y_pred = clf.predict(X_test)

# Model 1: Results

In [29]:
accuracy_score(y_test, y_pred)

0.9900997826635127

In [30]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 37,   0,   0, ...,   0,   0,   0],
       [  0,   5,   0, ...,   0,   0,   0],
       [  0,   0,  21, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   1,   0,   0],
       [  0,   0,   0, ...,   0, 330,   0],
       [  0,   0,   0, ...,   0,   0, 277]])

In [42]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))

# Save Model

In [43]:
filename = 'clas_model.sav'
pickle.dump(clf, open(filename, 'wb'))

# Test on Unseen Data

In [44]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9900997826635127


In [45]:
Xnew = hold_df[['pos_x',
               'pos_y',
               'pos_z',
               'time']]
hold_y = hold_df['energy']

ynew = loaded_model.predict(Xnew)

In [49]:
ynew

array([2577, 3116, 1485, 1036, 4183,  637, 1537, 2085, 2577, 4778,  689,
       3434, 3610, 4698, 1563, 4763,  846, 4597, 2731, 4567])

In [56]:
hold_y

358818    20.061
159680    24.873
222390    14.403
437957    12.872
13266     42.506
101351    11.635
38050     14.601
28437     17.104
358728    20.061
327353    67.561
468183    11.780
20031     28.233
17630     30.629
200148    62.514
458203    14.684
246550    66.896
204478    12.263
413972    57.580
204884    21.426
439503    56.253
Name: energy, dtype: float64

In [67]:
energy

[2577,
 3116,
 1485,
 1036,
 4183,
 637,
 1537,
 2085,
 2577,
 4778,
 689,
 3434,
 3610,
 4698,
 1563,
 4763,
 846,
 4597,
 2731,
 4567]

In [73]:
energy = []
for v in hold_y:
    energy.append(le_name_mapping[v])

In [66]:
accuracy_score(energy, ynew)

1.0

In [68]:
from sklearn.metrics import recall_score

In [72]:
recall_score(energy[:1], ynew[:1])

0.0

In [None]:
energy vs recall plot