# Cleaning data

In [37]:
import pandas as pd 
import numpy as np
import csv

In [361]:
esp_df = pd.read_excel('final.esp.xlsx')

In [371]:
esp_df.columns

Index(['esp.sr.no', 'vendor', 'depth', 'perf', 'date.install', 'water.cut',
       'daily.flow', 'num.work.orders', 'avg.uptime',
       'avg.maintenance.cost.monthly', 'daily.production.variance', 'Ti.C',
       'time', 'Pi.Psia', 'Vx.g', 'Vy.g', 'Tm.C', 'MOR.Ohm', 'Lv.V', 'status'],
      dtype='object')

In [363]:
esp_df.isnull().values.any()


True

In [364]:
esp_df.isnull().sum().sum()

300

In [365]:
esp_df = esp_df.dropna()

In [366]:
esp_df.isnull().sum().sum()

0

In [367]:
esp_df = esp_df[["esp.sr.no",  "vendor", "depth", "perf", "date.install", "water.cut", "daily.flow", "num.work.orders", "avg.uptime", "avg.maintenance.cost.monthly", "daily.production.variance", "Ti.C","time", "Pi.Psia", "Vx.g","Vy.g", "Tm.C", "MOR.Ohm", "Lv.V", "status"]]

esp_df.columns

Index(['esp.sr.no', 'vendor', 'depth', 'perf', 'date.install', 'water.cut',
       'daily.flow', 'num.work.orders', 'avg.uptime',
       'avg.maintenance.cost.monthly', 'daily.production.variance', 'Ti.C',
       'time', 'Pi.Psia', 'Vx.g', 'Vy.g', 'Tm.C', 'MOR.Ohm', 'Lv.V', 'status'],
      dtype='object')

In [374]:
#str(esp_df['esp.sr.no'])
type(esp_df['esp.sr.no'])
#esp_df.apply(lambda x: pd.to_float(x, errors='ignore'))

pandas.core.series.Series

In [375]:
esp_v1_df = esp_df[esp_df["vendor"] == "Vendor 1"]
esp_v2_df = esp_df[esp_df["vendor"] == "Vendor 2"]
esp_v3_df = esp_df[esp_df["vendor"] == "Vendor 3"]


# split testing/training data

In [376]:
from sklearn.cross_validation import train_test_split
esp_v1_df_train, esp_v1_df_test = train_test_split(esp_v1_df, test_size = 0.3)
esp_v2_df_train, esp_v2_df_test = train_test_split(esp_v2_df, test_size = 0.3)
esp_v3_df_train, esp_v3_df_test = train_test_split(esp_v3_df, test_size = 0.3)


In [377]:
esp_v1_df_train.shape[0]

18078

In [378]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

### KNN

#### vendor 1

In [340]:
from sklearn import neighbors

In [379]:
v1_knn1_model = neighbors.KNeighborsClassifier(n_neighbors = 3)
v1_knn2_model = neighbors.KNeighborsClassifier(n_neighbors = 20)

In [380]:
attributes_columns = ["Pi.Psia", "Vx.g", "MOR.Ohm", "Lv.V", "Vy.g"]
attributes = esp_v1_df_train[list(attributes_columns)].values
class_label = esp_v1_df_train["status"].values

In [370]:
v1_knn1_model.fit(attributes, class_label)
v1_knn2_model.fit(attributes, class_label)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [344]:
prediction1 = v1_knn1_model.predict(esp_v1_df_test[attributes_columns]) 
prediction2 = v1_knn2_model.predict(esp_v1_df_test[attributes_columns]) 
print(prediction1)
print(prediction2)



In [345]:
accuracy1 = np.where(prediction1==esp_v1_df_test['status'], 1, 0).sum() / float(len(esp_v1_df_test))
accuracy2 = np.where(prediction2==esp_v1_df_test['status'], 1, 0).sum() / float(len(esp_v1_df_test))

In [346]:
accuracy3

0.87572590011614404

In [347]:
accuracy12

0.88140405213575945

In [348]:
accuracy1_array = confusion_matrix(esp_v1_df_test['status'], prediction1)
accuracy2_array = confusion_matrix(esp_v1_df_test['status'], prediction2)

In [349]:
accuracy1_array

array([[ 111,   78,  273],
       [  43, 5983,  142],
       [ 262,  300,  557]])

In [350]:
accuracy2_array

array([[  41,   87,  334],
       [  18, 6019,  131],
       [  95,  325,  699]])

###### knn with a comparison of 12 nearest neighbers has the best accuracy for vendor 1

#### vendor 2

In [351]:
v2_knn1_model = neighbors.KNeighborsClassifier(n_neighbors = 9)
v2_knn2_model = neighbors.KNeighborsClassifier(n_neighbors = 12)

In [352]:
attributes_columns = ["Pi.Psia", "Vx.g", "MOR.Ohm", "Lv.V", "Vy.g"]
attributes = esp_v2_df_train[list(attributes_columns)].values
class_label = esp_v2_df_train["status"].values

In [353]:
v2_knn1_model.fit(attributes, class_label)
v2_knn2_model.fit(attributes, class_label)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=12, p=2,
           weights='uniform')

In [354]:
prediction_v2_1 = v2_knn1_model.predict(esp_v2_df_test[attributes_columns]) 
prediction_v2_2 = v2_knn2_model.predict(esp_v2_df_test[attributes_columns]) 
print(prediction_v2_3)
print(prediction_v2_3)

['Running' 'Running' 'Running' ..., 'Running' 'Running' 'Running']
['Running' 'Running' 'Running' ..., 'Running' 'Running' 'Running']


In [355]:
accuracy_v2_1 = np.where(prediction_v2_1==esp_v2_df_test['status'], 1, 0).sum() / float(len(esp_v2_df_test))
accuracy_v2_2 = np.where(prediction_v2_2==esp_v2_df_test['status'], 1, 0).sum() / float(len(esp_v2_df_test))

In [356]:
accuracy_v2_1

0.89575577066269541

In [357]:
accuracy_v2_2

0.89575577066269541

###### after playing around with the number of clusters, we descovered that 9 is the first number with the greatest accuracy for vendor 2

In [387]:
v3_knn1_model = neighbors.KNeighborsClassifier(n_neighbors = 9)
v3_knn2_model = neighbors.KNeighborsClassifier(n_neighbors = 12)

In [388]:
attributes_columns = ["Pi.Psia", "Vx.g", "MOR.Ohm", "Lv.V", "Vy.g"]
attributes = esp_v3_df_train[list(attributes_columns)].values
class_label = esp_v3_df_train["status"].values

In [389]:
prediction_v3_1 = v3_knn1_model.predict(esp_v3_df_test[attributes_columns]) 
prediction_v3_2 = v3_knn2_model.predict(esp_v3_df_test[attributes_columns]) 
print(prediction_v3_3)
print(prediction_v3_3)

NotFittedError: Must fit neighbors before querying.

In [None]:
accuracy_v3_1 = np.where(prediction_v3_1==esp_v3_df_test['status'], 1, 0).sum() / float(len(esp_v3_df_test))
accuracy_v3_2 = np.where(prediction_v3_2==esp_v3_df_test['status'], 1, 0).sum() / float(len(esp_v3_df_test))

In [None]:
accuracy_v3_1

In [None]:
accuracy_v3_2

### Decision tree

In [None]:
"Pi.Psia", "Vx.g", "MOR.Ohm", "Lv.V

In [104]:
ESP_tree = DecisionTreeClassifier()
ESP_tree.fit(esp_v1_df_train[["Pi.Psia", "Vx.g", "MOR.Ohm", "Lv.V"]], 
                   esp_v1_df_train["status"])


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [105]:
with open("output.dot", "w") as output_file:
    tree.export_graphviz(ESP_tree, out_file=output_file)

In [106]:
prediction_tree = ESP_tree.predict(esp_v1_df_test[["Pi.Psia", "Vx.g", "MOR.Ohm", "Lv.V"]])

accuracy_array = confusion_matrix(esp_v1_df_test['status'], prediction_tree)
accuracy_array

array([[  43,   58,  360],
       [  13, 6064,  114],
       [  97,  237,  763]])

In [107]:
accuracy_array.diagonal().sum().astype(int)/accuracy_array.sum().astype(int)
#0.886566008517228

0.886566008517228

##### vendor 2

In [123]:
ESP2_tree = DecisionTreeClassifier()
ESP2_tree.fit(esp_v2_df_train[["Vx.g", "MOR.Ohm", "Lv.V"]], 
                   esp_v2_df_train["status"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [116]:
with open("output2.dot", "w") as output_file:
    tree.export_graphviz(ESP2_tree, out_file=output_file)

In [124]:
prediction2_tree = ESP2_tree.predict(esp_v2_df_test[[ "Vx.g", "MOR.Ohm", "Lv.V"]])

accuracy2_array = confusion_matrix(esp_v2_df_test['status'], prediction2_tree)
accuracy2_array

array([[   0,  269,    0],
       [   0, 4821,    0],
       [   0,  282,    0]])

In [125]:
accuracy2_array.diagonal().sum().astype(int)/accuracy_array.sum().astype(int)

0.62214479287650015