#Training Classifiers

##Clean and split data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

Drop the time column

Encode brad's office readings to 0/1

In [2]:
def generate_data(filename):
  df = pd.read_csv(filename)
  del df["Unnamed: 0"]
  df = df.sample(frac=1) #shuffle the dataset using random sampling
  df = df.dropna()
  labels = df.pop("feature")
  #df['ben_illumination'] = df['ben_illumination'].mask(df['ben_illumination']<60,0)
  return df,labels

def min_max_scaling(column) :
  return ( column - column.min() ) / ( column.max() - column.min() )

def generate_data_normalized(filename):
  df = pd.read_csv(filename)
  del df["Unnamed: 0"]
  df = df.sample(frac=1) #shuffle the dataset using random sampling
  df = df.dropna()
  labels = df.pop("feature")

  for col in df.columns:
    df[col] = min_max_scaling(df[col])

  return df,labels

##Results Pipeline

In [3]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

def get_classifier_accuracies(trainDataFile, testDataFile, normalized = False):
  if normalized:
    X_train, y_train  = generate_data_normalized(trainDataFile)
    X_test,  y_test   =  generate_data_normalized(testDataFile)
  else:
    X_train, y_train  = generate_data(trainDataFile)
    X_test,  y_test   =  generate_data(testDataFile)
  #SVM
  svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
  accuracy_svm = svm_model_linear.score(X_test, y_test)

  #KNN
  knn = KNeighborsClassifier(n_neighbors = 35).fit(X_train, y_train)
  accuracy_knn = knn.score(X_test, y_test)

  #Bayes
  gnb = GaussianNB().fit(X_train, y_train)
  accuracy_gnb = gnb.score(X_test, y_test)

  return accuracy_svm, accuracy_knn, accuracy_gnb

Data w/ Additional Features

Data w/ Additional Features + Normalization

##Results

### Only sensors in 211

In [4]:
df = pd.read_csv("trainData_without.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,co2,humidity,temp,pm25,voc,illumination,spla,feature
0,2022-08-29 12:00:00,477.0,49.4,21.985,1.0,38.5,12.65,49.85,2
1,2022-08-29 12:00:10,477.5,49.25,22.045,1.0,34.5,12.65,49.9,2
2,2022-08-29 12:00:20,478.0,49.25,22.02,1.0,37.0,12.65,49.95,2
3,2022-08-29 12:00:30,478.0,49.25,22.005,1.0,32.0,12.65,49.9,2
4,2022-08-29 12:00:40,477.5,49.35,21.985,1.0,43.5,12.65,49.85,2


Normalized

In [5]:
svm, knn, gnb = get_classifier_accuracies("trainData_without.csv","testData_without.csv", True)
print("Model         Accuracy")
print("Norm SVM     ",svm)
print("Norm KNN     ",knn)
print("Norm Bayes   ",gnb)

Model         Accuracy
Norm SVM      0.40711175616835993
Norm KNN      0.4259796806966618
Norm Bayes    0.2409288824383164


Non-normalized

In [6]:
svm, knn, gnb = get_classifier_accuracies("trainData_without.csv","testData_without.csv", False)
print("Model       Accuracy")
print("SVM        ",svm)
print("KNN        ",knn)
print("Bayes      ",gnb)

Model       Accuracy
SVM         0.49129172714078373
KNN         0.3193033381712627
Bayes       0.4771407837445573


###Augmented sensor readings

In [8]:
df = pd.read_csv("trainData.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,co2,humidity,temp,pm25,voc,illumination,spla,ben_illumination,ben_spla,brad_office,feature
0,2022-08-29 12:00:00,477.0,49.4,21.985,1.0,38.5,12.65,49.85,56.0,52.7,0.0,2
1,2022-08-29 12:00:10,477.5,49.25,22.045,1.0,34.5,12.65,49.9,56.0,52.7,0.0,2
2,2022-08-29 12:00:20,478.0,49.25,22.02,1.0,37.0,12.65,49.95,56.0,52.8,0.0,2
3,2022-08-29 12:00:30,478.0,49.25,22.005,1.0,32.0,12.65,49.9,56.0,52.8,0.0,2
4,2022-08-29 12:00:40,477.5,49.35,21.985,1.0,43.5,12.65,49.85,56.0,52.8,0.0,2


Normalized

In [9]:
svm, knn, gnb = get_classifier_accuracies("trainData.csv","testData.csv", normalized = True)
print("Model         Accuracy")
print("Norm SVM     ",svm)
print("Norm KNN     ",knn)
print("Norm Bayes   ",gnb)

Model         Accuracy
Norm SVM      0.36468129571577845
Norm KNN      0.26227795193312436
Norm Bayes    0.21839080459770116


Non-normalized

In [10]:
svm, knn, gnb = get_classifier_accuracies("trainData.csv","testData.csv")
print("Model       Accuracy")
print("SVM        ",svm)
print("KNN        ",knn)
print("Bayes      ",gnb)

Model       Accuracy
SVM         0.4994775339602926
KNN         0.625914315569488
Bayes       0.44200626959247646


### BRAD ONLY

In [11]:
df = pd.read_csv("trainData_brad.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,co2,humidity,temp,pm25,voc,illumination,spla,brad_office,feature
0,2022-08-29 12:00:00,477.0,49.4,21.985,1.0,38.5,12.65,49.85,0.0,2
1,2022-08-29 12:00:10,477.5,49.25,22.045,1.0,34.5,12.65,49.9,0.0,2
2,2022-08-29 12:00:20,478.0,49.25,22.02,1.0,37.0,12.65,49.95,0.0,2
3,2022-08-29 12:00:30,478.0,49.25,22.005,1.0,32.0,12.65,49.9,0.0,2
4,2022-08-29 12:00:40,477.5,49.35,21.985,1.0,43.5,12.65,49.85,0.0,2


In [12]:
svm, knn, gnb = get_classifier_accuracies("trainData_brad.csv","testData_brad.csv", normalized = True)
print("Model         Accuracy")
print("Norm SVM     ",svm)
print("Norm KNN     ",knn)
print("Norm Bayes   ",gnb)

Model         Accuracy
Norm SVM      0.41799709724238027
Norm KNN      0.4350507982583454
Norm Bayes    0.22786647314949202


In [13]:
svm, knn, gnb = get_classifier_accuracies("trainData_brad.csv","testData_brad.csv")
print("Model       Accuracy")
print("SVM        ",svm)
print("KNN        ",knn)
print("Bayes      ",gnb)

Model       Accuracy
SVM         0.49310595065312046
KNN         0.3189404934687954
Bayes       0.48548621190130625
