In [1]:
# Required modules
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import IPython
import sklearn
import graphviz
import mglearn
import sys

# Record the environment used for this run: interpreter first, then every
# imported package that exposes a __version__ attribute.
print("Python Version: {}".format(sys.version))

# (format template, module) pairs, printed in this order
version_report = (
    ("Pandas Version: {}", pd),
    ("matplotlib Version: {}", matplotlib),
    ("NumPy Version: {}", np),
    ("SciPy Version: {}", sp),
    ("IPython Version: {}", IPython),
    ("Scikit-learn Version: {}", sklearn),
    ("graphviz Version: {}", graphviz),
    ("Mglearn Version: {}", mglearn),
)
for template, module in version_report:
    print(template.format(module.__version__))

Python Version: 3.5.3 (default, May 10 2017, 15:05:55) 
[GCC 6.3.1 20161221 (Red Hat 6.3.1-1)]
Pandas Version: 0.20.1
matplotlib Version: 2.0.2
NumPy Version: 1.12.1
SciPy Version: 0.19.0
IPython Version: 6.0.0
Scikit-learn Version: 0.18.1
graphviz Version: 0.7.1
Mglearn Version: 0.1.5


In [2]:
# Constant parameters given by the DySpan PU setup
DELAY_1, DELAY_2 = 0.005, 0.01          # tau1, tau2
TCONST = 0.002
MEAN1, MEAN2, MEAN3 = 0.02, 0.01, 0.005  # lambda1, lambda2, lambda3

# Dataset dimensions
N_CHAN = 4       # channels
N_SCN = 10       # scenarios
N_SAMPS = 4000   # samples in the dataset per scenario

In [3]:
# Load the extracted features of every scenario from disk:
#   if_time_scn_ch[scenario][channel] -> float32 array of interframe times
#   packet_rate_scn[scenario]         -> float32 array of packet rates
#   variance_scn[scenario]            -> float32 array of variances
FEATURE_DIR = "../../data/feature_extraction/2/"


def _read_float32(file_name):
    """Read one raw float32 feature file from FEATURE_DIR into a 1-D array.

    The path is handed straight to np.fromfile, so NumPy opens and closes the
    file itself and no file handle is left open. np.fromfile / np.float32 are
    used instead of the deprecated NumPy aliases in the scipy namespace.
    """
    return np.fromfile(FEATURE_DIR + file_name, dtype=np.float32)


# Channel numbers in the file names start at 1, scenario numbers at 0.
if_time_scn_ch = [
    [_read_float32("interframe_time_ch_{}_scn_{}.dat"
                   .format(channel + 1, scenario))
     for channel in range(N_CHAN)]
    for scenario in range(N_SCN)
]
packet_rate_scn = [_read_float32("packet_rate_scn_{}.dat".format(scenario))
                   for scenario in range(N_SCN)]
variance_scn = [_read_float32("variance_scn_{}.dat".format(scenario))
                for scenario in range(N_SCN)]

In [4]:
# One row per sample holding that sample's interframe time on every channel.
# Rows are ordered scenario by scenario, so the row for sample i of scenario
# scn sits at index i + N_SAMPS*scn.
if_vector = [
    [if_time_scn_ch[scn][chan][i] for chan in range(N_CHAN)]
    for scn in range(N_SCN)
    for i in range(N_SAMPS)
]

In [5]:
# Label vector: the scenario number repeated once per sample, scenario after
# scenario, giving N_SCN * N_SAMPS entries in total.
labels = []
for scenario_id in range(N_SCN):
    labels.extend([scenario_id] * N_SAMPS)

In [6]:
# https://stackoverflow.com/questions/17485747/how-to-convert-a-nested-list-into-a-one-dimensional-list-in-python
# The ABCs live in collections.abc; the old `collections.Iterable` alias was
# removed in Python 3.10.
from collections.abc import Iterable


def flatten(lis):
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Every item that is itself iterable is descended into recursively; any
    other item is yielded as-is. Strings are treated as leaves (they are
    yielded whole rather than split into characters).

    Parameters
    ----------
    lis : iterable
        The possibly nested iterable to flatten.

    Yields
    ------
    The non-iterable items (and strings) of `lis`, in depth-first order.
    """
    for item in lis:
        if isinstance(item, Iterable) and not isinstance(item, str):
            yield from flatten(item)
        else:
            yield item

In [7]:
# Build one flat feature row per sample.
# packet_rate and variance chain the per-scenario series one scenario after
# the other:
#   packet_rate = [scn0, scn1, ..., scn9]
#   len(packet_rate) = N_SAMPS * N_SCN
sample_index = [(scn, i) for scn in range(N_SCN) for i in range(N_SAMPS)]
packet_rate = [packet_rate_scn[scn][i] for scn, i in sample_index]
variance = [variance_scn[scn][i] for scn, i in sample_index]

# Each record pairs a sample's interframe-time list with its packet rate and
# variance, so the records are still nested at this point.
data_nested = list(zip(if_vector, packet_rate, variance))

# scikit-learn needs flat rows, hence the flattening step.
# TODO: just don't generate it nested and save this method...
data = [list(flatten(record)) for record in data_nested]

In [8]:
# The data is now in a shape scikit-learn accepts. Hold out part of it for
# evaluation; random_state is pinned so the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, labels,
                                                    random_state=0)

In [9]:
# First classifier: k-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier

# Build the model and train it in one step (fit() returns the estimator)
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
# Predicted label for every sample of the test set
prediction = knn.predict(X_test)

In [10]:
# Accuracy of the KNN predictions against the true test labels
from sklearn.metrics import accuracy_score

print(accuracy_score(y_true=y_test, y_pred=prediction))

0.9982


In [13]:
# Train and score one KNN classifier per neighbourhood size.
# knn_list, predictions and accs are filled in the same order as n_neighbors.
n_neighbors = [2, 4, 10, 50]
knn_list, predictions, accs = [], [], []

for k in n_neighbors:
    model = KNeighborsClassifier(n_neighbors=k)
    knn_list.append(model)
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    predictions.append(predicted)
    accs.append(accuracy_score(y_test, predicted))

print(accs)

[0.99819999999999998, 0.99819999999999998, 0.99809999999999999, 0.99660000000000004]


In [15]:
# Train and score an RBF-kernel SVM for several values of the penalty
# parameter C, recording the fit time and the predict time of each model.
# http://scikit-learn.org/stable/modules/svm.html
from sklearn.svm import SVC
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time(), which can jump when the system
# clock is adjusted. time is still imported so the name stays bound.
from time import perf_counter, time

svc_list = []        # one SVC per entry of svc_complexities
svc_pred = []        # predictions on X_test
svc_accs = []        # accuracy on the test set
svc_fit_times = []   # seconds spent in fit(), rounded to 3 decimals
svc_pred_times = []  # seconds spent in predict(), rounded to 3 decimals
svc_complexities = [1, 10, 100, 1000, 10000, 100000]

for c_value in svc_complexities:
    svc = SVC(kernel='rbf', C=float(c_value))
    svc_list.append(svc)

    t0 = perf_counter()
    svc.fit(X_train, y_train)
    svc_fit_times.append(round(perf_counter() - t0, 3))

    t0 = perf_counter()
    predicted = svc.predict(X_test)
    svc_pred_times.append(round(perf_counter() - t0, 3))

    svc_pred.append(predicted)
    svc_accs.append(accuracy_score(y_test, predicted))

print(svc_accs)
print(svc_fit_times)
print(svc_pred_times)

1
10
100
1000
10000
100000
[0.99050000000000005, 0.99170000000000003, 0.99170000000000003, 0.99170000000000003, 0.99170000000000003, 0.99170000000000003]
[13.882, 14.815, 14.219, 14.727, 13.764, 14.338]
[1.559, 1.497, 1.48, 1.506, 1.536, 1.562]


In [11]:

# TODO: http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html