### DBSCAN

#### Importando as bibliotecas necessárias

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
  
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

#### Carregando dados

In [2]:
# Directory that holds the CSV files, and the files to load (in load order).
DATA_DIR = '../Databases/MachineLearningCVE/'
CSV_FILES = [
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX.csv',
    'Monday-WorkingHours.pcap_ISCX.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'Tuesday-WorkingHours.pcap_ISCX.csv',
    'Wednesday-workingHours.pcap_ISCX.csv',
]

# One DataFrame per file, keyed by the file's position in CSV_FILES.
data = {i: pd.read_csv(DATA_DIR + name) for i, name in enumerate(CSV_FILES)}

# Stack every file into a single frame. Concatenating a dict makes the dict
# key the outer level of the resulting index.
X = pd.concat(data)

# Drop the ' Label' column (the leading space is part of the column name) so
# it is not fed into the clustering pipeline.
X = X.drop(columns=' Label')

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill'); re-assigning instead of mutating in place keeps the
# cell safe to re-run.
X = X.ffill()

#### Pré-processamento de dados

In [4]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` in which every value is finite, with all columns
        cast to ``np.float64``. The original index labels of the surviving
        rows are kept.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, so the caller's data is left untouched.
    without_nan = df.dropna()
    # `axis` must be passed by keyword: pandas >= 2.0 rejects `.any(1)`.
    has_inf = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_inf].astype(np.float64)

# Remove rows with missing or infinite values and cast everything to float64.
X = clean_dataset(X)

# Standardise the features so that all attributes are on a comparable scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Rescale every sample (row) to unit norm, then wrap the resulting numpy
# array in a pandas DataFrame. Note that this does not make the data
# Gaussian; it only fixes the length of each row vector.
X_normalized = pd.DataFrame(normalize(X_scaled))

#### Reduzindo as dimensões da base

In [5]:
# Project the normalised data onto its first two principal components.
# random_state pins the result for the solvers that use randomness, so that
# a fresh "Restart & Run All" reproduces the same projection.
pca = PCA(n_components=2, random_state=42)

# Keep the projection as a DataFrame with named columns for plotting.
X_principal = pd.DataFrame(pca.fit_transform(X_normalized), columns=['P1', 'P2'])

In [5]:
# Print the two-component projection; pandas abbreviates the middle rows.
print(X_principal)

               P1        P2
0        0.145484  0.607507
1        0.083891  0.631055
2        0.085802  0.635701
3        0.069003  0.643871
4        0.145478  0.607507
...           ...       ...
2827871 -0.553494  0.016521
2827872 -0.635471 -0.381917
2827873  0.238153  0.390771
2827874 -0.565916 -0.140663
2827875 -0.588767 -0.188850

[2827876 rows x 2 columns]


#### Construindo o modelo de clusterização

In [6]:
# Create the DBSCAN estimator with scikit-learn's default hyper-parameters.
# It is only constructed here; fitting is done in a separate cell.
# A previously tried configuration, kept for reference:
# db_default = DBSCAN(eps = 0.0375, min_samples = 3).fit(X_principal)
db_default = DBSCAN()
# NOTE: the per-point cluster labels (db_default.labels_) exist only after fit().

In [None]:
# Fit DBSCAN on the two principal components; fit() returns the estimator.
db_default = db_default.fit(X_principal)

#### Visualizando cada cluster

In [None]:
# Cluster label assigned to every data point by the fitted model.
# DBSCAN marks noise points with the label -1.
labels = db_default.labels_

# Build the label -> colour mapping for whatever labels were actually found:
# noise (-1) is black, clusters cycle through a fixed palette (0 -> 'r',
# 1 -> 'g', 2 -> 'b', ...), so an unexpected extra cluster cannot raise a
# KeyError.
palette = ['r', 'g', 'b', 'c', 'm', 'y']
colours = {
    int(label): 'k' if label == -1 else palette[int(label) % len(palette)]
    for label in np.unique(labels)
}

# Colour of each individual data point.
cvec = [colours[label] for label in labels]

# Plot P1 on the X-axis and P2 on the Y-axis on a single, explicit figure.
fig, ax = plt.subplots(figsize=(9, 9))
ax.scatter(X_principal['P1'], X_principal['P2'], c=cvec)

# Legend entries are empty proxy scatters on the same axes, so the full
# dataset is drawn only once. Clusters are listed first, noise last.
for label in sorted(colours, key=lambda lab: (lab == -1, lab)):
    ax.scatter([], [], color=colours[label], label=f'Label {label}')

ax.set_xlabel('P1')
ax.set_ylabel('P2')
ax.legend()

plt.show()