In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler # For scaling to unit scale, before PCA application
from sklearn.decomposition import PCA # For PCA dimensionality reduction technique
import matplotlib.pyplot as plt # MatPlotLib for graphing data visually. Seaborn more likely to be used.
from sklearn.preprocessing import LabelBinarizer # For converting categorical data into numeric, for modeling stage
from sklearn.model_selection import StratifiedKFold # For optimal train_test splitting, for model input data
from sklearn.model_selection import train_test_split # For basic dataset splitting
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors ML classifier (default n. of neighbors = 5)
#from scikitplot.metrics import plot_confusion_matrix # For plotting confusion matrices
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score # For getting the accuracy of a model's predictions
from sklearn.metrics import classification_report # Various metrics for model performance
import warnings
warnings.filterwarnings('ignore')

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\ProgramData\anaconda3\Lib\site-packages\sklearn\metrics\__init__.py)

# Useful environment variables

In [None]:
# 'Reduced dimensions' variable for altering the number of PCA principal components. Can be altered for needs.
# Only 7 principal components needed when using non-normalised PCA dataset.
dimensions_num_for_PCA = 7

# Max number of permutations to run. Can be altered for needs.
number_of_permutations = 100

# 10 folds is usually the heuristic to follow for larger datasets of around this size.
num_of_splits_for_skf = 10
# Seed value to pass into models so that repeated runs result in the same output
seed_val = 1

# Number of statistical distance measures to run (for the results, columns section)
num_of_statistical_dist_measures = 6

In [None]:
def get_PCA_feature_names(num_of_pca_components):
    feature_names = []
    for i in range(num_of_pca_components):    
        feature_names.append(f"Principal component {i+1}")
    return feature_names

In [None]:
# See documentation above to understand what each step does, and why.
def train_model_predict(model, model_name, X, y, skf):
    for train_index, test_index in skf.split(X, y): # 1)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index] # 2)
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        reshaped_y_train = np.asarray(y_train).reshape(-1, 1) # 3)
        reshaped_y_test = np.asarray(y_test).reshape(-1, 1)
        
    model.fit(X_train, reshaped_y_train.ravel()) # 4)
    pred_y = model.predict(X_test) # 5)
    score = classification_report(reshaped_y_test, pred_y) # 6)
    print('Classification report: \n', score, '\n')
    cm=plot_confusion_matrix(reshaped_y_test, pred_y)
        
    return accuracy_score(reshaped_y_test, pred_y), X_train, X_test, y_train, pred_y

In [None]:
data=pd.read_csv('Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.columns = data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
data.head()

In [None]:
data.dtypes

# Fixing issues with ScikitLearn's PCA transform on this dataset

In [None]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep]

In [None]:
data_cleaned = data.copy()
data_cleaned = clean_dataset(data_cleaned) # see methods at top of notebook
data_cleaned

In [None]:
data_cleaned = data_cleaned.reset_index()
# Removing un-needed index column added by reset_index method
data_cleaned.drop('index', axis=1, inplace=True)
data_cleaned

### Considerations before PCA can be used correctly (before Data Preparation feature selection via PCA)

In [None]:
data.columns.tolist()

In [None]:
# Saving the label attribute before dropping it.
data_labels = data_cleaned['label']
# Shows all the possible labels/ classes a model can predict.
# Need to alter these to numeric 0, 1, etc... for model comprehension (e.g. pd.get_dummies()).
data_labels.unique()

In [None]:
# Axis=1 means columns. Axis=0 means rows. inplace=False means that the original 'df' isn't altered.
data_no_labels = data_cleaned.drop('label', axis=1, inplace=False)
# Getting feature names for the StandardScaler process
data_features = data_no_labels.columns.tolist()
# Printing out Dataframe with no label column, to show successful dropping
data_no_labels

# Using StandardScaler to transform features into unit scale (optional for PCA)

In [None]:
data_no_labels.describe()

In [None]:
print(np.any(np.isnan(data_no_labels)))

print(np.any(np.isfinite(data_no_labels)))

In [None]:
final=data_no_labels.replace([np.inf, -np.inf], np.nan)

In [None]:
final.dropna(inplace=True)


In [None]:
print(np.any(np.isnan(final)))
print(np.all(np.isfinite(final)))

In [None]:
data_scaled = StandardScaler().fit_transform(final.select_dtypes(include=['float64','int64']))
# Converting back to dataframe
data_scaled = pd.DataFrame(data = data_scaled, columns = data_features)
data_scaled

# Plotting principle component variance

####  The plot below shows that using the first 30 PCA components actually describes most/ all (99.9%) of the variation (information) within the Normalised dataset. This is a huge dimension reduction from the initial 78 features, down to just 30.

In [None]:
pca_test = PCA().fit(data_scaled)
plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
# The df_no_labels dataset holds the un-normalised dataset.
pca_test = PCA().fit(data_no_labels)
plt.plot(np.cumsum(pca_test.explained_variance_ratio_))
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

# Important note on these above scree plot results 

# Now fitting and transforming the data with PCA

In [None]:
pca = PCA(n_components=dimensions_num_for_PCA)
#principal_components = pca.fit(df_scaled).transform(df_scaled) => for normalised PCA

# Non-normalised PCA
principal_components = pca.fit(data_no_labels).transform(data_no_labels)
principal_components

# Getting Principal Component feature names, dynamically

In [None]:
# See Methods at the top of the notebook
principal_component_headings = get_PCA_feature_names(dimensions_num_for_PCA)

In [None]:
data_pc = pd.DataFrame(data = principal_components, columns = principal_component_headings)
data_pc

In [None]:
data_final = pd.concat([data_pc, data_labels], axis = 1)
# Scroll to the RHS end of dataframe to see attached label feature
data_final

In [None]:
lb = LabelBinarizer()
data_final['label'] = lb.fit_transform(data_final['label'])
data_final

In [None]:
print("Before LabelBinarizer: ", data_labels.unique())
print("After LabelBinarizer: ", data_final['label'].unique())

In [None]:
# Separating the label so that the answers aren't provided to the model, in training.
X = data_final.drop(['label'], axis = 1)
y = data_final['label']
y

In [None]:
skf = StratifiedKFold(n_splits=num_of_splits_for_skf, shuffle=False)
skf

In [None]:
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    reshaped_y_train = np.asarray(y_train).reshape(-1, 1)
    reshaped_y_test = np.asarray(y_test).reshape(-1, 1)
    
print( 'X_train length: ', len(X_train) ) # To check if splits worked
print( 'y_train length: ', len(y_train) )
print( 'X_test length: ', len(X_test) )
print( 'y_test length: ', len(y_test) )

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5, weights='uniform',
                                    algorithm='auto', leaf_size=30,
                                    p=2, metric='minkowski',
                                    metric_params=None, n_jobs=None)

In [None]:
# Unpacking the method return values. Last 4 are needed for statistical distance measure methods.
accuracy, X_train, X_test, y_train, pred_y = train_model_predict(knn_model, "K-Nearest Neighbor", X, y, skf)
print("Model accuracy= ", accuracy*100, "%\n")
print("Dataset labels: ", data_labels.unique())
print("Dataset numeric labels after encoding for model: ", data_final['label'].unique())