# Lesson 2 Assignment
Rob Lisy


- Read data
- Build a classifier
- Determine your model accuracy
- Modify data by handling class imbalance
- Use the same model on updated data
- What is the accuracy?
- Describe your findings

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Some options...
sns.set(style="whitegrid", color_codes=True)
sns.set(rc={'figure.figsize':(16.7,13.27)})
pd.options.mode.chained_assignment = None

# Classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

file = 'https://library.startlearninglabs.uw.edu/DATASCI420/2019/Datasets/Intrusion%20Detection.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [2]:
# Just under 1 million rows, 42 columns...
# the "class" column contains the outcome we're interested in.
df.shape

(97308, 42)

In [3]:
df.dtypes

duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [4]:
# How often do we see the different classes?
df['Class'].value_counts()

# Only 30 positive classes!! Whoa!

0    97278
1       30
Name: Class, dtype: int64

In [5]:
# Let's do some slight data cleaning...
cat_cols = ['protocol_type', 'service', 'flag']

for c in cat_cols:
    df[c] = df[c].astype('category')

In [6]:
# Make matrices for the continuous data and the class, make a test / train split...
X = df.loc[:, df.columns != 'Class'].copy()
X = X.drop(cat_cols, axis=1).copy()

y = df.loc[:, df.columns == 'Class'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32768)

In [7]:
# We're going to try Stochastic Gradient Descent
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(X_train, y_train.values.ravel())

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier())])

In [8]:
X_test['predicted'] = clf.predict(X_test)

In [9]:
X_test['predicted'].value_counts()

0    19456
1        6
Name: predicted, dtype: int64

In [10]:
# Make the confusion matrix to see how the model did...
confusion_matrix(y_test, X_test['predicted'])

# So we got a few right, and a few misclassifications...

array([[19456,     2],
       [    0,     4]])

In [18]:
# And show our accuracy score
m1 = accuracy_score(y_test, X_test['predicted'])
print(f"Accuracy of first model was: {m1*100}%")

Accuracy of first model was: 99.96402692841359


# Handle Class Imbalance

Now that we have a model which gives some performance (it's... meh.), let's see how we can do if we change the class imbalance.

In [12]:
# We're going to upsample the minority class
from sklearn.utils import resample


# Separate majority and minority classes
df_majority = df[df['Class']==0]
df_minority = df[df['Class']==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=len(df['Class']==0),
                                 random_state=32768)
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled['Class'].value_counts()

1    97308
0    97278
Name: Class, dtype: int64

In [13]:
# Now let's train a new model on the rebalanced classes.
# Make matrices for the continuous data and the class, make a test / train split...
X = df_upsampled.loc[:, df_upsampled.columns != 'Class'].copy()
X = X.drop(cat_cols, axis=1).copy()

y = df_upsampled.loc[:, df_upsampled.columns == 'Class'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32768)

In [14]:
# We're going to try Stochastic Gradient Descent
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(X_train, y_train.values.ravel())

X_test['predicted'] = clf.predict(X_test)
X_test['predicted'].value_counts()

1    19632
0    19286
Name: predicted, dtype: int64

In [15]:
# Now let's look at the new confusion matrix
confusion_matrix(y_test, X_test['predicted'])

array([[19286,    14],
       [    0, 19618]])

In [19]:
# And calculate our accuracy score
m2 = accuracy_score(y_test, X_test['predicted'])
print(f"Accuracy of first model was: {m2*100}%")

Accuracy of first model was: 99.96402692841359%


# Summary

Accuracy=(TP+TN)/(TP+TN+FP+FN)

The accuracy score in this case is the same, which is crazy. Upsampling this data though does seem to improve the performance.