# **Intrusion Detection System Using Logistic Regression**

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


## Inputing the Data

In [2]:
df_train = pd.read_csv('https://raw.githubusercontent.com/raja045/Machine-Learning/DatasetsUsedForProjects/IDS_DataSet.csv')

# You can also input csv file directly,
#df_train = pd.read_csv(' filepath in google colab ').

In [3]:
df_train.sample(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome,level
72879,0,tcp,http,SF,0,1101,0,0,0,0,...,1.0,0.0,0.17,0.08,0.17,0.0,0.0,0.0,normal,20
100905,0,tcp,http,SF,255,285,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
117370,0,tcp,ftp_data,SF,2191,0,0,0,0,0,...,0.49,0.02,0.49,0.0,0.0,0.0,0.0,0.0,normal,20
14150,0,tcp,http,SF,229,280,0,0,0,0,...,1.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,normal,21
2923,0,tcp,http,SF,335,3490,0,0,0,0,...,1.0,0.0,0.07,0.05,0.0,0.0,0.0,0.0,normal,21
69306,0,udp,domain_u,SF,45,131,0,0,0,0,...,0.71,0.02,0.01,0.0,0.0,0.0,0.0,0.0,normal,21
71639,0,tcp,whois,S0,0,0,0,0,0,0,...,0.07,0.08,0.0,0.0,1.0,1.0,0.0,0.0,neptune,20
47077,0,tcp,ftp,RSTO,0,0,0,0,0,0,...,0.08,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,20
42194,0,tcp,ftp_data,SF,15377,0,0,0,0,0,...,1.0,0.0,1.0,0.08,0.0,0.0,0.0,0.0,normal,19
26854,0,tcp,iso_tsap,REJ,0,0,0,0,0,0,...,0.07,0.09,0.0,0.0,0.0,0.0,1.0,1.0,neptune,18


In [4]:
df_train.shape


(125972, 43)

In [5]:
df_train.info()
# info() method plays crucial role here, Because columns which are having object Dtype has to be converted into numerical form.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125972 entries, 0 to 125971
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125972 non-null  int64  
 1   protocol_type                125972 non-null  object 
 2   service                      125972 non-null  object 
 3   flag                         125972 non-null  object 
 4   src_bytes                    125972 non-null  int64  
 5   dst_bytes                    125972 non-null  int64  
 6   land                         125972 non-null  int64  
 7   wrong_fragment               125972 non-null  int64  
 8   urgent                       125972 non-null  int64  
 9   hot                          125972 non-null  int64  
 10  num_failed_logins            125972 non-null  int64  
 11  logged_in                    125972 non-null  int64  
 12  num_compromised              125972 non-null  int64  
 13 

In [6]:
df_train.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,level
count,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,...,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0
mean,287.146929,45567.1,19779.27,0.000198,0.022688,0.000111,0.204411,0.001222,0.395739,0.279253,...,115.653725,0.521244,0.082952,0.148379,0.032543,0.284455,0.278487,0.118832,0.120241,19.504056
std,2604.525522,5870354.0,4021285.0,0.014086,0.253531,0.014366,2.149977,0.045239,0.489011,23.942137,...,110.702886,0.44895,0.188922,0.308998,0.112564,0.444785,0.44567,0.306559,0.31946,2.291512
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0,20.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0,21.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


## Data Preprocessing

In [7]:
Missingvalues = df_train.isnull().sum()
Missingvalues

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [8]:
# info() tells us which columns are having Dtype 'object'
# Convert those columns into numerical form.
# By using label_encoders.

categorical_columns = ['protocol_type', 'service', 'flag', 'outcome']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    label_encoders[col] = le


In [9]:
# This Data preprocessing technique is called Feature scaling or normalization
# This ensures that all features contribute equally and are treated fairly during training.

numerical_columns = df_train.columns.difference(categorical_columns + ['level'])
scaler = StandardScaler()
df_train[numerical_columns] = scaler.fit_transform(df_train[numerical_columns])


## Training the Model

In [10]:
X = df_train.drop(columns=['outcome', 'level'])
y = df_train['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Building the Model

In [11]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Performance evaluation

In [12]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.9855526890256003
Precision: 0.985566052986974
Recall: 0.9855526890256003
F1 Score: 0.9854806104329116


  _warn_prf(average, modifier, msg_start, len(result))
