<a href="https://colab.research.google.com/github/springboardmentor2507-alt/AI-Powered-NIDS/blob/Charitha-Pendyala/NIDS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Loading the Dataset and Assigning Column Names**

In [4]:
import pandas as pd

df = pd.read_csv("KDDTrain+_20Percent.txt", header=None)

df.columns = columns
print("STEP 1: RAW DATA LOADED")
print(df.head(), "\n")
print("Shape:", df.shape)


STEP 1: RAW DATA LOADED
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0       0    0  ...                    0.17   
1               0       0    0  ...                    0.00   
2               0       0    0  ...                    0.10   
3               0       0    0  ...                    1.00   
4               0       0    0  ...                    1.00   

   dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0                    0.03                         0.17   
1                    0.60                       

**Assigning Descriptive Column Names to the NSL-KDD Dataset**

In [5]:
columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes",
    "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
    "num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count",
    "dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate",
    "dst_host_srv_serror_rate","dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label","difficulty"
]

df.columns = columns

print("\nSTEP 2: AFTER ASSIGNING COLUMN NAMES")
print(df.head(), "\n")
print("Columns:", df.columns.tolist())



STEP 2: AFTER ASSIGNING COLUMN NAMES
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0       0    0  ...                    0.17   
1               0       0    0  ...                    0.00   
2               0       0    0  ...                    0.10   
3               0       0    0  ...                    1.00   
4               0       0    0  ...                    1.00   

   dst_host_diff_srv_rate  dst_host_same_src_port_rate  \
0                    0.03                         0.17   
1                    0.60         

**Defining Attack Category Lists for NSL-KDD Classification**

In [6]:
dos_attacks = [
    'back','land','neptune','pod','smurf','teardrop',
    'apache2','mailbomb','processtable','udpstorm'
]

probe_attacks = [
    'satan','ipsweep','nmap','portsweep','mscan','saint'
]

r2l_attacks = [
    'guess_passwd','ftp_write','imap','phf','multihop',
    'warezmaster','warezclient','spy','xlock','xsnoop',
    'snmpgetattack','snmpguess','httptunnel','sendmail','named'
]

u2r_attacks = [
    'buffer_overflow','rootkit','perl','loadmodule','ps','sqlattack','xterm'
]

print("\nSTEP 3: ATTACK CATEGORY LISTS DEFINED")
print("DoS attacks:", dos_attacks)
print("Probe attacks:", probe_attacks)
print("R2L attacks:", r2l_attacks)
print("U2R attacks:", u2r_attacks)



STEP 3: ATTACK CATEGORY LISTS DEFINED
DoS attacks: ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'apache2', 'mailbomb', 'processtable', 'udpstorm']
Probe attacks: ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint']
R2L attacks: ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop', 'snmpgetattack', 'snmpguess', 'httptunnel', 'sendmail', 'named']
U2R attacks: ['buffer_overflow', 'rootkit', 'perl', 'loadmodule', 'ps', 'sqlattack', 'xterm']


**Creating a Function to Map Raw Attack Labels to Attack Categories**

In [7]:
def map_attack_category(label):
    if label in dos_attacks:
        return "DoS"
    elif label in probe_attacks:
        return "Probe"
    elif label in r2l_attacks:
        return "R2L"
    elif label in u2r_attacks:
        return "U2R"
    elif label == "normal":
        return "normal"
    else:
        return "unknown"


In [8]:
df["attack_category"] = df["label"].apply(map_attack_category)

print("\nSTEP 5: AFTER MAPPING RAW LABELS → ATTACK CATEGORIES")
print(df[["label", "attack_category"]].head(20))  # show more rows

print("\nCategory Counts:")
print(df["attack_category"].value_counts())



STEP 5: AFTER MAPPING RAW LABELS → ATTACK CATEGORIES
          label attack_category
0        normal          normal
1        normal          normal
2       neptune             DoS
3        normal          normal
4        normal          normal
5       neptune             DoS
6       neptune             DoS
7       neptune             DoS
8       neptune             DoS
9       neptune             DoS
10      neptune             DoS
11      neptune             DoS
12       normal          normal
13  warezclient             R2L
14      neptune             DoS
15      neptune             DoS
16       normal          normal
17      ipsweep           Probe
18       normal          normal
19       normal          normal

Category Counts:
attack_category
normal    13449
DoS        9234
Probe      2289
R2L         209
U2R          11
Name: count, dtype: int64


**Checking the Dataset for Missing Values**

In [10]:
print("\nSTEP 6: CHECKING FOR MISSING VALUES")
print(df.isnull().sum())



STEP 6: CHECKING FOR MISSING VALUES
duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_s

**Identifying and Removing Duplicate Rows from the Dataset**

In [11]:
print("\nSTEP 7: REMOVING DUPLICATE ROWS")
before = df.shape[0]

df.drop_duplicates(inplace=True)

after = df.shape[0]
print("Rows before:", before)
print("Rows after:", after)
print("Duplicates removed:", before - after)



STEP 7: REMOVING DUPLICATE ROWS
Rows before: 25192
Rows after: 25192
Duplicates removed: 0


**Dropping Irrelevant Features (Removing the ‘difficulty’ Column)**

In [12]:
print("\nSTEP 8: DROPPING IRRELEVANT COLUMN → difficulty")
df.drop(columns=["difficulty"], inplace=True)

print("Remaining Columns:", df.columns.tolist())
print(df.head())



STEP 8: DROPPING IRRELEVANT COLUMN → difficulty
Remaining Columns: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'attack_category']
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     o

**Encoding Categorical Features Using Label Encoding**

In [13]:
from sklearn.preprocessing import LabelEncoder

print("\nSTEP 9: ENCODING CATEGORICAL FEATURES")

cat_cols = ["protocol_type", "service", "flag"]
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
    print(f"Encoded {col}:")
    print(df[col].head())
    print()



STEP 9: ENCODING CATEGORICAL FEATURES
Encoded protocol_type:
0    1
1    2
2    1
3    1
4    1
Name: protocol_type, dtype: int64

Encoded service:
0    19
1    41
2    46
3    22
4    22
Name: service, dtype: int64

Encoded flag:
0    9
1    9
2    5
3    9
4    9
Name: flag, dtype: int64



**Normalizing Numerical Features Using StandardScaler**

In [14]:
from sklearn.preprocessing import StandardScaler

print("\nSTEP 10: NORMALIZING NUMERICAL FEATURES")

num_cols = df.drop(columns=["label","attack_category"]).columns

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print("Numerical features normalized.")
print(df[num_cols].head())



STEP 10: NORMALIZING NUMERICAL FEATURES
Numerical features normalized.
   duration  protocol_type   service      flag  src_bytes  dst_bytes     land  \
0 -0.113551      -0.126061 -0.645384  0.753021  -0.009889  -0.039310 -0.00891   
1 -0.113551       2.215916  0.768925  0.753021  -0.010032  -0.039310 -0.00891   
2 -0.113551      -0.126061  1.090360 -0.739924  -0.010093  -0.039310 -0.00891   
3 -0.113551      -0.126061 -0.452524  0.753021  -0.009996   0.052473 -0.00891   
4 -0.113551      -0.126061 -0.452524  0.753021  -0.010010  -0.034582 -0.00891   

   wrong_fragment    urgent       hot  ...  dst_host_count  \
0       -0.091223 -0.006301 -0.091933  ...       -0.328634   
1       -0.091223 -0.006301 -0.091933  ...        0.732059   
2       -0.091223 -0.006301 -0.091933  ...        0.732059   
3       -0.091223 -0.006301 -0.091933  ...       -1.540854   
4       -0.091223 -0.006301 -0.091933  ...        0.732059   

   dst_host_srv_count  dst_host_same_srv_rate  dst_host_diff_srv_rat

**Splitting the Dataset into Training and Testing Sets**

In [15]:
from sklearn.model_selection import train_test_split

print("\nSTEP 11: TRAIN–TEST SPLIT")

X = df.drop(columns=["label", "attack_category"])
y = df["attack_category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)



STEP 11: TRAIN–TEST SPLIT
Training set shape: (20153, 41)
Testing set shape: (5039, 41)
Training labels shape: (20153,)
Testing labels shape: (5039,)
