In [None]:
import pandas as pd

# Read the raw training file. header=None makes pandas treat the first line as
# data and label the columns with integers instead of names.
train_path = '/KDDTrain+.txt'
df = pd.read_csv(train_path, header=None)

# Display the frame's dimensions together with its first five rows.
(df.shape, df.head())


((125973, 43),
    0    1         2   3    4     5   6   7   8   9   ...    33    34    35  \
 0   0  tcp  ftp_data  SF  491     0   0   0   0   0  ...  0.17  0.03  0.17   
 1   0  udp     other  SF  146     0   0   0   0   0  ...  0.00  0.60  0.88   
 2   0  tcp   private  S0    0     0   0   0   0   0  ...  0.10  0.05  0.00   
 3   0  tcp      http  SF  232  8153   0   0   0   0  ...  1.00  0.00  0.03   
 4   0  tcp      http  SF  199   420   0   0   0   0  ...  1.00  0.00  0.00   
 
      36    37    38    39    40       41  42  
 0  0.00  0.00  0.00  0.05  0.00   normal  20  
 1  0.00  0.00  0.00  0.00  0.00   normal  15  
 2  0.00  1.00  1.00  0.00  0.00  neptune  19  
 3  0.04  0.03  0.01  0.00  0.01   normal  21  
 4  0.00  0.00  0.00  0.00  0.00   normal  21  
 
 [5 rows x 43 columns])

In [None]:
# Column labels for the NSL-KDD file, listed in file order: the 41 feature
# names first, then the two trailing columns appended below.
feature_names = [
    # columns 0-8
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent",
    # columns 9-21
    "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    # columns 22-30
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    # columns 31-40 (the dst_host_* family)
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
]

# Columns 41-42: the attack label and the difficulty level.
feature_names.extend(["attack_type", "difficulty_level"])

# Replace the integer column labels with the names above; pandas rejects the
# assignment if the number of names differs from the number of columns.
df.columns = feature_names

# Preview the first rows to confirm the names line up with the data.
df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [None]:
# Attack taxonomy: every raw label belongs to one of four attack families.
# Based on the mentor's lists, with two corrections:
#   * 'buffer_overflow' is a U2R (privilege-escalation) attack in the KDD Cup 99
#     taxonomy, not DoS, so it now lives in u2r_attacks.
#   * 'xterm' (U2R) was absent from every list, so it would have been silently
#     labelled "normal" by the mapping function.
dos_attacks = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
               'apache2', 'mailbomb', 'processtable', 'udpstorm', 'worm']

probe_attacks = ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint']

r2l_attacks = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop',
               'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop',
               'snmpgetattack', 'snmpguess', 'httptunnel', 'sendmail',
               'named']

u2r_attacks = ['buffer_overflow', 'rootkit', 'perl', 'loadmodule', 'ps',
               'sqlattack', 'xterm']

# Reverse lookup (raw label -> category), built once from the lists above so
# that mapping a label is a single dict access instead of up to four list scans.
_attack_category_by_label = {
    attack: category
    for category, attacks in (
        ("DoS", dos_attacks),
        ("Probe", probe_attacks),
        ("R2L", r2l_attacks),
        ("U2R", u2r_attacks),
    )
    for attack in attacks
}
# Benign traffic is matched explicitly rather than being the fall-through case.
_attack_category_by_label["normal"] = "normal"


def map_attack(label):
    """Map a raw attack label to its coarse category.

    Parameters
    ----------
    label : str
        Raw label from the ``attack_type`` column. Leading and trailing
        whitespace is ignored.

    Returns
    -------
    str
        One of ``"DoS"``, ``"Probe"``, ``"R2L"``, ``"U2R"`` or ``"normal"``.

    Raises
    ------
    ValueError
        If the label is neither ``"normal"`` nor a listed attack. Unknown
        labels used to be returned as ``"normal"``, which silently counted
        unlisted attacks as benign traffic.
    """
    label = label.strip()  # tolerate stray spaces around the label
    try:
        return _attack_category_by_label[label]
    except KeyError:
        raise ValueError(
            f"Unrecognized attack label {label!r}: add it to one of the "
            "attack lists before mapping"
        ) from None

# Derive the coarse category for every row from its raw attack label.
raw_labels = df['attack_type']
df['attack_category'] = raw_labels.apply(map_attack)

# Preview the raw label next to its derived category for the first ten rows.
label_columns = ['attack_type', 'attack_category']
df[label_columns].head(10)


Unnamed: 0,attack_type,attack_category
0,normal,normal
1,normal,normal
2,neptune,DoS
3,normal,normal
4,normal,normal
5,neptune,DoS
6,neptune,DoS
7,neptune,DoS
8,neptune,DoS
9,neptune,DoS


In [None]:
# Inspect the column labels currently on the frame.
df.columns


Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'attack_type', 'difficulty_level',
       'attack_category'],
      dtype='object')

In [None]:
# Tally the number of rows in each attack category (sorted by frequency).
df['attack_category'].value_counts()


Unnamed: 0_level_0,count
attack_category,Unnamed: 1_level_1
normal,67343
DoS,45957
Probe,11656
R2L,995
U2R,22


In [None]:
# Spot-check ten random rows of raw label vs. derived category. The fixed
# random_state keeps the sample identical across kernel restarts and re-runs.
df[['attack_type', 'attack_category']].sample(10, random_state=42)


Unnamed: 0,attack_type,attack_category
112883,normal,normal
58780,satan,Probe
7715,normal,normal
90745,normal,normal
52040,neptune,DoS
122680,normal,normal
14370,normal,normal
21917,normal,normal
8361,normal,normal
33051,neptune,DoS


In [None]:
# Check the frame's (rows, columns) dimensions after adding the derived column.
df.shape


(125973, 44)

In [None]:
# Preview the first five rows of the frame, including 'attack_category'.
df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,difficulty_level,attack_category
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,DoS
3,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21,normal
