In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [88]:
# Step 1: Load the data
file_path = '/content/Train_data.csv'
data = pd.read_csv(file_path)

# Step 2: Inspect the data
print("Data Sample:\n", data.head())
print("\nData Info:\n")
data.info()

Data Sample:
    duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10              

In [89]:
# Step 3: Handle missing values
# Separate numerical and categorical columns
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data.select_dtypes(include=['object']).columns
data_class = data['class']
len(data_class)
data_class.value_counts()


Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
normal,13449
anomaly,11743


In [90]:
# Impute missing values
num_imputer = SimpleImputer(strategy='mean')
data[num_cols] = num_imputer.fit_transform(data[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
data[cat_cols] = cat_imputer.fit_transform(data[cat_cols])

In [91]:
# Step 4: Encode categorical variables

encoder = OneHotEncoder(sparse_output=False)
encoded_cat = encoder.fit_transform(data[cat_cols])
print(encoded_cat)
encoded_cat_df = pd.DataFrame(encoded_cat, columns=encoder.get_feature_names_out(cat_cols))


[[0. 1. 0. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 1.]
 [0. 1. 0. ... 0. 1. 0.]
 ...
 [0. 1. 0. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 1. 0.]]


In [92]:

encoded_cat_df.columns

Index(['protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
       'service_IRC', 'service_X11', 'service_Z39_50', 'service_auth',
       'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf',
       'service_daytime', 'service_discard', 'service_domain',
       'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i',
       'service_efs', 'service_exec', 'service_finger', 'service_ftp',
       'service_ftp_data', 'service_gopher', 'service_hostnames',
       'service_http', 'service_http_443', 'service_http_8001',
       'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell',
       'service_ldap', 'service_link', 'service_login', 'service_mtp',
       'service_name', 'service_netbios_dgm', 'service_netbios_ns',
       'service_netbios_ssn', 'service_netstat', 'service_nnsp',
       'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump',
       'service_pop_2', 'service_pop_3', 'service_printer', 'service_private',
  

In [93]:
# Add encoded columns back to the data
data = pd.concat([data[num_cols], encoded_cat_df], axis=1)
#data.shape


below code is to print vales only

In [63]:
row_index = 25190
row_data = data.iloc[row_index]

print("\nData of Row at Index 360:")
print(row_data)


Data of Row at Index 360:
duration          0.0
src_bytes         0.0
dst_bytes         0.0
land              0.0
wrong_fragment    0.0
                 ... 
flag_S2           0.0
flag_S3           0.0
flag_SF           0.0
flag_SH           0.0
class_normal      0.0
Name: 25190, Length: 116, dtype: float64


In [94]:
# Step 6: Scale numerical features
scaler = StandardScaler()
#get last column values as target and drop target from train data features
# Drop the column by index (e.g., drop the second column at index 1)
X = data.iloc[:, :-1]  # All columns except the last
y = data.iloc[:, -1]   # Only the last column

print("\nFeatures (X):")
print(X)

print("\nTarget (y):")
print(y)



Features (X):
       duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0           0.0      491.0        0.0   0.0             0.0     0.0  0.0   
1           0.0      146.0        0.0   0.0             0.0     0.0  0.0   
2           0.0        0.0        0.0   0.0             0.0     0.0  0.0   
3           0.0      232.0     8153.0   0.0             0.0     0.0  0.0   
4           0.0      199.0      420.0   0.0             0.0     0.0  0.0   
...         ...        ...        ...   ...             ...     ...  ...   
25187       0.0        0.0        0.0   0.0             0.0     0.0  0.0   
25188       0.0      334.0        0.0   0.0             0.0     0.0  0.0   
25189       0.0        0.0        0.0   0.0             0.0     0.0  0.0   
25190       0.0        0.0        0.0   0.0             0.0     0.0  0.0   
25191       0.0        0.0        0.0   0.0             0.0     0.0  0.0   

       num_failed_logins  logged_in  num_compromised  ...  flag_RSTO  \


In [95]:
X = scaler.fit_transform(X)

In [96]:
# Step 7: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [97]:
# Step 8: Train a machine learning model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [99]:
# Step 9: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\nModel Accuracy:", accuracy)
print("\nPredicted Output:", y_pred)
row_index = 10
X_test_df = pd.DataFrame(y_pred)  # Create a DataFrame from X_test
row_data = X_test_df.iloc[row_index] #Now access using iloc

print("\nPredict Data of Row :")
print(row_data)

y_test_df = pd.DataFrame(y_test)  # Create a DataFrame from X_test
yrow_data = y_test_df.iloc[row_index] #Now access using iloc

print("\n Actual Data of Row :")
print(yrow_data)


Model Accuracy: 1.0

Predicted Output: [0. 1. 0. ... 1. 1. 1.]

Predict Data of Row :
0    1.0
Name: 10, dtype: float64

 Actual Data of Row :
class_normal    1.0
Name: 2619, dtype: float64
