In [33]:
import pandas as pd

In [34]:
def load_zeek_connlog(filepath):
    """Load a Zeek tab-separated log (e.g. ``conn.log``) into a DataFrame.

    Column names come from the first ``#fields`` header line. Every line
    that *starts* with ``#`` is treated as metadata and skipped; a ``#``
    appearing inside a data field is kept as part of the value.

    Parameters
    ----------
    filepath : str or path-like
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with columns named after the ``#fields``
        header. If the file has a header but no data lines, an empty frame
        with those columns is returned.

    Raises
    ------
    ValueError
        If the file contains no ``#fields`` header line.
    """
    import io  # local import: only needed to hand the filtered text to pandas

    columns = None
    data_lines = []

    # Single pass: pick up the header and collect the data lines.
    # UTF-8 matches the encoding pandas.read_csv assumes by default.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('#'):
                if columns is None and line.startswith("#fields"):
                    columns = line.strip().split('\t')[1:]  # remove '#fields'
                continue
            data_lines.append(line)

    if columns is None:
        # Without this check the function would fail later with an opaque
        # UnboundLocalError on `columns`.
        raise ValueError(f"no '#fields' header line found in {filepath!r}")

    if not any(line.strip() for line in data_lines):
        # Header-only log: nothing to parse, but keep the schema.
        return pd.DataFrame(columns=columns)

    # Metadata lines are already filtered out, so no `comment='#'` is needed
    # (that option would also cut a data row short at any '#' inside a field).
    return pd.read_csv(io.StringIO(''.join(data_lines)), sep='\t', names=columns)


In [35]:
# Locations of the two labelled captures.
malware_log_path = r"C:/zeek/logs_malware/conn.log"
# NOTE(review): 'logs_benin' may be a typo for 'logs_benign' - confirm the directory name.
benign_log_path = r"C:/zeek/logs_benin/conn.log"

# Parse each capture into its own raw DataFrame.
df_malware = load_zeek_connlog(malware_log_path)
df_benign = load_zeek_connlog(benign_log_path)

In [36]:
# Tag every row with its class: 1 for the malware capture, 0 for the benign one.
for frame, class_id in ((df_malware, 1), (df_benign, 0)):
    frame['label'] = class_id

In [37]:
# Columns that are converted to numbers before modelling.
numeric_cols = ['duration', 'orig_bytes', 'resp_bytes']


def clean_connlog(df, numeric_columns):
    """Return a cleaned copy of a conn.log frame; the input is not modified.

    Steps:
      1. Replace the placeholder strings '-', '(empty)' and 'null' with 0
         in every column.
      2. Re-infer dtypes so columns that now hold only numbers become
         numeric instead of staying object-typed.
      3. Force ``numeric_columns`` to numeric; values that cannot be parsed
         become NaN and are then filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to clean.
    numeric_columns : iterable of str
        Names of the columns to coerce to numeric.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # pandas 2.2 deprecates the silent downcasting of object columns that
    # replace() used to perform; calling infer_objects() makes that step
    # explicit so the resulting dtypes do not depend on the deprecated path.
    cleaned = df.replace(['-', '(empty)', 'null'], 0).infer_objects()
    for col in numeric_columns:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce').fillna(0)
    return cleaned


# Same cleaning for both captures (previously written out twice).
df_malware_clean = clean_connlog(df_malware, numeric_cols)
df_benign_clean = clean_connlog(df_benign, numeric_cols)


  df_malware_clean = df_malware.replace(['-', '(empty)', 'null'], 0)
  df_benign_clean  = df_benign.replace(['-', '(empty)', 'null'], 0)


In [38]:
# Report (rows, columns) of the two labelled raw frames.
malware_shape, benign_shape = df_malware.shape, df_benign.shape
print("Malware shape:", malware_shape)
print("Benign shape:", benign_shape)

Malware shape: (433, 22)
Benign shape: (853, 22)


In [39]:
# Preview the first rows of the raw (uncleaned) malware frame, including the added label column.
df_malware.head()

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label
0,1749829000.0,CLZQuv3TZymz0uKBm5,10.6.13.133,52428,10.6.13.3,88,tcp,krb_tcp,0.004166,1909,...,-,-,0,ShADadFr,6,2161,6,2129,-,1
1,1749829000.0,CeOE0u3aSMORG6BA1j,10.6.13.133,52427,10.6.13.3,389,tcp,-,0.010668,2563,...,-,-,0,ShADdarF,8,2895,7,3339,-,1
2,1749829000.0,CeyqIBg2lghJCy9Sb,10.6.13.133,52431,23.192.223.206,80,tcp,http,0.069005,111,...,-,-,0,ShADadfF,5,323,5,399,-,1
3,1749829000.0,CIwFE43BtxJXbnebYe,10.6.13.133,52430,52.156.123.84,443,tcp,ssl,0.422278,672,...,-,-,0,ShADdaFf,12,1164,9,3539,-,1
4,1749829000.0,C6YWwu14dVBk8gSCph,10.6.13.133,61943,10.6.13.3,53,udp,dns,0.000354,39,...,-,-,0,Dd,1,67,1,146,-,1


In [40]:
# Stack the cleaned malware rows on top of the cleaned benign rows and
# renumber the index from 0 so row labels are unique.
cleaned_frames = [df_malware_clean, df_benign_clean]
df_all = pd.concat(cleaned_frames, axis=0, ignore_index=True)

In [41]:
# Categorical columns to one-hot encode, limited to those actually present.
categorical_cols = [name for name in ('proto', 'conn_state') if name in df_all.columns]

if categorical_cols:
    # NOTE: this overwrites df_all, and get_dummies replaces the encoded
    # columns, so re-running this cell finds none of them and leaves
    # categorical_cols empty.
    df_all = pd.get_dummies(df_all, columns=categorical_cols)

# Target vector, then the feature matrix: every column except the ones
# listed here (the last entry is the target itself).
non_feature_cols = ['ts', 'uid', 'id.orig_h', 'id.orig_p',
                    'id.resp_h', 'id.resp_p', 'label']
y = df_all['label']
X = df_all.drop(columns=non_feature_cols)


In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [43]:
# Show the dtype of every column in the training features.
X_train.dtypes


service              object
duration            float64
orig_bytes            int64
resp_bytes            int64
local_orig            int64
local_resp            int64
missed_bytes          int64
history              object
orig_pkts             int64
orig_ip_bytes         int64
resp_pkts             int64
resp_ip_bytes         int64
tunnel_parents        int64
proto_icmp             bool
proto_tcp              bool
proto_udp              bool
conn_state_OTH         bool
conn_state_RSTO        bool
conn_state_RSTR        bool
conn_state_RSTRH       bool
conn_state_S0          bool
conn_state_S1          bool
conn_state_S2          bool
conn_state_S3          bool
conn_state_SF          bool
conn_state_SH          bool
conn_state_SHR         bool
dtype: object

In [44]:
# Turn the boolean columns into integers.
# The column list is taken from the training split and applied to both splits.
bool_cols = X_train.select_dtypes(include=['bool']).columns
for split in (X_train, X_test):
    split[bool_cols] = split[bool_cols].astype(int)


In [45]:
# 'service' and 'history' are the two object-dtype columns left in the
# features; remove them from both splits.
object_feature_cols = ['service', 'history']
X_train = X_train.drop(object_feature_cols, axis=1)
X_test = X_test.drop(object_feature_cols, axis=1)


In [46]:
# Re-check the training dtypes after the bool conversion and the column drop.
X_train.dtypes



duration            float64
orig_bytes            int64
resp_bytes            int64
local_orig            int64
local_resp            int64
missed_bytes          int64
orig_pkts             int64
orig_ip_bytes         int64
resp_pkts             int64
resp_ip_bytes         int64
tunnel_parents        int64
proto_icmp            int32
proto_tcp             int32
proto_udp             int32
conn_state_OTH        int32
conn_state_RSTO       int32
conn_state_RSTR       int32
conn_state_RSTRH      int32
conn_state_S0         int32
conn_state_S1         int32
conn_state_S2         int32
conn_state_S3         int32
conn_state_SF         int32
conn_state_SH         int32
conn_state_SHR        int32
dtype: object

In [48]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Two-step model: feature scaling followed by a random forest with a
# fixed seed and 100 trees.
scaling_step = ('scaler', StandardScaler())
classifier_step = ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
pipeline = Pipeline(steps=[scaling_step, classifier_step])

# Train on the unscaled training split; the pipeline applies its own scaler.
pipeline.fit(X_train, y_train)


In [52]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))


[[158   6]
 [  5  89]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       164
           1       0.94      0.95      0.94        94

    accuracy                           0.96       258
   macro avg       0.95      0.96      0.95       258
weighted avg       0.96      0.96      0.96       258



In [54]:
import joblib

# Persist the fitted pipeline together with the training column index,
# so the feature layout used for training is stored next to the model.
artifacts = {
    'malware_detection_model.pkl': pipeline,
    'X_train_columns.pkl': X_train.columns,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

print("Model saved as malware_detection_model.pkl")
print("X_train columns saved as X_train_columns.pkl")

Model saved as malware_detection_model.pkl
X_train columns saved as X_train_columns.pkl


In [56]:
# Load the conn.log that should be scored.
# NOTE: this file must exist before the cell is run - the recorded run of
# this cell stopped here with FileNotFoundError.
df_new = load_zeek_connlog(r"C:\zeek\logs_new\conn.log")

# Clean as for the training data: placeholder strings become 0.
# pandas 2.2 deprecates the silent downcasting of object columns that
# replace() used to perform, so dtypes are re-inferred explicitly.
df_new_clean = df_new.replace(['-', '(empty)', 'null'], 0).infer_objects()

# Force the numeric columns to numbers; unparseable values become 0.
for col in numeric_cols:
    df_new_clean[col] = pd.to_numeric(df_new_clean[col], errors='coerce').fillna(0)

# One-hot encode the same categorical columns that were encoded for training.
if categorical_cols:
    df_new_clean = pd.get_dummies(df_new_clean, columns=categorical_cols)

# Align with the training features in one step: columns missing from the
# new data are added as 0, columns the model never saw are dropped, and
# the order is made identical to X_train.
df_new_clean = df_new_clean.reindex(columns=X_train.columns, fill_value=0)

# Predict with the fitted pipeline.
predictions = pipeline.predict(df_new_clean)

# Attach the predictions to the raw frame and show how many rows fall in each class.
df_new['predicted_label'] = predictions
df_new['predicted_label'].value_counts()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\zeek\\logs_new\\conn.log'