In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
os.environ['MLFLOW_TRACKING_USERNAME'] = 'sinanshamsudheen'
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'cd101ef722400d4771c17b6f8a661487a22abd3b'

In [3]:
df = pd.read_csv("noisy_balanced_synthetic_auth_dataset.csv")
df.sample(5)

Unnamed: 0,source_ip,user,event_type,timestamp,attempts_in_30s,unique_users_in_30s,invalid_user,success_after_fail,bruteforce
1793,10.0.215.106,admin,failed,2025-06-20 19:16:05.381193,5,2,1,0,1
1504,192.168.205.170,bob,failed,2025-06-20 19:48:24.319895,4,4,1,0,0
822,10.0.245.87,root,failed,2025-06-20 19:52:29.420936,7,2,1,0,1
1861,10.0.195.161,root,failed,2025-06-20 19:09:21.373765,9,8,1,0,1
1500,10.0.89.17,root,failed,2025-06-20 19:24:54.415642,7,4,0,0,1


In [4]:
df.dtypes

source_ip              object
user                   object
event_type             object
timestamp              object
attempts_in_30s         int64
unique_users_in_30s     int64
invalid_user            int64
success_after_fail      int64
bruteforce              int64
dtype: object

In [5]:
df.event_type.value_counts()

event_type
failed     1100
success     900
Name: count, dtype: int64

In [6]:
df['event_success'] = df['event_type'].apply(lambda x: 1 if x == 'success' else 0)

In [7]:
df.user.value_counts()

user
admin      433
alice      253
bob        251
charlie    246
user       226
guest      198
test       198
root       195
Name: count, dtype: int64

In [8]:
X = df.drop(columns=["user",'source_ip','timestamp','event_type','bruteforce'])
y = df.bruteforce

In [9]:
X.sample(10)

Unnamed: 0,attempts_in_30s,unique_users_in_30s,invalid_user,success_after_fail,event_success
1499,17,6,0,0,0
1170,1,1,0,0,0
445,1,1,0,1,1
1279,17,2,1,0,0
1796,1,1,0,1,1
1377,1,1,0,1,1
687,3,2,1,1,1
546,1,1,0,0,0
1337,1,1,0,1,1
1434,1,1,0,1,1


In [10]:
X.columns

Index(['attempts_in_30s', 'unique_users_in_30s', 'invalid_user',
       'success_after_fail', 'event_success'],
      dtype='object')

In [11]:
df.bruteforce.value_counts()

bruteforce
1    1000
0    1000
Name: count, dtype: int64

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [13]:
models = [
    (
        "Logistic Regression", 
        {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'},
        LogisticRegression(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        {"n_estimators": 30, "max_depth": 3},
        RandomForestClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier",
        {"use_label_encoder": False, "eval_metric": 'logloss'},
        XGBClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
]

In [14]:
reports = []

for model_name, params, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]
    
    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [15]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [16]:
import dagshub
dagshub.init(repo_owner='sinanshamsudheen', repo_name='TLflow', mlflow=True)

In [17]:
# Ideally you will not require following 4 lines if you have started fresh and do not have any previous dagshub credentials on your computer
# import os
# os.environ['MLFLOW_TRACKING_USERNAME'] = 'your user name' # 'learnpythonlanguage'
# os.environ['MLFLOW_TRACKING_PASSWORD'] = 'your password' # 
os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/sinanshamsudheen/TLflow.mlflow' # https://dagshub.com/learnpythonlanguage/mlflow_dagshub_demo.mlflow

# Initialize MLflow
mlflow.set_experiment("Bruteforce Detection")
# mlflow.set_tracking_uri("http://localhost:5000")

for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]
    
    with mlflow.start_run(run_name=model_name):        
        mlflow.log_params(params)
        mlflow.log_metrics({
            'accuracy': report['accuracy'],
            'precision_class_1': report['1']['precision'],
            'precision_class_0': report['0']['precision'],
            'recall_class_1': report['1']['recall'],
            'recall_class_0': report['0']['recall'],
            'f1_score_macro': report['macro avg']['f1-score']
        })  
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

In [18]:
print(reports)

[{'0': {'precision': 0.9605911330049262, 'recall': 0.975, 'f1-score': 0.9677419354838709, 'support': 200.0}, '1': {'precision': 0.9746192893401016, 'recall': 0.96, 'f1-score': 0.9672544080604534, 'support': 200.0}, 'accuracy': 0.9675, 'macro avg': {'precision': 0.9676052111725139, 'recall': 0.9675, 'f1-score': 0.9674981717721621, 'support': 400.0}, 'weighted avg': {'precision': 0.9676052111725139, 'recall': 0.9675, 'f1-score': 0.9674981717721621, 'support': 400.0}}, {'0': {'precision': 0.964824120603015, 'recall': 0.96, 'f1-score': 0.9624060150375939, 'support': 200.0}, '1': {'precision': 0.9601990049751243, 'recall': 0.965, 'f1-score': 0.9625935162094762, 'support': 200.0}, 'accuracy': 0.9625, 'macro avg': {'precision': 0.9625115627890697, 'recall': 0.9624999999999999, 'f1-score': 0.962499765623535, 'support': 400.0}, 'weighted avg': {'precision': 0.9625115627890696, 'recall': 0.9625, 'f1-score': 0.9624997656235351, 'support': 400.0}}, {'0': {'precision': 0.9693877551020408, 'recall':

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Logistic Regression model
lr = LogisticRegression()

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],                # Regularization strength
    'penalty': ['l1', 'l2'],                     # Regularization type
    'solver': ['liblinear', 'saga'],             # Solvers compatible with l1 and l2
    'max_iter': [1000, 2000, 3000]                  # Max iterations
}

# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,
    scoring='f1',     # or 'accuracy', 'roc_auc', etc.
    verbose=2,
    n_jobs=-1
)

# Train on training set
grid_search.fit(X_train, y_train)

# Output best parameters and best score
print("✅ Best Parameters:", grid_search.best_params_)
print("🏆 Best F1 Score (CV):", grid_search.best_score_)

# Predict on test set
y_pred = grid_search.predict(X_test)

# Evaluate performance
print("\n📊 Classification Report on Test Set:\n")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 60 candidates, totalling 300 fits
✅ Best Parameters: {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'}
🏆 Best F1 Score (CV): 0.974368093365548

📊 Classification Report on Test Set:

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       200
           1       0.97      0.96      0.97       200

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400



In [20]:
model_LR = LogisticRegression(
    C=1,
    max_iter=1000,
    penalty='l2',
    solver='liblinear'
)
model_LR.fit(X_train, y_train)

In [21]:
y_pred = model_LR.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       200
           1       0.97      0.96      0.97       200

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400



In [23]:
import joblib
joblib.dump(model_LR, 'bruteforce_model.pkl')

['bruteforce_model.pkl']

[CV] END C=0.01, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END .....C=0.01, max_iter=1000, penalty=l2, solver=saga; total time=   0.0s
[CV] END C=0.01, max_iter=2000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=2000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=2000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END C=0.01, max_iter=2000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .....C=0.01, max_iter=2000, penalty=l2, solver=saga; total time=   0.1s
[CV] END C=0.01, max_iter=3000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=1000, penalty=l1, solver=liblinear; total time=   0.0s
[CV] END ......C=0.1, max_iter=1000, penalty=l1, solver=saga; total time=   0.1s
[CV] END .C=0.1, max_iter=2000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=2000, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .C=0.1, max_iter=30

In [24]:
X.columns

Index(['attempts_in_30s', 'unique_users_in_30s', 'invalid_user',
       'success_after_fail', 'event_success'],
      dtype='object')