In [18]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [3]:
df = pd.read_csv('anomaly.csv')

In [4]:
df.sample(5)

Unnamed: 0,typing_speed_cps,avg_pause_ms,max_pause_ms,edit_ratio,paste_events,avg_paste_size,revision_count,solution_time_sec,code_complexity,label,session_id
155,2.5,740,1876,0.27,0,0,19,1170,0.52,normal,S00463
589,4.17,453,2980,0.44,0,0,16,1649,0.32,normal,S00176
364,4.77,403,2308,0.47,0,0,19,1412,0.41,normal,S00165
594,0.95,2619,25632,0.03,3,323,3,442,0.89,anomalous,S00859
203,4.1,299,1175,0.39,0,0,20,1747,0.54,normal,S00558


In [7]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

In [8]:
df['label_encoded'].sample(5)

Unnamed: 0,label_encoded
344,1
410,1
810,1
563,0
764,1


In [10]:
X = df.drop(columns=['label', 'label_encoded', 'session_id'])
y = df['label_encoded']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled)
print(X_train_scaled)

[[ 0.28035841 -0.55280256 -0.54124089 ...  0.14269629  1.19045824
  -1.10843604]
 [ 1.39508955 -0.66147412 -0.52272251 ...  1.26188284  0.2482742
  -1.05490779]
 [ 1.38097903 -0.6823725  -0.51674506 ...  0.56239125  0.61304973
  -0.68021011]
 ...
 [ 0.01225851 -0.52249992 -0.56667436 ...  1.40178116  0.83851401
   0.01565701]
 [ 1.28926064 -0.66774364 -0.70017076 ...  0.84218788  0.66437493
  -0.46609715]
 [ 0.79539242 -0.51205073 -0.5938659  ...  1.54167948 -0.3034717
  -0.14492771]]
[[ 1.34570273 -0.73670828 -0.62422197 ...  0.42249293 -0.05234483
  -0.62668187]
 [-0.14295721 -0.62594688 -0.51815152 ...  0.28259461 -0.38229255
  -0.84079483]
 [ 1.02821601 -0.63117148 -0.63969302 ...  0.84218788  0.27210376
  -0.30551243]
 ...
 [ 0.67545299 -0.54130846 -0.56175175 ...  0.56239125  0.2482742
  -0.84079483]
 [-0.16412299 -0.73461844 -0.68282443 ...  1.54167948  0.6203819
   0.06918525]
 [ 0.14630846 -0.67558053 -0.71997839 ...  1.40178116 -0.07984047
  -0.73373835]]


In [16]:
lr = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    n_jobs=-1
)

lr.fit(X_train_scaled, y_train)

In [17]:
y_pred_lr = lr.predict(X_test_scaled)
y_prob_lr = lr.predict_proba(X_test_scaled)[:, 1]

In [19]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob_lr))

Confusion Matrix:
 [[ 60   0]
 [  0 140]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        60
           1       1.00      1.00      1.00       140

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

ROC-AUC Score: 1.0


In [20]:
coeff_df = pd.DataFrame({
    "feature": X.columns,
    "coefficient": lr.coef_[0]
}).sort_values(by="coefficient", ascending=False)

print(coeff_df)

             feature  coefficient
0   typing_speed_cps     0.809591
3         edit_ratio     0.780334
6     revision_count     0.729638
7  solution_time_sec     0.714437
4       paste_events    -0.743618
8    code_complexity    -0.822492
1       avg_pause_ms    -0.873991
5     avg_paste_size    -0.877533
2       max_pause_ms    -0.909365
