### Isolation Forest Demo: Detecting Insider Threats from Access Logs

In [2]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [16]:
# Step 1: Load the dataset
# The path is relative, so the CSV is resolved against the kernel's working directory.
ACCESS_LOG_CSV = 'synthetic_access_logs.csv'
df = pd.read_csv(ACCESS_LOG_CSV)

In [18]:
# Preview the first five rows of the freshly loaded access log.
df.head(n=5)

Unnamed: 0,user_id,timestamp,login_location,access_type,resource,device_type
0,user_20,2023-01-04 20:00:00,Bangalore,write,DevOps Dashboard,Desktop
1,user_30,2023-01-23 13:00:00,New York,read,Email,Desktop
2,user_50,2023-01-09 13:00:00,Bangalore,upload,DevOps Dashboard,Mobile
3,user_1,2023-01-11 18:00:00,New York,download,HR Portal,Mobile
4,user_29,2023-01-25 09:00:00,London,read,HR Portal,Desktop


In [20]:
# Step 2: Convert timestamp to numerical features (hour and day of week)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek

# Create human-readable flags
# These boolean columns are not model inputs; they are only read later to
# build the textual explanation for each flagged row.

# Late night covers hours 0-5 and 23. This is the vectorized form of the
# rule `h < 6 or h > 22`, avoiding a per-row Python lambda.
df['late_night'] = (df['hour'] < 6) | (df['hour'] > 22)

# na=False makes a missing value count as "no match". Without it,
# str.contains returns NaN for missing entries, and NaN is truthy in a plain
# `if`, which would attach a bogus reason to the row.
df['unknown_device'] = df['device_type'].str.lower().str.contains('unknown|unregistered', na=False)
df['sensitive_resource'] = df['resource'].str.lower().str.contains('sensitive|finance', na=False)
df['unusual_location'] = df['login_location'].str.lower().str.contains('unknown', na=False)


In [22]:
# Preview the frame again now that the hour/day-of-week and flag columns exist.
df.head(n=5)

Unnamed: 0,user_id,timestamp,login_location,access_type,resource,device_type,hour,day_of_week,late_night,unknown_device,sensitive_resource,unusual_location
0,user_20,2023-01-04 20:00:00,Bangalore,write,DevOps Dashboard,Desktop,20,2,False,False,False,False
1,user_30,2023-01-23 13:00:00,New York,read,Email,Desktop,13,0,False,False,False,False
2,user_50,2023-01-09 13:00:00,Bangalore,upload,DevOps Dashboard,Mobile,13,0,False,False,False,False
3,user_1,2023-01-11 18:00:00,New York,download,HR Portal,Mobile,18,2,False,False,False,False
4,user_29,2023-01-25 09:00:00,London,read,HR Portal,Desktop,9,2,False,False,False,False


In [24]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5050 entries, 0 to 5049
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   user_id             5050 non-null   object        
 1   timestamp           5050 non-null   datetime64[ns]
 2   login_location      5050 non-null   object        
 3   access_type         5050 non-null   object        
 4   resource            5050 non-null   object        
 5   device_type         5050 non-null   object        
 6   hour                5050 non-null   int32         
 7   day_of_week         5050 non-null   int32         
 8   late_night          5050 non-null   bool          
 9   unknown_device      5050 non-null   bool          
 10  sensitive_resource  5050 non-null   bool          
 11  unusual_location    5050 non-null   bool          
dtypes: bool(4), datetime64[ns](1), int32(2), object(5)
memory usage: 296.0+ KB


In [26]:
# Step 3: Encode categorical features to numeric
categorical_cols = ['user_id', 'login_location', 'access_type', 'resource', 'device_type']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last one, leaving the integer codes shown
# in later tables impossible to translate back to their labels.
# Decode with: encoders[col].inverse_transform(codes)
encoders = {}

# NOTE: the string columns are overwritten in place with their integer codes.
# Re-running this cell would fit on the codes themselves, so reload the data
# (Step 1) before re-running if the original labels are needed.
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

In [30]:
# Preview once more: the categorical columns now hold integer codes.
df.head(n=5)

Unnamed: 0,user_id,timestamp,login_location,access_type,resource,device_type,hour,day_of_week,late_night,unknown_device,sensitive_resource,unusual_location
0,12,2023-01-04 20:00:00,0,4,1,0,20,2,False,False,False,False
1,23,2023-01-23 13:00:00,3,2,2,0,13,0,False,False,False,False
2,45,2023-01-09 13:00:00,0,3,1,2,13,0,False,False,False,False
3,0,2023-01-11 18:00:00,3,1,4,2,18,2,False,False,False,False
4,21,2023-01-25 09:00:00,2,2,4,0,9,2,False,False,False,False


In [32]:
# Step 4: Select features for model training
# Only the encoded categoricals and the two time-derived columns are used;
# the boolean flag columns are left out of the model input.
feature_cols = [
    'user_id',
    'login_location',
    'access_type',
    'resource',
    'device_type',
    'hour',
    'day_of_week',
]
X = df.loc[:, feature_cols]

In [34]:
# Step 5: Train Isolation Forest
# contamination tells the model approximately what fraction of data are anomalies;
# random_state pins the tree construction so reruns give the same result.
forest_params = {
    'n_estimators': 100,
    'contamination': 0.01,
    'random_state': 42,
}
model = IsolationForest(**forest_params)
model.fit(X)

In [36]:
# Step 6: Predict anomalies (-1 = anomaly, 1 = normal)
# decision_function gives the continuous score (lower means more abnormal);
# predict gives the hard -1 / 1 label.
scores = model.decision_function(X)
labels = model.predict(X)

df['anomaly_score'] = scores
df['anomaly'] = labels

In [46]:
# Step 7: Filter and display the anomalies
# Rows labelled -1 by the model are the detected outliers.
is_anomaly = df['anomaly'].eq(-1)
anomalies = df[is_anomaly]

print(len(anomalies), "- Anomalies Detected:")

report_cols = [
    'user_id', 'timestamp', 'login_location', 'access_type',
    'resource', 'device_type', 'hour', 'anomaly_score',
]
anomalies[report_cols]

51 - Anomalies Detected:


Unnamed: 0,user_id,timestamp,login_location,access_type,resource,device_type,hour,anomaly_score
76,35,2023-01-19 02:00:00,5,0,5,4,2,-0.07678
204,25,2023-01-23 03:00:00,5,0,5,4,3,-0.084707
301,1,2023-01-23 02:00:00,5,0,5,4,2,-0.096721
333,21,2023-01-12 01:00:00,5,0,5,4,1,-0.087764
349,10,2023-01-15 01:00:00,5,0,5,4,1,-0.093596
427,7,2023-01-30 01:00:00,5,0,5,4,1,-0.095966
627,20,2023-01-13 03:00:00,5,0,5,4,3,-0.080151
805,11,2023-01-24 01:00:00,5,0,5,4,1,-0.09181
862,15,2023-01-14 02:00:00,5,0,5,4,2,-0.082722
1186,47,2023-01-12 02:00:00,5,0,5,4,2,-0.083912


In [42]:
# Add Explanation Column
def explain(row):
    """Describe which rule-based flags are set on a single log row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'late_night', 'unknown_device', 'sensitive_resource' and
            'unusual_location'. Each value is tested for truthiness.

    Returns:
        The messages of all set flags joined with ", ", in the fixed order
        listed below; an empty string when no flag is set.
    """
    flag_messages = (
        ('late_night', "Late night login"),
        ('unknown_device', "Unknown device"),
        ('sensitive_resource', "Accessed sensitive system"),
        ('unusual_location', "Unusual login location"),
    )
    return ", ".join(message for flag, message in flag_messages if row[flag])

# Build the explanation text for every row from its boolean flag columns.
df['explanation'] = df.apply(explain, axis=1)

# Show some flagged anomalies with reasons
# A single .loc selection picks the anomalous rows and the report columns in
# one step instead of chaining two indexing operations.
flagged = df.loc[
    df['anomaly'] == -1,
    ['user_id', 'timestamp', 'login_location', 'resource', 'device_type', 'explanation'],
]

# The default column width cuts the explanation off ("Accessed sen..."), which
# hides the very reasons this table is meant to show. Lift the limit for this
# display only, so other cells keep the default.
with pd.option_context('display.max_colwidth', None):
    display(flagged)


Unnamed: 0,user_id,timestamp,login_location,resource,device_type,explanation
76,35,2023-01-19 02:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
204,25,2023-01-23 03:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
301,1,2023-01-23 02:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
333,21,2023-01-12 01:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
349,10,2023-01-15 01:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
427,7,2023-01-30 01:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
627,20,2023-01-13 03:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
805,11,2023-01-24 01:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
862,15,2023-01-14 02:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
1186,47,2023-01-12 02:00:00,5,5,4,"Late night login, Unknown device, Accessed sen..."
