### Synthetic Data Generation for Isolation Forest

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Pin both pseudo-random generators so every run rebuilds the same dataset
random.seed(42)
np.random.seed(42)

# Dataset size and the first calendar day of the simulated window
n_users, n_records = 50, 5000
start_date = datetime(2023, 1, 1)

# Vocabularies that ordinary (non-anomalous) events are sampled from
locations = [
    'New York',
    'San Francisco',
    'London',
    'Berlin',
    'Bangalore',
]
devices = [
    'Desktop',
    'Laptop',
    'Mobile',
    'Tablet',
]
resources = [
    'Email',
    'HR Portal',
    'CRM',
    'Finance System',
    'DevOps Dashboard',
]
access_types = [
    'read',
    'write',
    'download',
    'upload',
]

# User identifiers run from user_1 through user_<n_users>
user_ids = ['user_' + str(n) for n in range(1, n_users + 1)]

# Function to create random timestamps
def random_timestamp():
    """Return a pseudo-random datetime for a routine access event.

    The result is ``start_date`` shifted by a whole number of days drawn
    uniformly from 0 through 30 and a whole number of hours drawn uniformly
    from 8 through 20 (both ranges inclusive). Only whole days and hours are
    added, so any finer-grained time component comes from ``start_date``.

    Both draws use the module-level ``random`` generator, day first and hour
    second, so the output is reproducible once ``random.seed`` has been set.
    """
    day_offset = random.randint(0, 30)
    hour_of_day = random.randint(8, 20)  # normal working hours
    return start_date + timedelta(days=day_offset, hours=hour_of_day)

# Generate synthetic normal data.
# The fields of each record are drawn in a fixed order (user, timestamp,
# location, access type, resource, device) so the seeded `random` stream
# always produces the same rows.
data = [
    [
        random.choice(user_ids),
        random_timestamp(),
        random.choice(locations),
        random.choice(access_types),
        random.choice(resources),
        random.choice(devices),
    ]
    for _ in range(n_records)
]

# Build the anomalous records.
# Each one pairs a random known user with a late-night hour (01:00-03:00) and
# with location / access type / resource / device values that never occur in
# the normal vocabularies above.
n_anomalies = 50
anomalies = [
    [
        random.choice(user_ids),
        start_date + timedelta(days=random.randint(0, 30), hours=random.choice([1, 2, 3])),  # late night
        'Unknown-Country',
        'admin_override',
        'Sensitive Database',
        'Unregistered_Device',
    ]
    for _ in range(n_anomalies)
]

# Construct the frame once from all rows. Appending with `df.loc[len(df)] = ...`
# inside a loop re-enlarges the DataFrame on every iteration.
df = pd.DataFrame(data + anomalies, columns=[
    'user_id', 'timestamp', 'login_location', 'access_type', 'resource', 'device_type'
])

# Sanity check: every normal and anomalous record made it into the frame
assert len(df) == n_records + n_anomalies, "unexpected number of generated rows"

# Shuffle the dataset with an explicit seed so the row order does not depend
# on whatever state the global NumPy generator happens to be in.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save or preview
print(df.head())
df.to_csv('synthetic_access_logs.csv', index=False)


   user_id           timestamp login_location access_type          resource  \
0  user_20 2023-01-04 20:00:00      Bangalore       write  DevOps Dashboard   
1  user_30 2023-01-23 13:00:00       New York        read             Email   
2  user_50 2023-01-09 13:00:00      Bangalore      upload  DevOps Dashboard   
3   user_1 2023-01-11 18:00:00       New York    download         HR Portal   
4  user_29 2023-01-25 09:00:00         London        read         HR Portal   

  device_type  
0     Desktop  
1     Desktop  
2      Mobile  
3      Mobile  
4     Desktop  


In [5]:
# Print a structural summary of the generated frame: index range, per-column
# non-null counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5050 entries, 0 to 5049
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   user_id         5050 non-null   object        
 1   timestamp       5050 non-null   datetime64[ns]
 2   login_location  5050 non-null   object        
 3   access_type     5050 non-null   object        
 4   resource        5050 non-null   object        
 5   device_type     5050 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 236.8+ KB
