In [1]:
import pandas as pd
# Read in data and display first 5 rows
features = pd.read_csv('rba-dataset.csv')
features.head()

Unnamed: 0,index,Login Timestamp,User ID,Round-Trip Time [ms],IP Address,Country,Region,City,ASN,User Agent String,Browser Name and Version,OS Name and Version,Device Type,Login Successful,Is Attack IP,Is Account Takeover
0,0,2020-02-03 12:43:30.772,-4324475583306591935,,10.0.65.171,NO,-,-,29695,Mozilla/5.0 (iPhone; CPU iPhone OS 13_4 like ...,Firefox 20.0.0.1618,iOS 13.4,mobile,False,False,False
1,1,2020-02-03 12:43:43.549,-4324475583306591935,,194.87.207.6,AU,-,-,60117,Mozilla/5.0 (Linux; Android 4.1; Galaxy Nexus...,Chrome Mobile 46.0.2490,Android 4.1,mobile,False,False,False
2,2,2020-02-03 12:43:55.873,-3284137479262433373,,81.167.144.58,NO,Vestland,Urangsvag,29695,Mozilla/5.0 (iPad; CPU OS 7_1 like Mac OS X) ...,Android 2.3.3.2672,iOS 7.1,mobile,True,False,False
3,3,2020-02-03 12:43:56.180,-4324475583306591935,,170.39.78.152,US,-,-,393398,Mozilla/5.0 (Linux; Android 4.1; Galaxy Nexus...,Chrome Mobile WebView 85.0.4183,Android 4.1,mobile,False,False,False
4,4,2020-02-03 12:43:59.396,-4618854071942621186,,10.0.0.47,US,Virginia,Ashburn,398986,Mozilla/5.0 (Linux; U; Android 2.2) Build/NMA...,Chrome Mobile WebView 85.0.4183,Android 2.2,mobile,False,True,False


## Preprocessing

In [2]:
# rename above columns to snake case
features = features.rename(columns={'Login Timestamp': 'login_timestamp', 'User ID': 'user_id', 'Round-Trip Time [ms]':'round_trip','Region':'region', 'City':'city', 'ASN':'asn', 'IP Address': 'ip_address', 'Country': 'country', 'User Agent String': 'user_agent_string','Device Type': 'device_type', 'Browser Name and Version': 'browser', 'Is Account Takeover':'is_account_takeover', 'OS Name and Version':'os_detail','Login Successful':'is_login_success','Is Attack IP':'is_attack_ip'})

In [5]:
# only keep rows with device type desktop
features = features[features.device_type == 'desktop']
# filter the DataFrame based on the length of column 'user_agent_string'
features = features[features['user_agent_string'].str.len() < 126]
features.shape


(3732298, 16)

In [7]:
df = features
df.to_csv('rba-dataset-renamed-desktop.csv', index=False)

In [6]:
import ipaddress

# Preprocess the data
def preprocess_data(df):
    df_encoded = encode_ip_addresses(df['ip_address'])
    df_encoded.columns = ['ip_' + str(col) for col in df_encoded.columns]
    df = pd.concat([df.drop(columns='ip_address'), df_encoded], axis=1)
    df = preprocess_user_agent(df, 'user_agent_string', 'device_type')
    df = preprocess_timestamp(df, 'login_timestamp')
    # Perform additional preprocessing steps if necessary
    return df

# Encode IP addresses using one-hot encoding
def encode_ip_addresses(ip_addresses):
    encoded_ips = []
    max_prefixlen = 0

    # Find the maximum prefix length among the IP addresses
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        max_prefixlen = max(max_prefixlen, ip_obj.max_prefixlen)

    # Perform one-hot encoding for each IP address
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        ip_binary = bin(int(ip_obj))[2:].zfill(max_prefixlen)
        encoded_ip = [int(bit) for bit in ip_binary]
        encoded_ips.append(encoded_ip)

    encoded_df = pd.DataFrame(encoded_ips)
    return encoded_df

# Preprocess the user agent column
def preprocess_user_agent(df, user_agent_column, device_type):
    df['device_type'] = df[device_type]
    df['operating_system'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[1])
    df['browser'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[2].split(')')[0] if len(x.split('(')) > 1 and len(x.split(';')) > 2 else '')
    df.drop(columns=[user_agent_column], inplace=True)
    return df

# Preprocess the login_timestamp column
def preprocess_timestamp(df, timestamp_column):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    df['hour'] = df[timestamp_column].dt.hour
    df['day_of_week'] = df[timestamp_column].dt.dayofweek
    df['month'] = df[timestamp_column].dt.month
    df.drop(columns=[timestamp_column], inplace=True)
    return df


In [7]:
# df = preprocess_data(df)

In [8]:
features_desktop = features[features['Device Type'] == 'desktop']

KeyError: 'Device Type'

In [None]:
features_desktop.shape

(7934515, 16)

## Feature list: 
1. Login Timestamp
2. Address
3. Country
4. User Agent String
5. Browser Name and Version
6. User ID
7. IP Address
8. Is Attack IP
9. Login Successful

In [None]:
features1k['Login Successful'] = features1k['Login Successful'].map({'False':0,'True':1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features1k['Login Successful'] = features1k['Login Successful'].map({'False':0,'True':1})


In [None]:
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz



In [None]:
features1k['Device Type'] = features1k['Device Type'].map({'mobile':1,'tablet':2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features1k['Device Type'] = features1k['Device Type'].map({'mobile':1,'tablet':2})


In [None]:
# Split the data into features (X) and target (y)
X = features1k.drop(['Login Successful', 'Login Timestamp', 'IP Address', 'Country', 'Region', 'City', 'User Agent String', 'Browser Name and Version', 'OS Name and Version'], axis=1)
y = features1k['Login Successful']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

ValueError: could not convert string to float: 'Mozilla/5.0  (iPad; CPU OS 7_1 like Mac OS X) AppleWebKit/533.1 (KHTML, like Gecko Version/4.0 Mobile Safari/533.1 variation/277457'