In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample dataset
data = {
    'ip_address': ['192.168.0.1', '10.0.0.1', '172.16.0.1'],
    'timestamp': ['2023-05-01 09:30:00', '2023-05-02 15:45:00', '2023-05-03 11:20:00'],
    'user_agent': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1'
                   ],
    'access_token': ['eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkRvZSBNYXJrIiwiaWF0IjoxNTE2MjM5MDIyLCJleHAiOjE1MTYyMzkwMjJ9._vWrkmT3Dn29zO5Wq5J9gWUEjV8_eycAbj-xGdpO3Fc',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IldvbGZCb3giLCJpYXQiOjE1MTYyMzkwMjIsImV4cCI6MTUxNjIzOTAyMn0.0-1MqoQXCrkAB6S8cNExx0ULPOWgMi4rhvTXeJu3pSo'
                   ],
    'username': ['user1', 'user2', 'user3'],
    'login_successful': [True, False, True]
}

df = pd.DataFrame(data)

import ipaddress

# Preprocess the data
def preprocess_data(df):
    df_encoded = encode_ip_addresses(df['ip_address'])
    df_encoded.columns = ['ip_' + str(col) for col in df_encoded.columns]
    df = pd.concat([df.drop(columns='ip_address'), df_encoded], axis=1)
    df = preprocess_user_agent(df, 'user_agent_string')
    df = preprocess_timestamp(df, 'login_timestamp')
    # Perform additional preprocessing steps if necessary
    return df

# Encode IP addresses using one-hot encoding
def encode_ip_addresses(ip_addresses):
    encoded_ips = []
    max_prefixlen = 0

    # Find the maximum prefix length among the IP addresses
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        max_prefixlen = max(max_prefixlen, ip_obj.max_prefixlen)

    # Perform one-hot encoding for each IP address
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        ip_binary = bin(int(ip_obj))[2:].zfill(max_prefixlen)
        encoded_ip = [int(bit) for bit in ip_binary]
        encoded_ips.append(encoded_ip)

    encoded_df = pd.DataFrame(encoded_ips)
    return encoded_df

# Preprocess the user agent column
def preprocess_user_agent(df, user_agent_column):
    df['device_type'] = 'desktop'
    df['operating_system'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[1] if isinstance(x, str) and len(x.split('(')) > 1 and len(x.split(';')) > 1 else 'unknown')
    df['browser'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[2].split(')')[0] if isinstance(x, str) and len(x.split('(')) > 1 and len(x.split(';')) > 2 else 'unknown')
    df.drop(columns=[user_agent_column], inplace=True)
    return df

# Preprocess the login_timestamp column
def preprocess_timestamp(df, timestamp_column):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    df['hour'] = df[timestamp_column].dt.hour
    df['day_of_week'] = df[timestamp_column].dt.dayofweek
    df['month'] = df[timestamp_column].dt.month
    df.drop(columns=[timestamp_column], inplace=True)
    return df

df = preprocess_data(df)

# Prepare the features and target
features = df.drop(columns='login_successful')
target = df['login_successful']

# One-hot encode the categorical features
features_encoded = pd.get_dummies(features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


NameError: name 'ipaddress' is not defined

In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Prepare the data
data = {
    'ip_address': ['192.168.0.1', '10.0.0.1', '172.16.0.1'],
    'timestamp': ['2023-05-01 09:30:00', '2023-05-02 15:45:00', '2023-05-03 11:20:00'],
    'user_agent': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1'
                   ],
    'access_token': ['eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkRvZSBNYXJrIiwiaWF0IjoxNTE2MjM5MDIyLCJleHAiOjE1MTYyMzkwMjJ9._vWrkmT3Dn29zO5Wq5J9gWUEjV8_eycAbj-xGdpO3Fc',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IldvbGZCb3giLCJpYXQiOjE1MTYyMzkwMjIsImV4cCI6MTUxNjIzOTAyMn0.0-1MqoQXCrkAB6S8cNExx0ULPOWgMi4rhvTXeJu3pSo'
                   ],
    'login_successful': [True, False, True]
}

df = pd.DataFrame(data)

# Preprocess the data
def preprocess_data(df):
    df_encoded = encode_ip_addresses(df['ip_address'])
    df_encoded.columns = ['ip_' + str(col) for col in df_encoded.columns]
    df = pd.concat([df.drop(columns='ip_address'), df_encoded], axis=1)
    df = preprocess_user_agent(df, 'user_agent')
    df = preprocess_timestamp(df, 'timestamp')
    df = preprocess_access_token(df, 'access_token')
    # Perform additional preprocessing steps if necessary
    return df

# Encode IP addresses using one-hot encoding
def encode_ip_addresses(ip_addresses):
    encoded_ips = []
    max_prefixlen = 0

    # Find the maximum prefix length among the IP addresses
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        max_prefixlen = max(max_prefixlen, ip_obj.max_prefixlen)

    # Perform one-hot encoding for each IP address
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        ip_binary = bin(int(ip_obj))[2:].zfill(max_prefixlen)
        encoded_ip = [int(bit) for bit in ip_binary]
        encoded_ips.append(encoded_ip)

    encoded_df = pd.DataFrame(encoded_ips)
    return encoded_df

# Preprocess the user agent column
def preprocess_user_agent(df, user_agent_column):
    df['device_type'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[0])
    df['operating_system'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[1])
    df['browser'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[2].split(')')[0] if len(x.split('(')) > 1 and len(x.split(';')) > 2 else '')
    df.drop(columns=[user_agent_column], inplace=True)
    return df

# Preprocess the timestamp column
def preprocess_timestamp(df, timestamp_column):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    df['hour'] = df[timestamp_column].dt.hour
    df['day_of_week'] = df[timestamp_column].dt.dayofweek
    df['month'] = df[timestamp_column].dt.month
    df.drop(columns=[timestamp_column], inplace=True)
    return df

def preprocess_access_token(df, access_token_column):
    # Extract relevant features from the access token
    df['token_length'] = df[access_token_column].apply(lambda x: len(x))
    # Add more relevant features based on the specific JWT structure or requirements

    # Drop the original access token column
    df.drop(columns=[access_token_column], inplace=True)

    return df

df = preprocess_data(df)

# Prepare the features and target
features = df.drop(columns='login_successful')
target = df['login_successful']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


ValueError: could not convert string to float: 'Macintosh'

In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def preprocess_user_agent(df, user_agent_column):
    df['device_type'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[0])
    df['operating_system'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[1])
    df['browser'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[2].split(')')[0] if len(x.split('(')) > 1 and len(x.split(';')) > 2 else '')
    df.drop(columns=[user_agent_column], inplace=True)
    return df

# Example usage
data = {
    'user_agent': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
                   ]
}
df = pd.DataFrame(data)

# Preprocess the User-Agent column
df_preprocessed = preprocess_user_agent(df, 'user_agent')

# Perform one-hot encoding for categorical features
encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df_preprocessed[['device_type', 'operating_system', 'browser']])

# Create a new DataFrame with the encoded features
df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names(['device_type', 'operating_system', 'browser']))
df_final = pd.concat([df_preprocessed, df_encoded], axis=1)
df_final.drop(columns=['device_type', 'operating_system', 'browser'], inplace=True)

print(df_final)


   device_type_Macintosh  device_type_Windows NT 10.0  device_type_iPhone  \
0                    0.0                          1.0                 0.0   
1                    1.0                          0.0                 0.0   
2                    0.0                          0.0                 1.0   
3                    0.0                          1.0                 0.0   
4                    1.0                          0.0                 0.0   
5                    0.0                          0.0                 1.0   
6                    0.0                          1.0                 0.0   
7                    1.0                          0.0                 0.0   
8                    0.0                          0.0                 1.0   
9                    0.0                          1.0                 0.0   

   operating_system_ CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15   \
0                                                0.0                    



In [23]:
import pandas as pd

def preprocess_access_token(df, access_token_column):
    # Extract relevant features from the access token
    df['token_length'] = df[access_token_column].apply(lambda x: len(x))
    df['has_expiration'] = df[access_token_column].apply(lambda x: 'exp' in x)
    # Add more relevant features based on the specific JWT structure or requirements

    # Drop the original access token column
    df.drop(columns=[access_token_column], inplace=True)

    return df

# Example usage
data = {
        'access_token': ['eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkRvZSBNYXJrIiwiaWF0IjoxNTE2MjM5MDIyLCJleHAiOjE1MTYyMzkwMjJ9._vWrkmT3Dn29zO5Wq5J9gWUEjV8_eycAbj-xGdpO3Fc',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IldvbGZCb3giLCJpYXQiOjE1MTYyMzkwMjIsImV4cCI6MTUxNjIzOTAyMn0.0-1MqoQXCrkAB6S8cNExx0ULPOWgMi4rhvTXeJu3pSo',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkRvZSBNYXJrIiwiaWF0IjoxNTE2MjM5MDIyLCJleHAiOjE1MTYyMzkwMjJ9._vWrkmT3Dn29zO5Wq5J9gWUEjV8_eycAbj-xGdpO3Fc',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IldvbGZCb3giLCJpYXQiOjE1MTYyMzkwMjIsImV4cCI6MTUxNjIzOTAyMn0.0-1MqoQXCrkAB6S8cNExx0ULPOWgMi4rhvTXeJu3pSo',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkRvZSBNYXJrIiwiaWF0IjoxNTE2MjM5MDIyLCJleHAiOjE1MTYyMzkwMjJ9._vWrkmT3Dn29zO5Wq5J9gWUEjV8_eycAbj-xGdpO3Fc',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IldvbGZCb3giLCJpYXQiOjE1MTYyMzkwMjIsImV4cCI6MTUxNjIzOTAyMn0.0-1MqoQXCrkAB6S8cNExx0ULPOWgMi4rhvTXeJu3pSo',
                     'eyJraWQiOiJacEdwVlZ5ZE9wZ0hpems1a0FOdExPZndzeWhOQVhzQUlBVk1YQXpreDhrPSIsImFsZyI6IlJTMjU2In0.eyJzdWIiOiIyaGNoZzhjdTg3ZTJybGdvZDdudnFiZ2ZrYSIsInRva2VuX3VzZSI6ImFjY2VzcyIsInNjb3BlIjoiYWZmaWxpYXRlLXVzZXItcG9vbFwvYnVzIiwiYXV0aF90aW1lIjoxNjg5MTY3MTU5LCJpc3MiOiJodHRwczpcL1wvY29nbml0by1pZHAuYXAtc291dGhlYXN0LTEuYW1hem9uYXdzLmNvbVwvYXAtc291dGhlYXN0LTFfS2VneDRiNmxTIiwiZXhwIjoxNjg5MTcwNzU5LCJpYXQiOjE2ODkxNjcxNTksInZlcnNpb24iOjIsImp0aSI6Ijg0OTFkMTVhLTIzYmItNDUxNy05M2FjLThjYTU5MzYxZjlhOCIsImNsaWVudF9pZCI6IjJoY2hnOGN1ODdlMnJsZ29kN252cWJnZmthIn0.Hxv--gPei131IKFfkI9GNrZibo4mxt0S62KNX0OBJD3ocertG-sme28JCH4fuO4mf6rnbIgbSw7CZD5Ke6rDFPqOXsaggqkSPUaEV_rlcqvN9-rZ2g8-YlX0sbPWjwxYreiUDfclSSJ6PkPKfDWnSRk3srXe_7rmUsvsZHONXmw1of_J3cHRgdBFt36NNy7pvq3SJQ7tmH1JOUIiFleuSc1uTcgE_M_HkGZgb4tp8zNxgYBuCWIQfsqyV_0L6zcxL7sWbR96_mF1aA_I7WqD6Cl6jRJl1mTWwjwtreVphqV6OUIZSeWX6fNkiCl_YdGamjvn24FmwC0RI6XpsWORKQ'
                   ]
}
df = pd.DataFrame(data)

# Preprocess the access token column
df_preprocessed = preprocess_access_token(df, 'access_token')
print(df_preprocessed)


   token_length  has_expiration
0           155           False
1           177           False
2           176           False
3           155           False
4           177           False
5           176           False
6           155           False
7           177           False
8           176           False
9           878           False


In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Prepare the data
data = {
    'ip_address': ['192.168.0.1', '10.0.0.1', '172.16.0.1'],
    'timestamp': ['2023-05-01 09:30:00', '2023-05-02 15:45:00', '2023-05-03 11:20:00'],
    'user_agent': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
                   'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1'
                   ],
    'access_token': ['eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkRvZSBNYXJrIiwiaWF0IjoxNTE2MjM5MDIyLCJleHAiOjE1MTYyMzkwMjJ9._vWrkmT3Dn29zO5Wq5J9gWUEjV8_eycAbj-xGdpO3Fc',
                     'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IldvbGZCb3giLCJpYXQiOjE1MTYyMzkwMjIsImV4cCI6MTUxNjIzOTAyMn0.0-1MqoQXCrkAB6S8cNExx0ULPOWgMi4rhvTXeJu3pSo'
                   ],
    'login_successful': [True, False, True]
}

df = pd.DataFrame(data)

# Preprocess the data
def preprocess_data(df):
    df_encoded = encode_ip_addresses(df['ip_address'])
    df_encoded.columns = ['ip_' + str(col) for col in df_encoded.columns]
    df = pd.concat([df.drop(columns='ip_address'), df_encoded], axis=1)
    df = preprocess_user_agent(df, 'user_agent')
    df = preprocess_timestamp(df, 'timestamp')
    # Perform additional preprocessing steps if necessary
    return df

# Encode IP addresses using one-hot encoding
def encode_ip_addresses(ip_addresses):
    encoded_ips = []
    max_prefixlen = 0

    # Find the maximum prefix length among the IP addresses
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        max_prefixlen = max(max_prefixlen, ip_obj.max_prefixlen)

    # Perform one-hot encoding for each IP address
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        ip_binary = bin(int(ip_obj))[2:].zfill(max_prefixlen)
        encoded_ip = [int(bit) for bit in ip_binary]
        encoded_ips.append(encoded_ip)

    encoded_df = pd.DataFrame(encoded_ips)
    return encoded_df

# Preprocess the user agent column
def preprocess_user_agent(df, user_agent_column):
    df['device_type'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[0])
    df['operating_system'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[1])
    df['browser'] = df[user_agent_column].apply(lambda x: x.split('(')[1].split(';')[2].split(')')[0] if len(x.split('(')) > 1 and len(x.split(';')) > 2 else '')
    df.drop(columns=[user_agent_column], inplace=True)
    return df

# Preprocess the timestamp column
def preprocess_timestamp(df, timestamp_column):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    df['hour'] = df[timestamp_column].dt.hour
    df['day_of_week'] = df[timestamp_column].dt.dayofweek
    df['month'] = df[timestamp_column].dt.month
    df.drop(columns=[timestamp_column], inplace=True)
    return df

df = preprocess_data(df)

# Prepare the features and target
features = df.drop(columns='login_successful')
target = df['login_successful']

# One-hot encode the categorical features
features_encoded = pd.get_dummies(features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 1.0


In [7]:
import pandas as pd

def preprocess_timestamp(df, timestamp_column):
    # Convert timestamp column to datetime format
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])

    # Extract relevant features from the timestamp
    df['hour'] = df[timestamp_column].dt.hour
    df['day_of_week'] = df[timestamp_column].dt.dayofweek
    df['month'] = df[timestamp_column].dt.month

    # Drop the original timestamp column
    df.drop(columns=[timestamp_column], inplace=True)

    return df

# Example usage
data = {
    'timestamp': ['2020-02-03 12:43:55.873', '2023-05-02 15:45:00', '2023-05-03 11:20:00'],
    'value': [10, 20, 15]
}
df = pd.DataFrame(data)

# Preprocess the timestamp column
df_preprocessed = preprocess_timestamp(df, 'timestamp')
print(df_preprocessed)


   value  hour  day_of_week  month
0     10    12            0      2
1     20    15            1      5
2     15    11            2      5


In [8]:
import numpy as np
import ipaddress

def encode_ip_addresses(ip_addresses):
    encoded_ips = []
    max_prefixlen = 0

    # Find the maximum prefix length among the IP addresses
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        max_prefixlen = max(max_prefixlen, ip_obj.max_prefixlen)

    # Perform one-hot encoding for each IP address
    for ip in ip_addresses:
        ip_obj = ipaddress.ip_address(ip)
        ip_binary = bin(int(ip_obj))[2:].zfill(max_prefixlen)
        encoded_ip = np.zeros(max_prefixlen)
        for i, bit in enumerate(ip_binary):
            encoded_ip[i] = int(bit)
        encoded_ips.append(encoded_ip)

    return encoded_ips

# Example usage
ip_addresses = ["192.168.0.1", "10.0.0.1", "172.16.0.1"]
encoded_ips = encode_ip_addresses(ip_addresses)
for encoded_ip in encoded_ips:
    print(encoded_ip)


[1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1.]
[1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1.]


In [24]:
import pandas as pd

# Sample dataset
data = {
    'username': ['user1', 'user2', 'user3', 'user1', 'user2', 'user3'],
    'feature1': [1, 2, 3, 4, 5, 6],
    'feature2': [7, 8, 9, 10, 11, 12],
    'login_successful': [True, False, True, True, False, True]
}

df = pd.DataFrame(data)

# Perform one-hot encoding for the username feature
df_encoded = pd.get_dummies(df['username'], prefix='username')

# Concatenate the encoded username columns with the original dataframe
df = pd.concat([df.drop(columns='username'), df_encoded], axis=1)

print(df)

   feature1  feature2  login_successful  username_user1  username_user2  \
0         1         7              True               1               0   
1         2         8             False               0               1   
2         3         9              True               0               0   
3         4        10              True               1               0   
4         5        11             False               0               1   
5         6        12              True               0               0   

   username_user3  
0               0  
1               0  
2               1  
3               0  
4               0  
5               1  
