In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load your dataset
df = pd.read_csv('/content/captcha_interaction_dataset.csv')


In [3]:
df.head()

Unnamed: 0,Time_Taken,Typing_Speed,Mouse_Movement,User_Type,Unnamed: 4
0,25.309908,3.758782,220,0,
1,25.21119,2.996186,184,0,
2,19.468682,2.127484,223,0,
3,20.736131,3.076847,131,0,
4,10.459342,2.427032,135,0,


In [9]:
df = df.drop(columns=['Unnamed: 4'])


In [10]:
df.columns

Index(['Time_Taken', 'Typing_Speed', 'Mouse_Movement', 'User_Type'], dtype='object')

In [11]:
scaler = MinMaxScaler()

In [12]:
df[['Time_Taken', 'Typing_Speed', 'Mouse_Movement', 'User_Type']] = scaler.fit_transform(df[['Time_Taken', 'Typing_Speed', 'Mouse_Movement', 'User_Type']])

In [8]:
df

Unnamed: 0,Time_Taken,Typing_Speed,Mouse_Movement,User_Type,Unnamed: 4
0,0.832473,0.135261,0.869565,0.0,
1,0.828946,0.076590,0.713043,0.0,
2,0.623792,0.009755,0.882609,0.0,
3,0.669072,0.082795,0.482609,0.0,
4,0.301928,0.032801,0.500000,0.0,
...,...,...,...,...,...
995,0.008269,0.793561,0.034783,1.0,
996,0.041535,0.806150,0.034783,1.0,
997,0.078538,0.885879,0.060870,1.0,
998,0.026309,0.774435,0.065217,1.0,


#### A. Random Noise in Mouse Movement
You can slightly alter the Mouse_Movement column by adding random noise to simulate different user interactions.

In [16]:
def augment_mouse_movement(df):
    augmented_df = df.copy()

    # Add noise to Mouse Movement (jittering)
    noise_factor = 0.05  # Adjust this based on how much noise you want
    augmented_df['Mouse_Movement'] += np.random.uniform(-noise_factor, noise_factor, size=len(df))

    # Clip values to ensure they remain within the range [0, 1]
    augmented_df['Mouse_Movement'] = np.clip(augmented_df['Mouse_Movement'], 0, 1)

    return augmented_df


#### B. Random Changes in Typing Speed
You can also add variations to the Typing_Speed by slightly changing the value.

In [17]:
def augment_typing_speed(df):
    augmented_df = df.copy()

    # Add random variation to Typing Speed
    augmented_df['Typing_Speed'] += np.random.uniform(-0.1, 0.1, size=len(df))  # +/-10% variation

    # Clip values to ensure they stay in the valid range
    augmented_df['Typing_Speed'] = np.clip(augmented_df['Typing_Speed'], 0, 1)

    return augmented_df


#### C. Random Changes in Time Taken
Introduce slight variations in the Time_Taken column to simulate real delays.

In [18]:
def augment_time_taken(df):
    augmented_df = df.copy()

    # Add random variation to Time Taken (within +/-10% range)
    augmented_df['Time_Taken'] += np.random.uniform(-0.1, 0.1, size=len(df))

    # Clip values to ensure they stay in the valid range
    augmented_df['Time_Taken'] = np.clip(augmented_df['Time_Taken'], 0, 1)

    return augmented_df


#### D. Simulate Random Human-like Hesitations
Humans tend to pause or hesitate while interacting. You can simulate this by adding random delays.

In [19]:
def simulate_human_hesitation(df):
    augmented_df = df.copy()

    # Add random hesitation (delays in actions)
    augmented_df['Hesitation'] = np.random.uniform(0.1, 0.5, size=len(df))  # Random hesitation time

    return augmented_df


In [20]:
def generate_multiple_augmented_data(df, n=50):  # n = how many times to repeat/augment
    augmented_data = df.copy()

    for i in range(n-1):  # Already have 1 copy, so repeat n-1 times
        augmented_data = pd.concat([augmented_data, augment_mouse_movement(df), augment_typing_speed(df), augment_time_taken(df)], ignore_index=True)

    return augmented_data

# Generate 50k rows by augmenting 1000 original rows
augmented_df = generate_multiple_augmented_data(df, 50)

# Check the size of the new augmented data
augmented_df.shape  # Should be around 50,000 rows


(148000, 4)

In [21]:
df

Unnamed: 0,Time_Taken,Typing_Speed,Mouse_Movement,User_Type
0,0.832473,0.135261,0.869565,0.0
1,0.828946,0.076590,0.713043,0.0
2,0.623792,0.009755,0.882609,0.0
3,0.669072,0.082795,0.482609,0.0
4,0.301928,0.032801,0.500000,0.0
...,...,...,...,...
995,0.008269,0.793561,0.034783,1.0
996,0.041535,0.806150,0.034783,1.0
997,0.078538,0.885879,0.060870,1.0
998,0.026309,0.774435,0.065217,1.0


In [23]:
augmented_df

Unnamed: 0,Time_Taken,Typing_Speed,Mouse_Movement,User_Type
0,0.832473,0.135261,0.869565,0.0
1,0.828946,0.076590,0.713043,0.0
2,0.623792,0.009755,0.882609,0.0
3,0.669072,0.082795,0.482609,0.0
4,0.301928,0.032801,0.500000,0.0
...,...,...,...,...
147995,0.000000,0.793561,0.034783,1.0
147996,0.014187,0.806150,0.034783,1.0
147997,0.000000,0.885879,0.060870,1.0
147998,0.000000,0.774435,0.065217,1.0



### What is Feature Engineering?
**Feature engineering** refers to the process of selecting, modifying, or creating new features (or columns) from your raw data to improve the performance of machine learning models. This process often involves:
1. **Selecting relevant features** that help the model perform well.
2. **Creating new features** from existing data based on domain knowledge or intuition.
3. **Transforming features** to improve their usefulness (e.g., normalizing, scaling, or adding new calculated columns).

In your case, adding a **'Delay'** column based on typing speed, mouse movement, and user type can provide additional information that helps the model identify human-like interactions versus bot behavior.

### Adding the 'Delay' Column (Feature Engineering Example)

Let’s go step by step to add this **'Delay'** feature based on the following logic:
- For **humans**, you can assume more delay if the typing speed is slower and mouse movement is more erratic.
- For **bots**, the delay would be less as their actions are often quicker and more mechanical.

Here’s how to implement this in code:

### Step 1: Define the Logic for Delay
We’ll add a **'Delay'** column where:
- **For humans (User_Type = 0)**: If **typing speed** is slow and **mouse movement** is large, the **delay** will be longer.
- **For bots (User_Type = 1)**: The delay will be smaller regardless of the other features.

### Explanation:
- **Human Delay**: For humans, we add more delay when:
  - Typing speed is **low** (`1 - Typing_Speed` makes slow typing speed have higher values).
  - Mouse movement is **more erratic** (large values).
  
- **Bot Delay**: For bots, we add less delay when:
  - Typing speed is **high** (small value of `1 - Typing_Speed`).
  - Mouse movement is **small**.

The final formula for **delay** is a combination of these features, with human delays being larger and bot delays smaller. The coefficients (e.g., `0.2` and `0.1`) can be adjusted based on the impact of each feature.

---

### Step 3: Use the New 'Delay' Feature in Your Model

After adding the **'Delay'** feature, you can now use it as an additional input for your machine learning model. It might help distinguish between human and bot behavior more effectively, especially since bots often exhibit more predictable and faster behavior compared to humans.

---

### Summary:
**Feature engineering** is crucial in improving machine learning models by making the data more informative. In your case, the **'Delay'** column provides new insights into the interaction patterns of users, which can help improve the prediction of human vs. bot behavior.


In [39]:
import pandas as pd
import numpy as np

def add_delay_feature(df):
    augmented_df = df.copy()

    # Ensure User_Type is numeric (0 for human, 1 for bot) for proper condition matching
    if augmented_df['User_Type'].dtype == 'O':  # 'O' means object type (string)
        augmented_df['User_Type'] = augmented_df['User_Type'].replace({'human': 0, 'bot': 1}).astype(int)

    # Create a new 'Delay' column without initializing it to 0
    augmented_df['Delay'] = np.nan  # Use NaN to avoid overwriting

    # Assign Delay values based on User_Type
    augmented_df.loc[augmented_df['User_Type'] == 0, 'Delay'] = (
        0.2 * (1 - augmented_df['Typing_Speed']) + 0.1 * augmented_df['Mouse_Movement']
    )

    augmented_df.loc[augmented_df['User_Type'] == 1, 'Delay'] = (
        0.1 * (1 - augmented_df['Typing_Speed']) + 0.05 * augmented_df['Mouse_Movement']
    )

    return augmented_df


In [40]:
# Call the function to add the 'Delay' feature
augmented_df = add_delay_feature(augmented_df)


  augmented_df['User_Type'] = augmented_df['User_Type'].replace({'human': 0, 'bot': 1}).astype(int)


Modify the Target Column (0/1 to 'bot'/'human')

In [41]:
# Replace target column values (0 -> 'human', 1 -> 'bot')
augmented_df['User_Type'] = augmented_df['User_Type'].replace({0: 'human', 1: 'bot'})


In [45]:
augmented_df

Unnamed: 0,Time_Taken,Typing_Speed,Mouse_Movement,User_Type,Delay
0,0.832473,0.135261,0.869565,human,0.259904
1,0.828946,0.076590,0.713043,human,0.255986
2,0.623792,0.009755,0.882609,human,0.286310
3,0.669072,0.082795,0.482609,human,0.231702
4,0.301928,0.032801,0.500000,human,0.243440
...,...,...,...,...,...
147995,0.000000,0.793561,0.034783,bot,0.022383
147996,0.014187,0.806150,0.034783,bot,0.021124
147997,0.000000,0.885879,0.060870,bot,0.014456
147998,0.000000,0.774435,0.065217,bot,0.025817


In [44]:
augmented_df.sample(10)

Unnamed: 0,Time_Taken,Typing_Speed,Mouse_Movement,User_Type,Delay
21680,0.036633,0.91607,0.126087,bot,0.014697
40103,0.482595,0.063392,0.896326,human,0.276954
31686,0.100645,0.93199,0.020688,bot,0.007835
54849,0.119403,0.977533,0.095652,human,0.014059
62083,0.897101,0.170902,0.408696,human,0.206689
132175,0.828358,0.069714,0.773913,human,0.263449
13614,0.062711,0.649523,0.100467,bot,0.040071
68456,0.454521,0.003669,0.934783,human,0.292745
9805,0.146835,0.648246,0.017391,bot,0.036045
132077,0.489297,0.152792,0.782609,human,0.247702


In [47]:
augmented_df.to_csv('captcha_interactions_featured.csv', index=False)
print("Dataset saved successfully!")

Dataset saved successfully!
