In [18]:
import os
import json
import re 

def read_json_files(directory):
    json_data = []
    
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r') as json_file:
                data = json.load(json_file)
                data['is_scam'] = True if 'scammer' in filename else False
                json_data.append(data)
    
    return json_data

# Example usage:
directory_path = 'generated_conversations/'
all_json_data = read_json_files(directory_path)

def extract_substring(input_string):
    try:
        start_index = input_string.index('[')
        end_index = input_string.rindex('}')
        if start_index < end_index:
            return input_string[start_index:end_index + 1]
        else:
            return "Invalid string: '[' occurs after '}'"
    except ValueError:
        return "Invalid string: Missing '[' or '}'"


def clean_and_extract_json_arrays(data):
    # Remove all lines containing '### Day' followed by numbers and any other unrelated text
    cleaned_data = re.sub(r'### Day \d+.*```json', '', data, flags=re.DOTALL)
    cleaned_data = re.sub(r'```\s*$', '', cleaned_data, flags=re.MULTILINE)

    # Find all JSON arrays in the cleaned data
    json_arrays = re.findall(r'\[\s*{.*?}\s*]', cleaned_data, flags=re.DOTALL)
    
    # Parse each JSON array
    parsed_json = []
    for json_array in json_arrays:
        try:
            parsed_json.extend(json.loads(json_array))
        except json.JSONDecodeError as e:
            print(f"JSON decoding failed: {e}")

    return parsed_json


In [19]:

import random

# Possible values for each field
countries = ["US", "GB", "JP", "NL", "FR", "DE", "CA"]
regions = ["California", "New York", "England", "Tokyo", "North Holland", "le-de-France", "Ontario"]
cities = ["Los Angeles", "New York", "London", "Tokyo", "Amsterdam", "Paris", "Toronto"]
isps = ["Vodafone UK", "Psychz Networks", "Datacamp", "NordVPN", "Mullvad VPN", "Comcast Cable"]
organizations = ["Vodafone UK", "Psychz Networks", "Datacamp", "NordVPN", "Mullvad VPN", "Comcast Cable"]
timezones = ["Europe/London", "America/Los_Angeles", "Asia/Tokyo", "Europe/Amsterdam", "Europe/Paris"]
zip_codes = ["N/A", "90210", "10001", "W1A 1AA", "75001"]
latitude_range = (30.0, 60.0)
longitude_range = (-130.0, 30.0)

# Random IP address generator
def generate_random_ip():
    return '.'.join(str(random.randint(0, 255)) for _ in range(4))


def generate_probabilistic_value(probability):
    return 1 if random.random() < probability else 0


def generate_random_ip_info(risk_level='low'):
    fraud_score = random.randint(0, 70) if risk_level == 'low' else random.randint(30, 100)
    proxy_probability = 0.3 if risk_level == 'low' else 0.7
    vpn_probability = 0.3 if risk_level == 'low' else 0.7
    recent_abuse_probability = 0.2 if risk_level == 'low' else 0.8
    bot_status_probability = 0.2 if risk_level == 'low' else 0.8

    return {
        "fraud_score": fraud_score,
        "country_code": random.choice(countries),
        "region": random.choice(regions),
        "city": random.choice(cities),
        "ISP": random.choice(isps),
        "ASN": random.randint(1000, 60000),
        "organization": random.choice(organizations),
        "is_crawler": 0,
        "timezone": random.choice(timezones),
        "mobile": random.randint(0, 1),
        "host": generate_random_ip(),
        "proxy": generate_probabilistic_value(proxy_probability),
        "vpn": generate_probabilistic_value(vpn_probability),
        "tor": generate_probabilistic_value(0.05),  # Low probability for TOR usage
        "active_vpn": generate_probabilistic_value(vpn_probability),
        "active_tor": generate_probabilistic_value(0.05),  # Low probability for active TOR usage
        "recent_abuse": generate_probabilistic_value(recent_abuse_probability),
        "bot_status": generate_probabilistic_value(bot_status_probability),
        "zip_code": random.choice(zip_codes),
        "latitude": round(random.uniform(*latitude_range), 2),
        "longitude": round(random.uniform(*longitude_range), 2),
        "IP": generate_random_ip()
    }

In [20]:
results = []

for json_string in all_json_data:
    stuff = clean_and_extract_json_arrays(extract_substring(json_string['chat_history']) + ']')
    for msg in stuff:
        list_of_keys = set(msg.keys())
        for key in list_of_keys:
            if key not in ['name', 'timestamp', 'chat']:
                msg['chat'] = msg[key]
                del msg[key] 
    json_string['chat_history'] = stuff
    if stuff: 
        if 'ip_info' not in json_string['persona1_bio']:
            json_string['persona1_bio']['ip_info'] = generate_random_ip_info(risk_level='low')
            json_string['persona2_bio']['ip_info'] = generate_random_ip_info(risk_level='low')
            if json_string['is_scam']:
                json_string['persona2_bio']['ip_info'] = generate_random_ip_info(risk_level='high')
        results.append(json_string)

JSON decoding failed: Expecting property name enclosed in double quotes: line 96 column 3 (char 4409)
JSON decoding failed: Expecting ',' delimiter: line 30 column 69 (char 1315)
JSON decoding failed: Expecting value: line 103 column 5 (char 5412)
JSON decoding failed: Expecting ',' delimiter: line 16 column 5 (char 2706)
JSON decoding failed: Invalid control character at: line 40 column 138 (char 1810)
JSON decoding failed: Invalid control character at: line 90 column 128 (char 3868)
JSON decoding failed: Expecting ',' delimiter: line 19 column 5 (char 2795)
JSON decoding failed: Expecting value: line 38 column 13 (char 1649)
JSON decoding failed: Expecting ',' delimiter: line 110 column 6 (char 4655)
JSON decoding failed: Invalid control character at: line 168 column 30 (char 7245)
JSON decoding failed: Invalid control character at: line 16 column 182 (char 2821)
JSON decoding failed: Expecting value: line 52 column 5 (char 2473)
JSON decoding failed: Expecting ',' delimiter: line 10

In [21]:
def save_to_json_file(data, filename):
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

In [22]:
save_to_json_file(results, 'synthetic_full.json')