<a href="https://colab.research.google.com/github/scorzo/generate-dataset/blob/main/faker_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install faker


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import pandas as pd
import random
from faker import Faker

# Faker supplies the synthetic issue IDs and timestamps
fake = Faker()

# The tokenizer and the language model are loaded from the same
# pre-trained GPT-2 checkpoint
GPT2_CHECKPOINT = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)

# Function to generate text using GPT-2
def generate_text(prompt, max_length=100, do_sample=True):
    """Generate a GPT-2 continuation of ``prompt`` and return only the new text.

    Args:
        prompt: Text for the model to continue.
        max_length: Maximum number of tokens to generate *after* the prompt.
            The budget deliberately excludes the prompt: callers feed earlier
            generations back in as part of the next prompt, and a budget that
            counted the prompt would leave little or no room for new text
            (or make ``generate`` reject the call outright).
        do_sample: When True (default), sample from the model's distribution
            so repeated calls with the same prompt give different text.
            Pass False for deterministic greedy decoding.

    Returns:
        The generated continuation, without the prompt, stripped of leading
        and trailing whitespace.
    """
    # Calling the tokenizer (rather than ``encode``) also yields the
    # attention mask that ``generate`` expects.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    prompt_token_count = input_ids.shape[-1]

    outputs = model.generate(
        input_ids,
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_length,
        do_sample=do_sample,
        num_return_sequences=1,
        # GPT-2 defines no pad token; reuse EOS explicitly so generate()
        # does not warn about it on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt at the token level. Slicing the decoded string by
    # len(prompt) is only correct if decoding reproduces the prompt
    # character for character, which tokenizer clean-up does not guarantee.
    new_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Support categories, each mapped to the seed phrases used to prompt GPT-2
categories = {
    "Server Downtime": [
        "diagnosing server crash",
        "resolving server unresponsiveness",
    ],
    "Database Connectivity Issues": [
        "fixing database connection errors",
        "optimizing slow database queries",
    ],
    "SSL/TLS Certificate Problems": [
        "renewing SSL certificate",
        "solving SSL handshake failures",
    ],
}

# Build the synthetic support records one at a time
category_names = list(categories)
records = []
for _ in range(3000):
    # Pick a category, then one of that category's seed phrases
    category = random.choice(category_names)
    seed = random.choice(categories[category])

    # The summary comes first because both follow-up prompts embed it
    summary_prompt = f"Summarize a typical issue related to {seed}"
    issue_summary = generate_text(summary_prompt)

    steps_prompt = f"Steps to reproduce the issue: {issue_summary}"
    steps = generate_text(steps_prompt)

    resolution_prompt = f"Propose a resolution for the issue: {issue_summary}"
    resolution = generate_text(resolution_prompt)

    # Faker provides the identifier and a timestamp from the current year
    issue_id = fake.unique.uuid4()
    created_at = fake.date_time_this_year()

    records.append(
        {
            "issue_id": issue_id,
            "date_and_time": created_at.strftime("%Y-%m-%d %H:%M:%S"),
            "category": category,
            "issue_summary": issue_summary,
            "steps_to_reproduce": steps,
            "resolution": resolution,
        }
    )

# One row per record, one column per dictionary key
df = pd.DataFrame(records)

# Preview the first rows as the cell output
df.head()


In [None]:
# Output location for the CSV file on the Google Colab filesystem
csv_file_path = '/content/synthetic_technical_support_data.csv'

# Save the dataset; index=False leaves the DataFrame's row index out of the file
df.to_csv(path_or_buf=csv_file_path, index=False)

# Confirm where the file was saved
message = f"Data written to {csv_file_path}"
print(message)
