In [39]:
import json
import os
from itertools import islice

import pandas as pd

# Folder that holds the generated data files
folder_path = "dataset"

# Text label -> binary target
label_map = {'actionable': 1, 'non-actionable': 0}

# Collect one DataFrame per file and concatenate once after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies every
# previously loaded row on each pass.
frames = []

# Loop through the 6 files (parsed as JSON despite the .txt extension)
for i in range(1, 7):
    file_path = os.path.join(folder_path, f"gemini-data-gen-{i}.txt")

    # Load the JSON list
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Convert label to 1/0 (case-insensitive match on the label text)
    raw_labels = df['label']
    df['label'] = raw_labels.str.lower().map(label_map)

    # .map() yields NaN for any label outside label_map; fail loudly here
    # rather than letting NaN targets leak into the saved dataset.
    unmapped = df['label'].isna()
    if unmapped.any():
        raise ValueError(
            f"{file_path}: unrecognised label value(s): "
            f"{sorted(set(map(str, raw_labels[unmapped])))}"
        )

    frames.append(df)

# Single concatenation with a fresh 0..n-1 index
combined1 = pd.concat(frames, ignore_index=True)



In [40]:
# Optional: Shuffle the data.
# A fixed seed keeps the row order, and therefore every downstream preview and
# the saved CSV, reproducible across kernel restarts.
combined1 = combined1.sample(frac=1, random_state=42).reset_index(drop=True)

In [41]:
# Preview the first rows of the frame built from the gemini-data-gen files.
# (The previous name `combined_df` is not defined by any cell shown in this
# notebook and would raise NameError on a fresh kernel.)
combined1.head()

In [42]:
# Peek at the last five rows
combined1.tail(5)

Unnamed: 0,text,label
1356,"The old office building had a certain charm, I...",0
1357,Can you find me a simple recipe for egg curry?,1
1358,Purchase new stationery supplies for the kids ...,1
1359,"Alexa, start a 5-minute meditation session.",1
1360,"Prepare some hot pakoras, it's raining heavily...",1


In [43]:
# Class balance of the first set (1 = actionable, 0 = non-actionable)
combined1["label"].value_counts()

1    691
0    670
Name: label, dtype: int64

In [44]:
# --------- Load JSON file (label 1) ---------
# Decode as UTF-8 explicitly, like the other loaders in this notebook; without
# it open() uses the platform's locale encoding and the result can differ
# between machines.
with open("dataset/binary_undersample.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Take every "train" entry plus the first 750 "val" entries
# (the original note expects 1250 + 750 = 2000 examples)
train_sents = data["train"]
val_sents = data["val"][:750]

combined_sents = train_sents + val_sents

# Each entry is read as a two-field record: the text and the file's own label.
# That label is discarded and every row is marked as class 1.
df_json = (
    pd.DataFrame(combined_sents, columns=["text", "dummy_label"])
    .drop(columns="dummy_label")
    .assign(label=1)
)


In [45]:
# --------- Load TXT file (label 0) ---------
# Read only the first 2000 lines rather than loading the whole file with
# readlines() and discarding the rest. `lines` therefore holds just those
# 2000 raw lines.
with open("dataset/all_wiki_sents.txt", "r", encoding="utf-8") as f:
    lines = list(islice(f, 2000))

# Strip surrounding whitespace (including the trailing newline)
txt_sents = [line.strip() for line in lines]

# Convert to DataFrame; every row from this file is class 0
df_txt = pd.DataFrame(txt_sents, columns=["text"])
df_txt["label"] = 0

In [46]:
# --------- Stack the JSON and TXT frames, then shuffle (fixed seed) ---------
combined2 = (
    pd.concat([df_json, df_txt], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [47]:
# Preview the first five shuffled rows
combined2.head(5)

Unnamed: 0,text,label
0,i wish to know your hobbies,1
1,career hitter with a 499,0
2,will you allow more spending on my visa,1
3,the enemy army was not destroyed,0
4,landaff lies fully within the connecticut rive...,0


In [48]:
# (rows, columns) of the JSON + wiki set
combined2.shape

(4000, 2)

In [49]:
# (rows, columns) of the set loaded from the gemini-data-gen files
combined1.shape

(1361, 2)

In [50]:
# Class balance of the second set (1 = JSON sentences, 0 = wiki sentences)
combined2["label"].value_counts()

1    2000
0    2000
Name: label, dtype: int64

In [52]:
# Merge both labelled sets into one frame and shuffle it with a fixed seed
final_df = (
    pd.concat([combined1, combined2], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [53]:
# (rows, columns) of the merged dataset
final_df.shape

(5361, 2)

In [55]:
# Preview the first five rows of the merged dataset
final_df.head(5)

Unnamed: 0,text,label
0,xml has two relevant concepts,0
1,go to the credit card site and check if my app...,1
2,Book a flight to Phoenix for the first week of...,1
3,this resulted in the ansiieee 1014 1987 specif...,0
4,1989 joined by amanda levete who arrives from ...,0


In [56]:
# Write the merged dataset to disk; index=False leaves the row index out of the file
csv_path = "final_dataset.csv"
final_df.to_csv(csv_path, index=False)
