In [69]:
import json
import numpy as np
import pandas as pd
from training.data.datasets import load_json_data

In [70]:
# Load the ChatGPT-annotated samples. The encoding is pinned to UTF-8 (the
# encoding JSON files are expected to use) instead of relying on the platform's
# locale default, because the texts contain non-ASCII characters such as "–".
with open('../../data/raw_data_annotated_with_chatgpt.json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [71]:
# Inspect the first record to see the raw schema. Note that the sentence itself
# is nested one level down, under record["text"]["text"].
raw_data[0]

{'text': {'text': 'Is there a case for a basic income guarantee – Part 5\nThis is Part 5 in the mini-series discussing the relative merits of the basic income guarantee proposal and the Job Guarantee proposal. It finishes this part of our discussion. Today, I consider how society establishes a fair transition environment to cope with climate change and the impacts of computerisation etc.',
  'article_id': 1520,
  'sentence_id': 0,
  'title': 'Is there a case for a basic income guarantee – Part 5'},
 'label': 'universal_basic_income',
 'label2': 'universal_basic_income',
 'spans': [],
 'article_id': 1520,
 'sentence_id': 0,
 'answer': '',
 'priority': None,
 'score': None,
 'title': 'Is there a case for a basic income guarantee – Part 5',
 'gpt_first_response_is_invalid': False,
 'gpt_second_response_is_invalid': False}

In [72]:
# Turn the list of record dicts into a DataFrame with one row per record.
# The nested "text" dict is kept as a single object column at this stage.
df = pd.DataFrame(raw_data)
print(df.shape)
df.head()

(14369, 12)


Unnamed: 0,text,label,label2,spans,article_id,sentence_id,answer,priority,score,title,gpt_first_response_is_invalid,gpt_second_response_is_invalid
0,{'text': 'Is there a case for a basic income g...,universal_basic_income,universal_basic_income,[],1520,0,,,,Is there a case for a basic income guarantee –...,False,False
1,{'text': 'I outline a coherent adjustment fram...,universal_basic_income,universal_basic_income,[],1520,1,,,,Is there a case for a basic income guarantee –...,False,False
2,{'text': 'Adopting a basic income guarantee in...,universal_basic_income,universal_basic_income,[],1520,2,,,,Is there a case for a basic income guarantee –...,False,False
3,{'text': 'Just Transition Framework The Just T...,universal_basic_income,universal_basic_income,[],1520,3,,,,Is there a case for a basic income guarantee –...,False,False
4,{'text': '][Reference: (1996) ‘Sustainable dev...,universal_basic_income,universal_basic_income,[],1520,4,,,,Is there a case for a basic income guarantee –...,False,False


In [73]:
# Tally the "gpt_first_response_is_invalid" flag; value_counts reports the
# number of rows for every value (both False and True), not only True.
df["gpt_first_response_is_invalid"].value_counts()

gpt_first_response_is_invalid
False    12532
True      1837
Name: count, dtype: int64

In [74]:
# Same tally for the "gpt_second_response_is_invalid" flag.
df["gpt_second_response_is_invalid"].value_counts()

gpt_second_response_is_invalid
False    13602
True       767
Name: count, dtype: int64

In [75]:
# Keep only the rows whose second response is valid, then make "label2" the
# sole "label" column (the original "label" column is discarded).
# NOTE: do not re-run this cell on its own -- "label2" no longer exists after
# the first run, so a second run would drop the renamed "label" column.
df = (
    df.loc[df["gpt_second_response_is_invalid"].eq(False)]
    .drop(columns=["label"])
    .rename(columns={"label2": "label"})
)

In [76]:
# Discard rows labelled "unsure" or "irrelevant". The column is named "label"
# at this point (it was "label2" in the raw data).
df = df[~df["label"].isin(["unsure", "irrelevant"])]
print(df.shape)

(9453, 11)


In [77]:
# Load the previously existing training data through the project helper.
# Its columns serve below as the reference layout for the new data, and its
# rows are later merged with the ChatGPT-annotated rows.
old_data = load_json_data("../../training/datasets")
old_data.head()

Unnamed: 0,text,label,spans,article_id,sentence_id
0,When the fashion trio threeASFOUR debuted its...,3d_printed_clothes,"[fashion, 3D, printed]",0,0
1,When the fashion trio threeASFOUR debuted its ...,3d_printed_clothes,"[fashion, 3D, printed]",0,1
2,The trio at its helm — Gabi Asfour and his des...,3d_printed_clothes,"[3D, textiles]",0,7
3,They wanted to do the opposite: stretch clothi...,3d_printed_clothes,"[3D, textiles]",0,8
4,They dreamed of 3D-printing textiles that were...,3d_printed_clothes,"[3D, textiles]",0,9


In [78]:
# Align the new data with the old schema: keep just the columns old_data has
# (in its order) and replace each nested text dict by its inner "text" string.
df = df[old_data.columns].assign(
    text=lambda frame: frame["text"].apply(lambda record: record["text"])
)
df.head()

Unnamed: 0,text,label,spans,article_id,sentence_id
0,Is there a case for a basic income guarantee –...,universal_basic_income,[],1520,0
1,I outline a coherent adjustment framework to a...,universal_basic_income,[],1520,1
2,Adopting a basic income guarantee in this cont...,universal_basic_income,[],1520,2
3,Just Transition Framework\nThe Just Transition...,universal_basic_income,[],1520,3
4,][Reference: (1996) ‘Sustainable development: ...,universal_basic_income,[],1520,4


In [79]:
# Make sure both datasets use the same label names: rewrite the label values
# of the old data that have a different name in the new data.
column_map = {
    # old label -> new label
    "3d_printed_clothes": "3d_printed_apparel",
    "car_sharing": "car-sharing",
    "e_health": "e-health",
    "smart_food_management_kitchen_fridges_freezers": "smart_food_management/kitchen/fridges/freezers",
}
# Labels without an entry in the mapping are left untouched.
old_data["label"] = old_data["label"].apply(lambda name: column_map.get(name, name))

In [80]:
# Merge datasets: stack the ChatGPT-annotated rows on top of the old data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
# NOTE: re-running this cell on its own appends old_data a second time.
df = pd.concat([df, old_data], ignore_index=True)
print(df.shape)

(18869, 5)


In [81]:
# Drop any rows with duplicate texts. drop_duplicates keeps the first
# occurrence by default, so when the cells are run in order and a text exists
# in both sources, the ChatGPT-annotated row (concatenated first) survives.
df = df.drop_duplicates(subset=["text"])
print(df.shape)

(18788, 5)


In [82]:
# count samples per label
df.label.value_counts()

label
e-health                     1506
sustainable_fabrics          1399
scope_4/avoided_emissions    1358
autonomous_transport         1043
ecosophy                      955
                             ... 
virtual_reality                 3
sustainable_washing             2
train                           1
teleworking                     1
videoconferencing               1
Name: count, Length: 67, dtype: int64

In [83]:
# Drop all rows whose label has fewer than MIN_SAMPLES_PER_LABEL samples.
# (An earlier comment said "less than 3", but the filter keeps only labels
# with at least 10 rows; the threshold is named so the two cannot drift apart.)
MIN_SAMPLES_PER_LABEL = 10
df = df.groupby("label").filter(lambda group: len(group) >= MIN_SAMPLES_PER_LABEL)

In [84]:
# Save the merged, filtered data as parquet and print the number of distinct
# labels it contains.
# NOTE(review): the file is written into the same directory that was passed to
# load_json_data earlier in this notebook -- confirm the loader ignores
# .parquet files, otherwise later runs may pick this output up as input.
df.to_parquet("../../training/datasets/chat_gpt_annotated_data.parquet")
print(len(set(df.label)))

57
