In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the GPT biography results that the rest of the notebook reshapes.
# alternative input: 'gpt_bio/gpt_bio_result.csv'
bio_results_path = 'gpt_bio/gpt_bio_result4_new_prompt_full.csv'
df = pd.read_csv(bio_results_path)

In [3]:
# preview the first five rows of the loaded results
df.head(n=5)

Unnamed: 0,id,bio,api_out
0,C001059,a Representative from California; born in Fres...,Time Period | Location\n--- | ---\n1952-1970 |...
1,S001158,"(Brother of Kenneth Salazar), a Representative...",Time Period | Location\n--- | ---\n1953-1971 |...
2,M001155,A Representative from Florida; born in Fort My...,Time Period | Location\n--- | ---\n1967-1985 |...
3,W000797,a Representative from Florida; born in Forest ...,Time Period | Location\n--- | ---\n1966-1980 |...
4,P000591,A Representative from Georgia; born in Lansing...,Time Period | Location\n--- | ---\n1954-1972 |...


In [4]:
# Replace the API output with the manually checked (corrected) version
# wherever one exists. Only about 100 observations were manually checked,
# so every other row keeps its original API output.

df_corr = pd.read_csv('sample_gpt_bio_result4_new_prompt_checked.csv')

df_corr = df_corr[['id', 'correct']]

# Drop a 'correct' column left over from a previous run of this cell so the
# merge does not produce 'correct_x' / 'correct_y' when the cell is re-run.
df = df.drop(columns='correct', errors='ignore').merge(df_corr, on='id', how='left')

# Use the corrected text when present, otherwise fall back to the API output.
# Missing values are detected by value (fillna) rather than with `is np.nan`:
# an identity test only matches the np.nan singleton and fails for any other
# NaN float object, which would overwrite valid API output with NaN.
df['api_out'] = df['correct'].fillna(df['api_out'])

In [5]:
# row 42 is an instance whose header reads "Time period" instead of "Time Period"
print(df.loc[42, 'api_out'])


Time period | Location
--- | ---
1940-1955 | Bremerton, Kitsap County, Wash.
1955-1959 | Bremerton, Wash.
1959-1963 | Seattle, Wash.
1963-1968 | Seattle, Wash.
1968-1976 | (staff, United States Senator Warren G. Magnuson of Washington)
1977-2013 | (Congress)


In [6]:
# reformatting the timeline table

# Header and separator rows of the table, lower-cased so that casing variants
# such as "Time period | Location" are recognised on every row rather than
# by special-casing one row position.
_TABLE_HEADER_ROWS = {'time period | location', '--- | ---'}


def _parse_timeline(table_text):
    """Flatten one timeline table into [time, location, time, location, ...].

    Parameters
    ----------
    table_text : str
        Table text with one "time | location" row per line, optionally
        preceded by a "Time Period | Location" header and a "--- | ---"
        separator row.

    Returns
    -------
    list
        Alternating time and location cells. Header, separator and blank
        lines are skipped. A row without a " | " separator contributes its
        text as the time and np.nan as the location, so the list always has
        an even length. A non-string input (e.g. a missing value) yields [].
    """
    if not isinstance(table_text, str):
        return []

    cells = []
    for line in table_text.split('\n'):
        stripped = line.strip()
        # Blank lines would otherwise add a stray '' cell and shift every
        # following time/location pair by one position.
        if not stripped or stripped.lower() in _TABLE_HEADER_ROWS:
            continue
        # Split on the first separator only so each row yields exactly one pair.
        time, sep, location = stripped.partition(' | ')
        cells.append(time.strip())
        cells.append(location.strip() if sep else np.nan)
    return cells


# One flat list of cells per row of df, keeping df's index.
timelines = pd.Series(
    [_parse_timeline(text) for text in df.api_out],
    index=df.index,
    dtype=object,
)

In [7]:
# getting the longest timeline (0 when there are no timelines at all)
num_cols = max((len(timeline) for timeline in timelines), default=0)

# iteratively generating the column names
# source: https://stackoverflow.com/questions/44976363/generating-column-names-iteratively-in-pandas
# Round up so that an odd-length timeline still gets a complete
# time/location pair of columns instead of running past the end of new_cols.
rng = range(1, (num_cols + 1) // 2 + 1)
new_cols = []
for i in rng:
    new_cols.append('time_' + str(i))
    new_cols.append('location_' + str(i))

# converting the data to a column-name -> values mapping; timelines shorter
# than the longest one are padded with NaN.
# (named timeline_columns so the builtin `dict` is not shadowed)
timeline_columns = {
    col: [timeline[pos] if pos < len(timeline) else np.nan for timeline in timelines]
    for pos, col in enumerate(new_cols)
}

# mapping to dataframe; the explicit index keeps one row per timeline even
# when there are no columns to create
df_timeline = pd.DataFrame(timeline_columns, index=range(len(timelines)))

In [8]:
# preview the first five rows of the wide time/location table
df_timeline.head(n=5)

Unnamed: 0,time_1,location_1,time_2,location_2,time_3,location_3,time_4,location_4,time_5,location_5,...,time_10,location_10,time_11,location_11,time_12,location_12,time_13,location_13,time_14,location_14
0,1952-1970,"Fresno, Fresno County, Calif.",1970-1974,"Fresno, Calif.",1978-1994,(California state assembly),1994-2002,(California state senate),2005-present,(Congress),...,,,,,,,,,,
1,1953-1971,"Alamosa, Alamosa County, Colo.",1971-1973,(United States Army),1973-1976,(United States Army),1976-1981,"Alamosa, Colo.",1981-1999,(farmer; rancher; business owner),...,,,,,,,,,,
2,1967-1985,"Fort Myers, Lee County, Fla.",1985-1993,"Gainesville, Fla.",2000-2003,(Florida state house of representatives),2005-2013,(Congress),,,...,,,,,,,,,,
3,1966-1980,"Forest Hills, Queens County, N.Y.",1980-1984,"Dix Hills, N.Y.",1984-1988,"Gainesville, Fla.",1988-1990,"Gainesville, Fla.",1989-1992,"(staff, United States Representative Peter Deu...",...,,,,,,,,,,
4,1954-1972,"Lansing, Ingham County, Mich.",1972-1976,"Dearborn, Mich.",1976-1979,"Ann Arbor, Mich.",1997-2004,Georgia,2005-2017,(Congress),...,,,,,,,,,,


In [9]:
# Split every "time_i" column ("xxxx-yyyy") into "start_i" and "end_i".
for i in rng:
    # Split on the first hyphen only, so each value yields at most two parts
    # (e.g. "2005-present" -> "2005", "present"). A value that is only a year
    # "xxxx" gets that year as its start and a missing end. Handling this per
    # row keeps one irregular value from forcing the whole column into an
    # unsplit start and an empty end.
    parts = (
        df_timeline[f"time_{i}"]
        .astype(object)
        .str.split('-', n=1, expand=True)
        # guarantee both columns exist even if no value contained a hyphen
        .reindex(columns=[0, 1])
    )
    df_timeline[f"start_{i}"] = parts[0]
    df_timeline[f"end_{i}"] = parts[1]

# sorting the columns: start, end, location for each period in turn
new_order = []
for i in rng:
    new_order.append(f"start_{i}")
    new_order.append(f"end_{i}")
    new_order.append(f"location_{i}")

df_timeline = df_timeline[new_order]
df_timeline.head()

Unnamed: 0,start_1,end_1,location_1,start_2,end_2,location_2,start_3,end_3,location_3,start_4,...,location_11,start_12,end_12,location_12,start_13,end_13,location_13,start_14,end_14,location_14
0,1952,1970,"Fresno, Fresno County, Calif.",1970,1974,"Fresno, Calif.",1978,1994,(California state assembly),1994,...,,,,,,,,,,
1,1953,1971,"Alamosa, Alamosa County, Colo.",1971,1973,(United States Army),1973,1976,(United States Army),1976,...,,,,,,,,,,
2,1967,1985,"Fort Myers, Lee County, Fla.",1985,1993,"Gainesville, Fla.",2000,2003,(Florida state house of representatives),2005,...,,,,,,,,,,
3,1966,1980,"Forest Hills, Queens County, N.Y.",1980,1984,"Dix Hills, N.Y.",1984,1988,"Gainesville, Fla.",1988,...,,,,,,,,,,
4,1954,1972,"Lansing, Ingham County, Mich.",1972,1976,"Dearborn, Mich.",1976,1979,"Ann Arbor, Mich.",1997,...,,,,,,,,,,


In [10]:
# put the id column next to the timeline columns (rows are aligned on the index)
df_result = pd.concat(objs=[df['id'], df_timeline], axis='columns')

In [11]:
# Write the id + timeline table to disk without the index column.
# alternative output: 'results/sample_timeline.csv'
timeline_output_path = 'results/timeline_gpt4_new_prompt_full.csv'
df_result.to_csv(timeline_output_path, index=False)