In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the GPT biography results; the `api_out` column holds the raw timeline
# table text that the cells below parse.
df = pd.read_csv(filepath_or_buffer='gpt_bio/gpt_bio_result.csv')

In [3]:
# Show one raw timeline table (row label 81) exactly as stored in `api_out`;
# print() is used so the embedded newlines render as separate lines.
print(df.loc[81, 'api_out'])

Time Period | Location
--- | ---
1957-1975 | Detroit, Wayne County, Mich.
1975-1976 | Exeter, N.H.
1976-1977 | Detroit, Wayne County, Mich.
1977-1980 | Byfield, Mass.
1980-1984 | Ithaca, N.Y.
1984-1987 | Washington, D.C.
1987-1989 | (private practice)
1989-1990 | Michigan
1991-1992 | Michigan state house of representatives
1999-2003 | Michigan state house of representatives
2001 | Detroit, Mich.
2003-2010 | Michigan state senate
2011-2013 | Washington, D.C.
2012 | Michigan
2014 | Michigan


In [4]:
# Reformat each raw markdown timeline table into a flat list:
# [time_1, location_1, time_2, location_2, ...]

TABLE_HEADER = 'Time Period | Location\n--- | ---\n'


def parse_timeline(table_text):
    """Flatten one markdown timeline table into alternating time/location items.

    Parameters
    ----------
    table_text : str
        Raw table text: the `TABLE_HEADER` lines followed by one
        ``"<time> | <location>"`` row per line.

    Returns
    -------
    list
        ``[time, location, time, location, ...]``. Every row contributes
        exactly two items so times and locations can never drift out of
        alignment. A non-string input (e.g. a missing value) gives ``[]``.
    """
    if not isinstance(table_text, str):
        return []

    flat = []
    for row in table_text.replace(TABLE_HEADER, '').split('\n'):
        # Blank lines (e.g. a trailing newline) carry no timeline entry.
        if not row.strip():
            continue
        # Split on the FIRST separator only, so a location that itself contains
        # ' | ' stays in one piece; a row with no separator gets a NaN location.
        period, separator, location = row.partition(' | ')
        flat.append(period)
        flat.append(location if separator else np.nan)
    return flat


timelines = df.api_out.map(parse_timeline)
timelines

0     [1959-1977, Oak Park, Cook County, Ill., 1977-...
1     [1982-2000, Boston, Suffolk County, Mass., 200...
2     [1961-1979, New Hartford, Oneida County, N.Y.,...
3     [1955-1974, Atlanta, Fulton County, Ga., 1974-...
4     [1969-1987, Reading, Penn., 1987-1991, Albrigh...
                            ...                        
95    [1959-1981, Howard, Centre County, Pa., 1981-1...
96    [1971-1989, Dublin, Ireland; Hartsdale, NY, 19...
97    [1965-1983, Grand Rapids, Kent County, Michiga...
98    [1963-1981, Raleigh, Wake County, N.C., 1981-1...
99    [1947-1969, Marcus Hook, Delaware County, Pa.,...
Name: api_out, Length: 100, dtype: object

In [5]:
# Length of the longest flattened timeline = number of columns required.
# `default=0` avoids a ValueError from max() when there are no timelines.
num_cols = max((len(timeline) for timeline in timelines), default=0)

# Generate the column names as (time_k, location_k) pairs.
# source: https://stackoverflow.com/questions/44976363/generating-column-names-iteratively-in-pandas
# The pair count is rounded UP so that an odd-length timeline still has a
# column name for each of its elements (for even lengths this equals
# num_cols / 2, as before).
rng = range(1, (num_cols + 1) // 2 + 1)
new_cols = []
for i in rng:
    new_cols.append(f'time_{i}')
    new_cols.append(f'location_{i}')

# Column i holds the i-th element of every timeline; shorter timelines are
# padded with NaN. The mapping deliberately has its own name rather than
# `dict`, which would shadow the builtin for the rest of the session.
timeline_columns = {
    col_name: [
        timeline[i] if i < len(timeline) else np.nan
        for timeline in timelines
    ]
    for i, col_name in enumerate(new_cols)
}

# dictionary to dataframe
df_timeline = pd.DataFrame.from_dict(timeline_columns)

In [6]:
# Preview the first five rows of the wide time/location table.
df_timeline.head(n=5)

Unnamed: 0,time_1,location_1,time_2,location_2,time_3,location_3,time_4,location_4,time_5,location_5,...,time_11,location_11,time_12,location_12,time_13,location_13,time_14,location_14,time_15,location_15
0,1959-1977,"Oak Park, Cook County, Ill.",1977-1981,"Cambridge, Mass.",1981-1985,"New Haven, Conn.",1985-2021,Private practice and professor,2005-2008,"Cheshire, Conn.",...,,,,,,,,,,
1,1982-2000,"Boston, Suffolk County, Mass.",2000-2002,"Deep Springs, Calif.",2002-2004,"Chicago, Ill.",2004-2010,"Oxford, England",2010-2016,(working for US government),...,,,,,,,,,,
2,1961-1979,"New Hartford, Oneida County, N.Y.",1979-1983,"Hamilton Township, N.Y.",1983-1987,"Cincinnati, Ohio",1987-2011,(unknown),2011-2016,New York State Assembly,...,,,,,,,,,,
3,1955-1974,"Atlanta, Fulton County, Ga.",1974-1978,"Los Angeles, Calif.",1978-1984,"Medford, Mass.",1984-1988,"Atlanta, Ga.",1988-1992,Georgia state house of representatives,...,,,,,,,,,,
4,1969-1987,"Reading, Penn.",1987-1991,"Albright College, Reading, Penn.",1991-1993,University of Connecticut,1993-1999,Harvard University,1999-2006,(career),...,,,,,,,,,,


In [7]:
# Split every "time_k" column into "start_k" / "end_k" and assemble the result
# in (start_k, end_k, location_k) order.
split_columns = {}
for i in rng:
    # n=1 limits the split to the first '-', so one value with extra hyphens
    # cannot change the number of resulting columns for the whole column.
    # reindex guarantees both positions exist: when every value is a single
    # year (xxxx rather than xxxx-yyyy) there is no second piece, and the end
    # column is filled with NaN.
    parts = (
        df_timeline[f"time_{i}"]
        .str.split('-', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    split_columns[f"start_{i}"] = parts[0]
    split_columns[f"end_{i}"] = parts[1]
    split_columns[f"location_{i}"] = df_timeline[f"location_{i}"]

# Build the final frame in one step; dict insertion order gives the column order.
df_timeline = pd.DataFrame(split_columns, index=df_timeline.index)
df_timeline.head()

Unnamed: 0,start_1,end_1,location_1,start_2,end_2,location_2,start_3,end_3,location_3,start_4,...,location_12,start_13,end_13,location_13,start_14,end_14,location_14,start_15,end_15,location_15
0,1959,1977,"Oak Park, Cook County, Ill.",1977,1981,"Cambridge, Mass.",1981,1985,"New Haven, Conn.",1985,...,,,,,,,,,,
1,1982,2000,"Boston, Suffolk County, Mass.",2000,2002,"Deep Springs, Calif.",2002,2004,"Chicago, Ill.",2004,...,,,,,,,,,,
2,1961,1979,"New Hartford, Oneida County, N.Y.",1979,1983,"Hamilton Township, N.Y.",1983,1987,"Cincinnati, Ohio",1987,...,,,,,,,,,,
3,1955,1974,"Atlanta, Fulton County, Ga.",1974,1978,"Los Angeles, Calif.",1978,1984,"Medford, Mass.",1984,...,,,,,,,,,,
4,1969,1987,"Reading, Penn.",1987,1991,"Albright College, Reading, Penn.",1991,1993,University of Connecticut,1993,...,,,,,,,,,,


In [8]:
# Write the reshaped timeline table. The output directory is created first so
# the cell also works when `results/` does not exist yet (to_csv does not
# create missing directories).
output_path = Path('results/sample_timeline.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_timeline.to_csv(output_path, index=False)