# Transcripts

In [11]:
import pandas as pd

at = pd.read_csv('./data/npi/transcripts/all_transcripts.csv').dropna(subset='q_a')[['index','q_a']].rename(columns={'index':'org_idx'})
t_ref = pd.read_csv('./data/npi/transcripts/t_reference.csv')[['filename']]

In [12]:
at

Unnamed: 0,org_idx,q_a
0,0,QUESTION:\nSIR BRIAN LANGSTAFF:ANSWER:\nA.
1,0,QUESTION:\nQ.ANSWER:\nA.
2,0,"QUESTION:\n Mrs D, on 19 May 1986 you went int..."
3,0,QUESTION:\n You were admitted into hospital?\n...
4,0,QUESTION:\n And on 23 May you were told you ha...
...,...,...
53851,399,"QUESTION:\n Just so everybody is aware, JPAC..."
53852,399,"QUESTION:\n Dr Cave, if I could ask the same..."
53853,399,QUESTION:\n I turn to those who are represen...
53854,399,"QUESTION:\n Dr Cave, if I could turn to you,..."


In [13]:
t_ref

Unnamed: 0,filename
0,2019-05-10 TS_Mrs D [W1921].pdf
1,2019-05-06 TS_Michelle Baker [W1825].pdf
2,2020-02-25 TS_Psychosocial Experts (London).pdf
3,2019-05-01 TS_Carolyn Challis [W0622].pdf
4,2019-05-01 TS_Kate Ashton [W1416].pdf
...,...
398,2022-11-15 TS_Dr Susan Hopkins and Professor C...
399,"2022-11-16 TS_Professor James Neuberger, Profe..."
400,2023-01-17 TS_Oral Submissions Collins & Andre...
401,2023-01-17 TS_Steven Snowden KC & Andrew Bragg...


In [19]:
at['filename'] = at['org_idx'].apply(lambda x: t_ref.loc[x].filename)

In [40]:
from datetime import datetime
at['date'] =  at['filename'].apply(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))

In [267]:
from whoosh.fields import Schema, TEXT, DATETIME
from whoosh.analysis import StandardAnalyzer

schema = Schema(filename=TEXT(stored=True, sortable=True), title=TEXT(stored=True, sortable=True), date=DATETIME(stored=True, sortable=True), text=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)))

In [268]:
import os.path
from whoosh.index import create_in

if not os.path.exists("indices/transcript_index"):
    os.mkdir("indices/transcript_index")
ix = create_in("indices/transcript_index", schema)

In [93]:
from whoosh.index import open_dir

ix = open_dir("indices/transcript_index")

In [269]:
writer = ix.writer()
for i in range(len(at)):
    writer.add_document(
        filename = at.iloc[i].filename,
        title = ' '.join(at.iloc[i].filename.split(' ')[1:]),
        text = at.iloc[i].q_a,
        date = at.iloc[i].date.to_pydatetime() # need to_pydatetime
    )
writer.commit()

In [270]:
from whoosh.qparser import QueryParser
query_str = 'AIDs'
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

with ix.searcher() as searcher:
    results = searcher.search(query)
    print(results[0])

<Hit {'date': datetime.datetime(2022, 1, 25, 0, 0), 'filename': '2022-01-25 TS_Dr Bill Wagstaff.pdf', 'text': "QUESTION:\n I'm moving on now to AIDS, the issue of AIDS, and I 'm \ngoing to ask you, first of all, a handful of questi ons \nabout your knowledge of risk of AIDS. \nYou tell us in your statement that you became \naware of AIDS as a result of the report on three \npeople with haemophilia contracting AIDS; is that \nright?\nANSWER:\n Yes.\n", 'title': 'TS_Dr Bill Wagstaff.pdf'}>


# Written Statements

In [162]:
aw = pd.read_csv('./data/npi/written_statements/all_written_statements.csv').dropna(subset='answers')[['index','answers']].rename(columns={'index':'org_idx'}).reset_index(drop=True)
w_ref = pd.read_csv('./data/npi/written_statements/ws_reference.csv')[['fname']]

In [163]:
w_ref

Unnamed: 0,fname
0,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
1,2022-10-07 WITN7503001 WS-R_Tracey Carter (obo...
2,2020-04-24 W0797001 WS_[W0797].txt
3,2022-03-16 WITN6932013 WS-R_Professor Ludlam i...
4,2019-06-14 WITN1413002 WS_Stephen John Morrow.txt
...,...
3399,2019-11-12 WITN3849001 WS_[W3849] - 12 Nov 201...
3400,2022-08-12 WITN5610001 WS_Peter Michael Hughes...
3401,2002-09-29 WITN7152001 WS-1_Simon Tonkin - 29 ...
3402,2022-06-22 WITN7108001 WS_Ian Slaymaker - 22 J...


In [164]:
aw['filename'] = aw['org_idx'].apply(lambda x: w_ref.loc[x].fname)

In [165]:
aw

Unnamed: 0,org_idx,answers,filename
0,0,iirn;iria \n• • $ R \nI provide this statement...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
1,0,MFT had originally been set up by the Secreta...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
2,0,MFT did lobby DHSC for additional funding for...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
3,0,MFT did not undertake fundraising during my t...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
4,0,Due to the passage of time I cannot recall th...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
...,...,...,...
80973,3403,I.knew that i.GRoB]had suffered a bad acciden...,2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...
80974,3403,"GROBl,was taken to GROB ;(which no longer exi...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...
80975,3403,"Other Infections \n.As far as I am aware,GROB...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...
80976,3403,"5cm and one \nthat was 3cm, and I was able to ...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...


In [170]:
def parse_date_no_none(filename):
    date_str = filename.split(' ')[0]
    if date_str != 'None':
        return datetime.strptime(date_str, '%Y-%m-%d')

aw['date'] =  aw['filename'].apply(parse_date_no_none)

In [171]:
aw

Unnamed: 0,org_idx,answers,filename,date
0,0,iirn;iria \n• • $ R \nI provide this statement...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
1,0,MFT had originally been set up by the Secreta...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
2,0,MFT did lobby DHSC for additional funding for...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
3,0,MFT did not undertake fundraising during my t...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
4,0,Due to the passage of time I cannot recall th...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
...,...,...,...,...
80973,3403,I.knew that i.GRoB]had suffered a bad acciden...,2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20
80974,3403,"GROBl,was taken to GROB ;(which no longer exi...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20
80975,3403,"Other Infections \n.As far as I am aware,GROB...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20
80976,3403,"5cm and one \nthat was 3cm, and I was able to ...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20


In [271]:
if not os.path.exists("indices/written_statement_index"):
    os.mkdir("indices/written_statement_index")
ix = create_in("indices/written_statement_index", schema)

In [272]:
writer = ix.writer()
for i in range(len(aw)):
    writer.add_document(
        filename = aw.iloc[i].filename,
        title = ' '.join(aw.iloc[i].filename.split(' ')[1:]),
        text = aw.iloc[i].answers,
        date = aw.iloc[i].date.to_pydatetime() if not isinstance(aw.iloc[i].date, type(pd.NaT)) else None # need to_pydatetime
    )
writer.commit()

In [273]:
query_str = 'AIDs'
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

with ix.searcher() as searcher:
    results = searcher.search(query)
    print(results[0])



# National Archive 

In [231]:
na = pd.read_csv('./data/national_archives/nat_archive_files.csv').reset_index(drop=True)[['filename', 'sentences']]
na

Unnamed: 0,filename,sentences
0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...
1,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...
2,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...
3,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...
4,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...
...,...,...
152933,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co..."
152934,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...
152935,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston..."
152936,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...


In [232]:
na['raw_fname'] = na['filename'].apply(lambda x: '_'.join(x.split('/')[1].split('_')[1:]).replace('.txt',''))

In [233]:
na

Unnamed: 0,filename,sentences,raw_fname
0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...,JA 418-CHL-Z-1.pdf
1,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...,JA 418-CHL-Z-1.pdf
2,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...,JA 418-CHL-Z-1.pdf
3,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...,JA 418-CHL-Z-1.pdf
4,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...,JA 418-CHL-Z-1.pdf
...,...,...,...
152933,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co...",JA 418_CDR_Z_1.pdf
152934,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...,JA 418_CDR_Z_1.pdf
152935,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston...",JA 418_CDR_Z_1.pdf
152936,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...,JA 418_CDR_Z_1.pdf


In [237]:
DFs = []
for fname in os.listdir('./data/national_archives/'):
    if fname[0].isupper():
        df = pd.read_csv(f'./data/national_archives/{fname}').reset_index()
        df['match_fname'] = df.real_fname.apply(lambda x: x.split('- ')[1].split('/')[1])
        DFs.append(df)
tot = pd.concat(DFs).reset_index()[['date', 'match_fname']]

In [238]:
tot

Unnamed: 0,date,match_fname
0,1987-04-25,JA 418_6W_Z_1.pdf
1,1992-12-14,JA 418-CHL-Z-1.pdf
2,1993-12-20,JA 418-WB-Z.pdf
3,1989-05-08,JA 418_4R_Z_1.pdf
4,1986-01-08,JA 418_CCN_Z.pdf
...,...,...
109,1996-11-15,JA 418_7S_Z_1.pdf
110,1996-11-28,JA 418_CBK_Z_1.pdf
111,1997-03-11,JA 418-CLK-Z.pdf
112,1997-05-08,JA 418-CH5-Z-1.pdf


In [241]:
na.raw_fname.iloc[0]

'JA 418-CHL-Z-1.pdf'

In [245]:
tot.loc[tot.match_fname == 'JA 418-CHL-Z-1.pdf'].iloc[0].date

'1992-12-14'

In [249]:
na['date'] = na.raw_fname.apply(lambda x: datetime.strptime(tot.loc[tot.match_fname==x].iloc[0].date.strip(), '%Y-%m-%d'))

In [251]:
na

Unnamed: 0,filename,sentences,raw_fname,date
0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...,JA 418-CHL-Z-1.pdf,1992-12-14
1,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...,JA 418-CHL-Z-1.pdf,1992-12-14
2,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...,JA 418-CHL-Z-1.pdf,1992-12-14
3,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...,JA 418-CHL-Z-1.pdf,1992-12-14
4,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...,JA 418-CHL-Z-1.pdf,1992-12-14
...,...,...,...,...
152933,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co...",JA 418_CDR_Z_1.pdf,1995-12-15
152934,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...,JA 418_CDR_Z_1.pdf,1995-12-15
152935,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston...",JA 418_CDR_Z_1.pdf,1995-12-15
152936,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...,JA 418_CDR_Z_1.pdf,1995-12-15


In [274]:
if not os.path.exists("indices/national_archive_index"):
    os.mkdir("indices/national_archive_index")
ix = create_in("indices/national_archive_index", schema)

In [275]:
writer = ix.writer()
for i in range(len(na)):
    writer.add_document(
        filename = na.iloc[i].filename,
        title = ' '.join(na.iloc[i].filename.split('_')[1:]).replace('.txt',''),
        text = na.iloc[i].sentences,
        date = na.iloc[i].date.to_pydatetime() if not isinstance(na.iloc[i].date, type(pd.NaT)) else None # need to_pydatetime
    )
writer.commit()

In [276]:
query_str = 'AIDs'
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

with ix.searcher() as searcher:
    results = searcher.search_page(query, 1, 10)
    print(len(results))
    print(len(results.results))
    print(results.results[0])

5379
5379
<Hit {'date': datetime.datetime(1991, 7, 3, 0, 0), 'filename': 'Hep_C/Hep C_JA 418-WJ-Z-1.pdf.txt', 'text': '(ua Lo_d DR SUSAN LADER AIDS Unit Rm 211 FRH Ext 23220\nINFORMATION FOR DONORS About AIDS AIDS AIDS is caused by a virus that lives in blood: You can AIDS by  having sex with someone who has the AIDS virus.', 'title': 'C/Hep C JA 418-WJ-Z-1.pdf'}>
