# Transcripts

In [5]:
import pandas as pd

at = pd.read_csv('./data/npi/transcripts/all_transcripts.csv').dropna(subset='q_a')[['index','q_a']].rename(columns={'index':'org_idx'})
t_ref = pd.read_csv('./data/npi/transcripts/t_reference.csv')[['filename']]

In [6]:
at

Unnamed: 0,org_idx,q_a
0,0,QUESTION:\nSIR BRIAN LANGSTAFF:ANSWER:\nA.
1,0,QUESTION:\nQ.ANSWER:\nA.
2,0,"QUESTION:\n Mrs D, on 19 May 1986 you went int..."
3,0,QUESTION:\n You were admitted into hospital?\n...
4,0,QUESTION:\n And on 23 May you were told you ha...
...,...,...
53851,399,"QUESTION:\n Just so everybody is aware, JPAC..."
53852,399,"QUESTION:\n Dr Cave, if I could ask the same..."
53853,399,QUESTION:\n I turn to those who are represen...
53854,399,"QUESTION:\n Dr Cave, if I could turn to you,..."


In [7]:
t_ref

Unnamed: 0,filename
0,2019-05-10 TS_Mrs D [W1921].pdf
1,2019-05-06 TS_Michelle Baker [W1825].pdf
2,2020-02-25 TS_Psychosocial Experts (London).pdf
3,2019-05-01 TS_Carolyn Challis [W0622].pdf
4,2019-05-01 TS_Kate Ashton [W1416].pdf
...,...
398,2022-11-15 TS_Dr Susan Hopkins and Professor C...
399,"2022-11-16 TS_Professor James Neuberger, Profe..."
400,2023-01-17 TS_Oral Submissions Collins & Andre...
401,2023-01-17 TS_Steven Snowden KC & Andrew Bragg...


In [8]:
at['filename'] = at['org_idx'].apply(lambda x: t_ref.loc[x].filename)

In [9]:
from datetime import datetime
at['date'] =  at['filename'].apply(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))

In [21]:
from whoosh.fields import Schema, TEXT, DATETIME
from whoosh.analysis import StandardAnalyzer

schema = Schema(filename=TEXT(stored=True, sortable=True), title=TEXT(stored=True, sortable=True), date=DATETIME(stored=True, sortable=True), text=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)), full_text=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)))

In [22]:
import re
at['answers'] = at['q_a'].apply(lambda x: re.split('ANSWER:', x)[-1])

In [23]:
at

Unnamed: 0,org_idx,q_a,filename,date,answers
0,0,QUESTION:\nSIR BRIAN LANGSTAFF:ANSWER:\nA.,2019-05-10 TS_Mrs D [W1921].pdf,2019-05-10,\nA.
1,0,QUESTION:\nQ.ANSWER:\nA.,2019-05-10 TS_Mrs D [W1921].pdf,2019-05-10,\nA.
2,0,"QUESTION:\n Mrs D, on 19 May 1986 you went int...",2019-05-10 TS_Mrs D [W1921].pdf,2019-05-10,"\n I did, yes.\n"
3,0,QUESTION:\n You were admitted into hospital?\n...,2019-05-10 TS_Mrs D [W1921].pdf,2019-05-10,\n Yes.\n
4,0,QUESTION:\n And on 23 May you were told you ha...,2019-05-10 TS_Mrs D [W1921].pdf,2019-05-10,"\n That's right, yes.\n"
...,...,...,...,...,...
53851,399,"QUESTION:\n Just so everybody is aware, JPAC...","2022-11-16 TS_Professor James Neuberger, Profe...",2022-11-16,"\n Yes, it's a group of -- variety of blood\..."
53852,399,"QUESTION:\n Dr Cave, if I could ask the same...","2022-11-16 TS_Professor James Neuberger, Profe...",2022-11-16,"\n No, our role is to implement the recommen..."
53853,399,QUESTION:\n I turn to those who are represen...,"2022-11-16 TS_Professor James Neuberger, Profe...",2022-11-16,"\n Much to my, I don't know, surprise or\nho..."
53854,399,"QUESTION:\n Dr Cave, if I could turn to you,...","2022-11-16 TS_Professor James Neuberger, Profe...",2022-11-16,"\n Other than echoing those comments, I woul..."


In [25]:
import os.path
from whoosh.index import create_in

if not os.path.exists("indices/transcript_answers_index"):
    os.mkdir("indices/transcript_answers_index")
ix = create_in("indices/transcript_answers_index", schema)

In [26]:
from whoosh.index import open_dir

ix = open_dir("indices/transcript_answers_index")

In [27]:
writer = ix.writer()
for i in range(len(at)):
    writer.add_document(
        filename = at.iloc[i].filename,
        title = ' '.join(at.iloc[i].filename.split(' ')[1:]),
        text = at.iloc[i].answers,
        full_text = at.iloc[i].q_a,
        date = at.iloc[i].date.to_pydatetime() # need to_pydatetime
    )
writer.commit()

In [28]:
from whoosh.qparser import QueryParser
query_str = 'AIDs'
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

with ix.searcher() as searcher:
    results = searcher.search(query)
    print(results[0])

<Hit {'date': datetime.datetime(2019, 11, 1, 0, 0), 'filename': '2019-11-01 TS_Mary Grindley [W2336] only.pdf', 'full_text': "QUESTION:\n So we can see the date that it was was November 1 983, \nand your understanding, yours and John's, was this some \nform of AIDS reaction test.\nANSWER:\n Yes.  In his notes it doesn't say AIDS, but that -- we \ncalled it an AIDS test.26 \n", 'text': "\n Yes.  In his notes it doesn't say AIDS, but that -- we \ncalled it an AIDS test.26 \n", 'title': 'TS_Mary Grindley [W2336] only.pdf'}>


# Written Statements

In [162]:
aw = pd.read_csv('./data/npi/written_statements/all_written_statements.csv').dropna(subset='answers')[['index','answers']].rename(columns={'index':'org_idx'}).reset_index(drop=True)
w_ref = pd.read_csv('./data/npi/written_statements/ws_reference.csv')[['fname']]

In [163]:
w_ref

Unnamed: 0,fname
0,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
1,2022-10-07 WITN7503001 WS-R_Tracey Carter (obo...
2,2020-04-24 W0797001 WS_[W0797].txt
3,2022-03-16 WITN6932013 WS-R_Professor Ludlam i...
4,2019-06-14 WITN1413002 WS_Stephen John Morrow.txt
...,...
3399,2019-11-12 WITN3849001 WS_[W3849] - 12 Nov 201...
3400,2022-08-12 WITN5610001 WS_Peter Michael Hughes...
3401,2002-09-29 WITN7152001 WS-1_Simon Tonkin - 29 ...
3402,2022-06-22 WITN7108001 WS_Ian Slaymaker - 22 J...


In [164]:
aw['filename'] = aw['org_idx'].apply(lambda x: w_ref.loc[x].fname)

In [165]:
aw

Unnamed: 0,org_idx,answers,filename
0,0,iirn;iria \n• • $ R \nI provide this statement...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
1,0,MFT had originally been set up by the Secreta...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
2,0,MFT did lobby DHSC for additional funding for...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
3,0,MFT did not undertake fundraising during my t...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
4,0,Due to the passage of time I cannot recall th...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt
...,...,...,...
80973,3403,I.knew that i.GRoB]had suffered a bad acciden...,2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...
80974,3403,"GROBl,was taken to GROB ;(which no longer exi...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...
80975,3403,"Other Infections \n.As far as I am aware,GROB...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...
80976,3403,"5cm and one \nthat was 3cm, and I was able to ...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...


In [170]:
def parse_date_no_none(filename):
    date_str = filename.split(' ')[0]
    if date_str != 'None':
        return datetime.strptime(date_str, '%Y-%m-%d')

aw['date'] =  aw['filename'].apply(parse_date_no_none)

In [171]:
aw

Unnamed: 0,org_idx,answers,filename,date
0,0,iirn;iria \n• • $ R \nI provide this statement...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
1,0,MFT had originally been set up by the Secreta...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
2,0,MFT did lobby DHSC for additional funding for...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
3,0,MFT did not undertake fundraising during my t...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
4,0,Due to the passage of time I cannot recall th...,2021-02-18 WITN3108010 WS-3_Jan Barlow.txt,2021-02-18
...,...,...,...,...
80973,3403,I.knew that i.GRoB]had suffered a bad acciden...,2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20
80974,3403,"GROBl,was taken to GROB ;(which no longer exi...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20
80975,3403,"Other Infections \n.As far as I am aware,GROB...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20
80976,3403,"5cm and one \nthat was 3cm, and I was able to ...",2021-06-20 WITN5050001 WS_[W5050] - 20 Jun 202...,2021-06-20


In [271]:
if not os.path.exists("indices/written_statement_index"):
    os.mkdir("indices/written_statement_index")
ix = create_in("indices/written_statement_index", schema)

In [272]:
writer = ix.writer()
for i in range(len(aw)):
    writer.add_document(
        filename = aw.iloc[i].filename,
        title = ' '.join(aw.iloc[i].filename.split(' ')[1:]),
        text = aw.iloc[i].answers,
        date = aw.iloc[i].date.to_pydatetime() if not isinstance(aw.iloc[i].date, type(pd.NaT)) else None # need to_pydatetime
    )
writer.commit()

In [273]:
query_str = 'AIDs'
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

with ix.searcher() as searcher:
    results = searcher.search(query)
    print(results[0])



# National Archive 

In [231]:
na = pd.read_csv('./data/national_archives/nat_archive_files.csv').reset_index(drop=True)[['filename', 'sentences']]
na

Unnamed: 0,filename,sentences
0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...
1,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...
2,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...
3,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...
4,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...
...,...,...
152933,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co..."
152934,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...
152935,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston..."
152936,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...


In [232]:
na['raw_fname'] = na['filename'].apply(lambda x: '_'.join(x.split('/')[1].split('_')[1:]).replace('.txt',''))

In [233]:
na

Unnamed: 0,filename,sentences,raw_fname
0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...,JA 418-CHL-Z-1.pdf
1,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...,JA 418-CHL-Z-1.pdf
2,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...,JA 418-CHL-Z-1.pdf
3,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...,JA 418-CHL-Z-1.pdf
4,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...,JA 418-CHL-Z-1.pdf
...,...,...,...
152933,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co...",JA 418_CDR_Z_1.pdf
152934,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...,JA 418_CDR_Z_1.pdf
152935,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston...",JA 418_CDR_Z_1.pdf
152936,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...,JA 418_CDR_Z_1.pdf


In [237]:
DFs = []
for fname in os.listdir('./data/national_archives/'):
    if fname[0].isupper():
        df = pd.read_csv(f'./data/national_archives/{fname}').reset_index()
        df['match_fname'] = df.real_fname.apply(lambda x: x.split('- ')[1].split('/')[1])
        DFs.append(df)
tot = pd.concat(DFs).reset_index()[['date', 'match_fname']]

In [238]:
tot

Unnamed: 0,date,match_fname
0,1987-04-25,JA 418_6W_Z_1.pdf
1,1992-12-14,JA 418-CHL-Z-1.pdf
2,1993-12-20,JA 418-WB-Z.pdf
3,1989-05-08,JA 418_4R_Z_1.pdf
4,1986-01-08,JA 418_CCN_Z.pdf
...,...,...
109,1996-11-15,JA 418_7S_Z_1.pdf
110,1996-11-28,JA 418_CBK_Z_1.pdf
111,1997-03-11,JA 418-CLK-Z.pdf
112,1997-05-08,JA 418-CH5-Z-1.pdf


In [241]:
na.raw_fname.iloc[0]

'JA 418-CHL-Z-1.pdf'

In [245]:
tot.loc[tot.match_fname == 'JA 418-CHL-Z-1.pdf'].iloc[0].date

'1992-12-14'

In [249]:
na['date'] = na.raw_fname.apply(lambda x: datetime.strptime(tot.loc[tot.match_fname==x].iloc[0].date.strip(), '%Y-%m-%d'))

In [279]:
na['cat'] = na.filename.apply(lambda x: x.split('/')[0])

In [284]:
na

Unnamed: 0,filename,sentences,raw_fname,date,cat
0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...,JA 418-CHL-Z-1.pdf,1992-12-14,HIV
1,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...,JA 418-CHL-Z-1.pdf,1992-12-14,HIV
2,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...,JA 418-CHL-Z-1.pdf,1992-12-14,HIV
3,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...,JA 418-CHL-Z-1.pdf,1992-12-14,HIV
4,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...,JA 418-CHL-Z-1.pdf,1992-12-14,HIV
...,...,...,...,...,...
152933,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co...",JA 418_CDR_Z_1.pdf,1995-12-15,Litigation and Compensation
152934,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...,JA 418_CDR_Z_1.pdf,1995-12-15,Litigation and Compensation
152935,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston...",JA 418_CDR_Z_1.pdf,1995-12-15,Litigation and Compensation
152936,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...,JA 418_CDR_Z_1.pdf,1995-12-15,Litigation and Compensation


In [306]:
na.cat.unique()

array(['HIV', 'Haemophilia', 'Hep_C', 'Litigation and Compensation'],
      dtype=object)

In [287]:
schema = Schema(
    filename=TEXT(stored=True, sortable=True), 
    title=TEXT(stored=True, sortable=True), 
    date=DATETIME(stored=True, sortable=True), 
    category = TEXT(stored=True, sortable=True),
    text=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None))
    )

In [288]:
if not os.path.exists("indices/national_archive_index"):
    os.mkdir("indices/national_archive_index")
ix = create_in("indices/national_archive_index", schema)

In [289]:
writer = ix.writer()
for i in range(len(na)):
    writer.add_document(
        filename = na.iloc[i].filename,
        title = ' '.join(na.iloc[i].filename.split('_')[1:]).replace('.txt',''),
        text = na.iloc[i].sentences,
        category = na.iloc[i]['cat'],
        date = na.iloc[i].date.to_pydatetime() if not isinstance(na.iloc[i].date, type(pd.NaT)) else None # need to_pydatetime
    )
writer.commit()

In [294]:
from whoosh import sorting
cats = sorting.FieldFacet("category")

In [308]:
query_str = 'AIDs'
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

with ix.searcher() as searcher:
    results = searcher.search_page(query, 1, groupedby={"category":cats})
    for hc in results.results.groups()['Hep_C']:
        print(searcher.stored_fields(hc))

{'category': 'Hep_C', 'date': datetime.datetime(1991, 7, 3, 0, 0), 'filename': 'Hep_C/Hep C_JA 418-WJ-Z-1.pdf.txt', 'text': '(ua Lo_d DR SUSAN LADER AIDS Unit Rm 211 FRH Ext 23220\nINFORMATION FOR DONORS About AIDS AIDS AIDS is caused by a virus that lives in blood: You can AIDS by  having sex with someone who has the AIDS virus.', 'title': 'C/Hep C JA 418-WJ-Z-1.pdf'}
{'category': 'Hep_C', 'date': datetime.datetime(1986, 1, 8, 0, 0), 'filename': 'Hep_C/Hep C_JA 418_CCN_Z.pdf.txt', 'text': 'velop AIDS-related conditions or AIDS itseli .', 'title': 'C/Hep C JA 418 CCN Z.pdf'}
{'category': 'Hep_C', 'date': datetime.datetime(1994, 11, 18, 0, 0), 'filename': 'Hep_C/Hep C_JA 418-CJF-Z-1.pdf.txt', 'text': 'AIDS by developed AIDS by this date (specificity 85%) .', 'title': 'C/Hep C JA 418-CJF-Z-1.pdf'}
{'category': 'Hep_C', 'date': datetime.datetime(1989, 5, 8, 0, 0), 'filename': 'Hep_C/Hep C_JA 418_4R_Z_1.pdf.txt', 'text': 'Recovery of AIDS-associated retroviruses from Int J Cancer 1984; 33.

# Adding context

In [61]:
from whoosh.index import open_dir

ix = open_dir('./indices/national_archive_index/')

In [6]:
query_str = 'stigma'
cat_choice = 'Hep_C'

In [63]:
import pandas as pd

nat_archive = pd.read_csv('./data/national_archives/nat_archive_files.csv')

In [64]:
nat_archive

Unnamed: 0.1,Unnamed: 0,filename,sentences
0,0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,2 References\nPinukz Ms Wilkinson PS / Perm Se...
1,0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,0Opm on Tuesday 15 December at Lancaster House...
2,0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,55 _ Delegates will be greeted individually as...
3,0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope that all attending will find the symposiu...
4,0,HIV/HIV AIDS_JA 418-CHL-Z-1.pdf.txt,Hope it will also give an opportunity for Cent...
...,...,...,...
152933,112,Litigation and Compensation/Litigation and Com...,"users, to produce information leaflets that co..."
152934,112,Litigation and Compensation/Litigation and Com...,298858 very they they way TEL:\n(478 PRiMARY C...
152935,112,Litigation and Compensation/Litigation and Com...,"Primary Care Directorate; Ditton Ward, Preston..."
152936,112,Litigation and Compensation/Litigation and Com...,48 of men reporting injecting in the last five...


In [29]:
pd.read_csv('./data/npi/transcripts/all_transcripts.csv')

Unnamed: 0.1,Unnamed: 0,index,q_a
0,0,0,QUESTION:\nSIR BRIAN LANGSTAFF:ANSWER:\nA.
1,1,0,QUESTION:\nQ.ANSWER:\nA.
2,2,0,"QUESTION:\n Mrs D, on 19 May 1986 you went int..."
3,3,0,QUESTION:\n You were admitted into hospital?\n...
4,4,0,QUESTION:\n And on 23 May you were told you ha...
...,...,...,...
53854,53854,399,"QUESTION:\n Dr Cave, if I could turn to you,..."
53855,53855,399,QUESTION:\n Professor Bellamy?\nANSWER:\n ...
53856,53856,400,
53857,53857,401,


In [30]:
pd.read_csv('./data/npi/written_statements/all_written_statements.csv')

Unnamed: 0.1,Unnamed: 0,index,answers
0,0,0,iirn;iria \n• • $ R \nI provide this statement...
1,1,0,MFT had originally been set up by the Secreta...
2,2,0,MFT did lobby DHSC for additional funding for...
3,3,0,MFT did not undertake fundraising during my t...
4,4,0,Due to the passage of time I cannot recall th...
...,...,...,...
81140,81140,3403,I.knew that i.GRoB]had suffered a bad acciden...
81141,81141,3403,"GROBl,was taken to GROB ;(which no longer exi..."
81142,81142,3403,"Other Infections \n.As far as I am aware,GROB..."
81143,81143,3403,"5cm and one \nthat was 3cm, and I was able to ..."


In [65]:
from whoosh.qparser import QueryParser
from whoosh import sorting

cats = sorting.FieldFacet("category")
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)
searches = [q.lower() for q in query_str.split(' ') if (q != 'AND') and (q != 'OR') and (q != 'NOT')]

with ix.searcher() as searcher:
    results = searcher.search_page(query, 1, 10//2, groupedby=cats) # sortedby=None works! to add later
    hits = results.results.groups()[cat_choice]
    for res in set(hits[0:10]):
        r = searcher.stored_fields(res)
        # print(r)
        res_idx = nat_archive.loc[(nat_archive.sentences==r['text']) & (nat_archive.filename==r['filename'])].index[0]
        print(list(nat_archive.iloc[res_idx-1:res_idx].sentences))
        # print(r['text'])
        print(list(nat_archive.iloc[res_idx:res_idx+2].sentences))
        print('\n')


['Peer  support was identified as important, given the sense of isolation experienced by many people.']
["The research has uncovered a disturbing catalogue of prejudice, stigma and discrimination facing many people with hepatitis C. The public's lack of awareness, the sense of isolation experienced by  many and   discrimination/stigma  facing  individuals, partners and families compound the social and health problems associated with hepatitis C good for , they they\nThe recommendations have been put together from direct requests made during the research, using examples of god practice at centres and through the interpretation of other feedback.", 'In some ways, the research has served as the mouthpiece of people living with hepatitis C. [ hope that the report does justice to the experiences which people have shared with me during the course of the  research and serves to raise   awareness  of hepatitis C and its  impact On individuals and others affected by it.']


['Aspects of hepatit

In [8]:
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir('./indices/transcript_index/')
parser = QueryParser("text", ix.schema)
query = parser.parse(query_str)

In [24]:
cats = None
with ix.searcher() as searcher:
    results = searcher.search(query, groupedby=cats, limit=None)
    # groups = results.groups()
    # hits = list(set(groups[cat_choice]))
    # results = list(results)
    print(type(results))
    for i, res in enumerate(results[20:30]):
        # r = searcher.stored_fields(res)
        print(i)
        print(res['text'])
        print('\n')

<class 'whoosh.searching.Results'>
0
QUESTION:
 You've spoken a couple of times about stigma that  you 
have faced from the medical and dental profession. 
ANSWER:
 Yes.



1
QUESTION:
 And on that point Lord Archer is saying, well, that
may be to do with the stigma.
ANSWER:
 Yes, exactly.  Yes, yes.



2
QUESTION:
   So the reasons people used the 
intermediary service, if I can put it like that, 
were for reasons of confidentiality to keep  16 
their identity private?
ANSWER:
 Absolutely, and that of their families, 
obviously, which is still around stigma and the 
fear of that stigma.  And also confidence. 
I think they felt that having an interview in 
their own homes generally, as Jackie explained, 
kept it, if you like, much more informal and an 
ability to have some control over that setting. 



3
QUESTION:
 And you have explained in your statement that was y our 
first sign of any real stigma and isolation from 
medical practitioners -- 
ANSWER:
 Yes. 



4
QUESTION:
 You talk

In [64]:
data = pd.read_csv('./data/all_written_statements.csv').rename(columns={'index':'doc_index', 'answers':'passage'})
# data = data.dropna(subset='passage')

test_find = """Stigma 
. Please explain, to the best of your ability, any stigma associatedwith 
Thalassaemla both In wider society and within certain communities. """
res_idx = int(data.loc[data.passage.str.contains(test_find.strip(), regex=False, na=False)].index[0])
res_idx

30619

In [68]:
data.iloc[res_idx].passage

' Stigma \n. Please explain, to the best of your ability, any stigma associatedwith \nThalassaemla both In wider society and within certain communities. \n.'

In [69]:
data.iloc[30618].passage

' All the information published is available on our website. We are also unable to \nprovide any additional information on this. \n'

In [70]:
data.iloc[30620].passage

' Thalassaemia major is an inherited severe blood disorder and we have been told \nthat in most communities, the diagnosis of the condition was hidden from relatives, \nfriends and the wider community. \n,'

# Save button

In [2]:
from fpdf import FPDF

pdf = FPDF()
pdf.multi_cell??

[0;31mSignature:[0m
[0mpdf[0m[0;34m.[0m[0mmulti_cell[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mw[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mh[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtxt[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mborder[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malign[0m[0;34m=[0m[0;34m<[0m[0mAlign[0m[0;34m.[0m[0mJ[0m[0;34m:[0m [0;34m'JUSTIFY'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfill[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplit_only[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlink[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mln[0m[0;34m=[0m[0;34m'DEPRECATED'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_line_height[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmarkdown[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0

In [4]:
import os
os.path.exists('DejaVuSerifCondensed.ttf')

True