In [1]:
import pandas as pd
import numpy as np
import camelot
import pdb
from pathlib import Path

In [5]:
# Base data folder, and the report PDF that every table below is read from.
DATA = Path('data')
NHRR = DATA.joinpath('nhrr', 'NHRR2019.pdf')

### Extract Medical College Data

In [7]:
# Read the tables on pages 270-282 of the report with camelot's 'lattice' flavor.
med_clgs = camelot.read_pdf(
    str(NHRR),
    pages='270-282',
    flavor='lattice',
)

In [8]:
# Show camelot's parsing report for the first extracted table.
med_clgs[0].parsing_report

{'accuracy': 100.0, 'whitespace': 14.29, 'order': 1, 'page': 270}

In [9]:
def extract_table(df, drop_rows=None, index_col='S.No.'):
    """Turn a raw extracted table into a frame indexed by its serial-number column.

    The first row of ``df`` supplies the column names (with every ' \\n'
    sequence removed), the rows at positions ``drop_rows`` are discarded,
    rows whose ``index_col`` cell is an empty string are filtered out, and
    ``index_col`` becomes the index.

    The input frame is never modified, so calling this repeatedly on the
    same table (e.g. when re-running a notebook cell) yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose first row holds the header labels as strings.
    drop_rows : int or list of int, optional
        Positional row numbers to discard (header and any sub-header rows).
        Defaults to ``[0]``.
    index_col : str, optional
        Cleaned header label of the column to use as the index.
        Defaults to ``'S.No.'``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``index_col``.

    Raises
    ------
    KeyError
        If ``index_col`` is not among the cleaned header labels.
    """
    if drop_rows is None:
        drop_rows = [0]

    # Header is always taken from the first row, regardless of drop_rows.
    header = [c.replace(' \n', '') for c in df.iloc[0]]

    # DataFrame.drop returns a new object, leaving the caller's frame intact.
    body = df.drop(df.index[drop_rows])
    body.columns = header

    # Rows with an empty serial-number cell are dropped.
    body = body[body[index_col] != '']

    return body.set_index(keys=index_col)

In [10]:
# Clean the first 13 extracted tables and stack them into one frame.
# NOTE(review): 13 presumably matches one table per page for pages 270-282 — confirm.
med_clgs_df = pd.concat(
    [extract_table(med_clgs[table_idx].df) for table_idx in range(13)]
)

In [11]:
# Display the combined medical-college table.
med_clgs_df

Unnamed: 0_level_0,State/UT,Name of Medical College,City/Town,Govt/Private,AdmissionCapacity,No. of beds in AttachedHospital
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Andaman & \nNicobar Islands,Andaman & Nicobar Islands Insitute of Medical ...,Port Blair,Govt.,100,460
2,Andhra Pradesh,ACSR Government Medical College Nellore,Nellore,Govt.,150,750
3,,"All India Institute of Medical Sciences, Manga...",Vijaywada,Govt.,50,
4,,Alluri Sitaram Raju Academy of Medical Science...,Eluru,Trust,150,1070
5,,"Andhra Medical College, Visakhapatnam",Visakhapatnam,Govt.,200,2017
...,...,...,...,...,...,...
525,,"North Bengal Medical College, Darjeeling",Siliguri,Govt.,150,599
526,,Raiganj Government Medical College & Hospital,Raiganj,Govt.,,
527,,Rampurhat Government Medical College & Hospital,Rampurhat,Govt.,,
528,,"RG Kar Medical College, Kolkata",Kolkata,Govt.,200,1210


In [12]:
# Blank 'State/UT' cells become NaN, are forward-filled from the last
# non-blank value above them, and then have embedded newlines removed.
med_clgs_df['State/UT'] = (
    med_clgs_df['State/UT']
    .replace(r'^\s*$', np.nan, regex=True)
    .ffill()
    .str.replace('\n', '')
)

In [13]:
# Preview the first rows after the 'State/UT' column has been filled in.
med_clgs_df.head()

Unnamed: 0_level_0,State/UT,Name of Medical College,City/Town,Govt/Private,AdmissionCapacity,No. of beds in AttachedHospital
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Andaman & Nicobar Islands,Andaman & Nicobar Islands Insitute of Medical ...,Port Blair,Govt.,100,460.0
2,Andhra Pradesh,ACSR Government Medical College Nellore,Nellore,Govt.,150,750.0
3,Andhra Pradesh,"All India Institute of Medical Sciences, Manga...",Vijaywada,Govt.,50,
4,Andhra Pradesh,Alluri Sitaram Raju Academy of Medical Science...,Eluru,Trust,150,1070.0
5,Andhra Pradesh,"Andhra Medical College, Visakhapatnam",Visakhapatnam,Govt.,200,2017.0


In [14]:
# Write the table to CSV; index=False leaves the index out of the file.
med_clgs_df.to_csv(
    DATA.joinpath('medical_college_list.csv'),
    index=False,
)

### Extract Doctors Data

In [15]:
# Read page 245, clean its first table, and save it as CSV.
doctors = camelot.read_pdf(
    str(NHRR),
    pages='245',
    flavor='lattice',
)
doctors_df = extract_table(doctors[0].df)

doctors_df.to_csv(DATA.joinpath('doctors.csv'), index=False)

In [16]:
def extract_table_govt(df):
    """Clean a raw extracted table whose serial-number column is headed 'S.No'.

    The first row of ``df`` supplies the column names (with every ' \\n'
    sequence removed) and is then discarded; rows whose 'S.No' cell is an
    empty string are filtered out, and 'S.No' becomes the index.

    The input frame is never modified, so the function can be called
    repeatedly on the same table with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose first row holds the header labels as strings.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'S.No'.

    Raises
    ------
    KeyError
        If no cleaned header label equals 'S.No'.
    """
    header = [c.replace(' \n', '') for c in df.iloc[0]]

    # DataFrame.drop returns a new object, leaving the caller's frame intact.
    body = df.drop(df.index[0])
    body.columns = header

    # Rows with an empty serial-number cell are dropped.
    body = body[body['S.No'] != '']

    return body.set_index(keys='S.No')

In [17]:
# Read page 250, clean its first table, and save it as CSV.
govt_doctors = camelot.read_pdf(
    str(NHRR),
    pages='250',
    flavor='lattice',
)
govt_doctors_df = extract_table_govt(govt_doctors[0].df)

govt_doctors_df.to_csv(DATA.joinpath('govt_doctors.csv'), index=False)

### Extract Nurses Data

In [18]:
def extract_table_nurses(df):
    """Clean a raw extracted table whose serial-number column is headed 'S. No'.

    The first row of ``df`` supplies the column names (with every ' \\n'
    sequence removed) and is then discarded; rows whose 'S. No' cell is an
    empty string are filtered out, and 'S. No' becomes the index.

    The input frame is never modified, so the function can be called
    repeatedly on the same table with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose first row holds the header labels as strings.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'S. No'.

    Raises
    ------
    KeyError
        If no cleaned header label equals 'S. No'.
    """
    header = [c.replace(' \n', '') for c in df.iloc[0]]

    # DataFrame.drop returns a new object, leaving the caller's frame intact.
    body = df.drop(df.index[0])
    body.columns = header

    # Rows with an empty serial-number cell are dropped.
    body = body[body['S. No'] != '']

    return body.set_index(keys='S. No')

In [19]:
# Read page 254, clean its first table, rename the columns, and save as CSV.
nurses = camelot.read_pdf(
    str(NHRR),
    pages='254',
    flavor='lattice',
)
nurses_df = extract_table_nurses(nurses[0].df)

nurses_df.columns = [
    'State/UT',
    'ANM',
    'RN & RM',
    'LH V',
    'Pharmacists',
]

nurses_df.to_csv(DATA.joinpath('nurses.csv'), index=False)

In [20]:
# Preview the first rows of the cleaned nurses table.
nurses_df.head()

Unnamed: 0_level_0,State/UT,ANM,RN & RM,LH V,Pharmacists
S. No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Andhra Pradesh*,138435,232621,2480,50247
2,Arunachal Pradesh,971,938,15,279
3,Assam,27925,22388,353,15462
4,Bihar*,8624,9413,511,24341
5,Chattisgarh*,13329,13048,1352,9716


### Extract Population Data

In [21]:
# Read page 36, clean its first table, label the year/sex columns, and save as CSV.
pop = camelot.read_pdf(
    str(NHRR),
    pages='36',
    flavor='lattice',
)
pop_df = extract_table(pop[0].df)
pop_df.columns = [
    'State/UT',
    '2018-Persons', '2018-Males', '2018-Females',
    '2019-Persons', '2019-Males', '2019-Females',
    '2020-Persons', '2020-Males', '2020-Females',
]

pop_df.to_csv(DATA.joinpath('population.csv'), index=False)

In [22]:
def extract_table_age(df):
    """Clean a raw extracted table that has two leading header rows.

    The first row of ``df`` supplies the column labels, the first two rows
    are discarded, and the remaining rows are renumbered from 0.

    The input frame is never modified, so the function can be called
    repeatedly on the same table with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with at least two rows; the first row holds the header
        labels.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0-based index and the first row's values
        as column labels.
    """
    header = df.iloc[0]

    # drop() and reset_index() both return new objects, so the caller's
    # frame keeps its original rows and columns.
    body = df.drop(df.index[[0, 1]]).reset_index(drop=True)
    body.columns = header

    return body

In [23]:
# Read page 40, clean its first table, label the area/sex columns, and save as CSV.
age = camelot.read_pdf(
    str(NHRR),
    pages='40',
    flavor='lattice',
)
age_df = extract_table_age(age[0].df)
age_df.columns = [
    'Age Group',
    'Total', 'Total-Males', 'Total-Females',
    'Rural-Total', 'Rural-Males', 'Rural-Females',
    'Urban-Total', 'Urban-Males', 'Urban-Females',
]

age_df.to_csv(DATA.joinpath('pop_by_age_2017.csv'), index=False)

### Death by Pneumonia

In [24]:
# Read page 139, clean its first table (dropping the two header rows),
# label the cases/deaths columns, and save as CSV.
pneumonia = camelot.read_pdf(
    str(NHRR),
    pages='139',
    flavor='lattice',
)
pneumonia_df = extract_table(pneumonia[0].df, drop_rows=[0, 1])
pneumonia_df.columns = [
    'State/UT.',
    'Male-Cases', 'Male-Deaths',
    'Female-Cases', 'Female-Deaths',
    'Total-Cases', 'Total-Deaths',
]

pneumonia_df.to_csv(DATA.joinpath('pneumonia_2018.csv'), index=False)

### Death by Acute Respiratory Infection

In [25]:
# Read page 121, clean its first table (dropping the two header rows),
# label the cases/deaths columns, and save as CSV.
ari = camelot.read_pdf(
    str(NHRR),
    pages='121',
    flavor='lattice',
)
ari_df = extract_table(ari[0].df, drop_rows=[0, 1])
ari_df.columns = [
    'State/UT.',
    'Male-Cases', 'Male-Deaths',
    'Female-Cases', 'Female-Deaths',
    'Total-Cases', 'Total-Deaths',
]

ari_df.to_csv(DATA.joinpath('ari_2018.csv'), index=False)

In [26]:
# Preview the first rows of the cleaned acute-respiratory-infection table.
ari_df.head()

Unnamed: 0_level_0,State/UT.,Male-Cases,Male-Deaths,Female-Cases,Female-Deaths,Total-Cases,Total-Deaths
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Andhra Pradesh,1577483,380,1572414,207,3149897,587
2,Arunachal Pradesh,17723,0,15945,0,33668,0
3,Assam,13634,182,10466,79,24100,261
4,Bihar,479878,0,496303,2,976181,2
5,Chhattisgarh,234641,12,224187,3,458828,15
