In [1]:
import re
from collections import defaultdict
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
# Show full cell text in displayed frames (None removes the column-width limit),
# so long address strings are not truncated in the outputs below.
pd.set_option('display.max_colwidth', None)

In [3]:
# Input locations. hosp_dir is the directory whose spreadsheets are loaded in the
# next cell; pat_file is not referenced by any other cell visible in this notebook.
# NOTE(review): both are absolute, machine-specific paths - adjust before re-running elsewhere.
hosp_dir = '/mnt/d/books/iitm/agentBased/data/tn/covid_war_room/positive_cases/'
pat_file = '/mnt/d/books/iitm/agentBased/data/tn/covid_war_room/patient_data/combined_csv.csv'

In [4]:
# hdf = pd.read_excel(join(hosp_dir, 'NCDC_ICMR-report_1_To_50000_2021-10-21 15_29_42.xlsx'))
# Read at most MAX_FILES spreadsheets from hosp_dir into list_df.
# Entries are visited in sorted name order: os.listdir returns names in arbitrary
# order, so without sorting the loaded subset could differ between runs/machines.
MAX_FILES = 2
list_df = []
cnt = 0
for f in sorted(listdir(hosp_dir)):
    filepath = join(hosp_dir, f)
    if not isfile(filepath):
        # Skip sub-directories and anything else that is not a regular file.
        continue
    list_df.append(pd.read_excel(filepath))
    cnt += 1
    if cnt == MAX_FILES:
        break

In [5]:
# Stack the per-file frames into one table; ignore_index=True renumbers the rows 0..n-1.
hdf = pd.concat(list_df, ignore_index=True)

In [6]:
# Distinct values of the 'NCDC District' column in the loaded files.
hdf['NCDC District'].unique()

array(['CHENGALPATTU', 'CHENNAI'], dtype=object)

In [7]:
# Keep only the rows that have a 'Present Address'; row labels from hdf are retained.
not_nan_df = hdf[hdf['Present Address'].notna()]
# Number of rows that remain after the filter.
len(not_nan_df)

94787

In [8]:
# Build two lookups from the comma-separated districts file:
#   district_map: every listed name (first name or alternative) -> first name on its line
#   district_pop: first name on the line -> integer parsed from the third-from-last field
#                 (presumably the district population - confirm against the file)
district_path = '/mnt/d/books/iitm/agentBased/codes/covasim/models/data/tn_districts.txt'
district_map, district_pop = {}, {}
with open(district_path, 'r') as fp:
    for raw_line in fp:
        record = raw_line.strip()
        if record == '':
            # Blank lines carry no data.
            continue
        fields = [field.strip().lower() for field in record.split(',')]
        district_pop[fields[0]] = int(fields[-3])
        # Everything before the last three fields is a district name; the first
        # one is the canonical spelling, the others map onto it.
        names = fields[:-3]
        canonical = names[0]
        district_map.update((name, canonical) for name in names)

In [9]:
def create_word_vector_for_gen(sen, from_pos=1):
    """Turn a comma-separated text field into a set of word and word-pair tokens.

    Args:
        sen: The field value. Null values (as judged by ``pd.isnull``) yield an
            empty set; anything else is converted with ``str``.
        from_pos: When the text has more than one comma-separated segment, only
            ``segments[from_pos:]`` are used (e.g. ``-1`` keeps the last segment,
            ``0`` keeps all). Ignored when there is a single segment.

    Returns:
        A set holding every lower-cased word of the kept segments plus one
        ``"<word>_<next word>"`` token for each pair of consecutive words
        (pairs also span segment boundaries).
    """
    if pd.isnull(sen):
        return set()

    segments = str(sen).strip().split(',')
    if len(segments) > 1:
        segments = segments[from_pos:]

    tokens = []
    for segment in segments:
        # Punctuation becomes a space so it separates words instead of joining them.
        cleaned = re.sub(r"[,;\-_\'\":\.\?]", " ", segment).strip()
        tokens.extend(tok.strip().lower() for tok in cleaned.split(' ') if tok.strip())

    pairs = {f'{left}_{right}' for left, right in zip(tokens, tokens[1:])}
    return set(tokens) | pairs

In [10]:
def create_word_vector_for_est(sen):
    """Build word and word-pair tokens from the tail of a comma-separated text field.

    Only the last comma-separated segment is considered, and within it only the
    last three words.

    Args:
        sen: The field value. Null values (as judged by ``pd.isnull``) yield an
            empty set; anything else is converted with ``str``.

    Returns:
        A set with up to three lower-cased words and a ``"<word>_<next word>"``
        token for each consecutive pair among them.
    """
    if pd.isnull(sen):
        return set()

    last_segment = str(sen).strip().split(',')[-1]
    # Punctuation becomes a space so it separates words instead of joining them.
    cleaned = re.sub(r"[,;\-_\'\":\.\?]", " ", last_segment).strip()
    words = [tok.strip().lower() for tok in cleaned.split(' ') if tok.strip()]
    tail = words[-3:]

    pairs = [f'{left}_{right}' for left, right in zip(tail, tail[1:])]
    return set(tail).union(pairs)

In [11]:
def get_possible_district(address, district_map, from_pos):
    """Tokenise a text field and list the districts whose names appear in it.

    Args:
        address: Text passed to ``create_word_vector_for_gen``.
        district_map: Mapping of district name -> canonical district name.
        from_pos: Forwarded to ``create_word_vector_for_gen``.

    Returns:
        ``(tokens, districts)``: the token set, and the canonical district for
        every key of ``district_map`` present in that set. A district appears
        once per matching key, in ``district_map`` order.
    """
    tokens = create_word_vector_for_gen(address, from_pos=from_pos)
    matches = [canonical for name, canonical in district_map.items() if name in tokens]
    return (tokens, matches)


def get_district(address_districts, facility_districts, present_districts, district_pop):
    """Choose a single district from the candidates found in three fields of a row.

    Resolution order:
      1. Walk the address candidates in order. Each one is recorded once for
         every equal entry in ``present_districts``. The facility candidates are
         then scanned: one that has already been recorded is returned at once;
         one equal to the current address candidate is recorded.
      2. If anything was recorded, the first recorded district is returned.
      3. Otherwise the candidate with the largest ``district_pop`` value is
         taken from the first non-empty list among address, present, facility
         (the earliest candidate wins ties).

    Args:
        address_districts: Candidate districts from the address field.
        facility_districts: Candidate districts from the facility field.
        present_districts: Candidate districts from the village/town field.
        district_pop: Mapping of district -> comparable size value.

    Returns:
        The chosen district, or ``""`` when all three candidate lists are empty.
    """
    def population_based(possible_districts, district_pop):
        # Largest district_pop value wins; strict '<' keeps the earliest on ties.
        remaining = iter(possible_districts)
        best = next(remaining, "")
        for candidate in remaining:
            if (best == "") or (district_pop[best] < district_pop[candidate]):
                best = candidate
        return best

    # Loop variables deliberately avoid the name `pd`, which is the pandas alias.
    agreed = []
    for address_d in address_districts:
        agreed.extend(address_d for present_d in present_districts if present_d == address_d)
        for facility_d in facility_districts:
            if facility_d in agreed:
                return facility_d
            if facility_d == address_d:
                agreed.append(facility_d)
    if agreed:
        return agreed[0]

    # No agreement between fields: fall back to the size-based choice.
    for candidates in (address_districts, present_districts):
        if len(candidates) > 0:
            return population_based(candidates, district_pop)
    return population_based(facility_districts, district_pop)


def create_word_frequency(df, district_map, district_pop):
    """Label rows that name a district and collect per-district word frequencies.

    For each row, candidate districts are looked up in 'Present Address'
    (``from_pos=-1``), 'Facility where Patient Admitted' (``from_pos=0``) and
    'Present Village Town' (``from_pos=0``). Rows with no candidate in any field
    are reported as unknown; the rest are assigned a district via
    ``get_district`` and their address and village/town tokens are counted for
    that district.

    Args:
        df: DataFrame containing the three columns named above.
        district_map: Mapping of district name -> canonical district name.
        district_pop: Mapping of canonical district -> size value, used by
            ``get_district`` to break ties.

    Returns:
        ``(word_freq, dist_cnt, known_district, unknown_district)`` where
        ``word_freq[word][district]`` is the number of labelled rows of that
        district containing the word divided by the number of labelled rows of
        that district, ``dist_cnt[district]`` is the district's share of all
        labelled rows, ``known_district`` is a list of ``(row label, district)``
        and ``unknown_district`` is a list of row labels.
    """
    word_freq, dist_cnt = {}, defaultdict(int)
    known_district, unknown_district = [], []
    for idx, row in df.iterrows():
        address_words, address_districts = get_possible_district(row['Present Address'], district_map, from_pos=-1)

        # Facility tokens are not used as features; only the district hits matter.
        _, facility_districts = get_possible_district(row['Facility where Patient Admitted'], district_map, from_pos=0)

        present_words, present_districts = get_possible_district(row['Present Village Town'], district_map, from_pos=0)

        if not (address_districts or facility_districts or present_districts):
            unknown_district.append(idx)
            continue

        dist = get_district(address_districts, facility_districts, present_districts, district_pop)

        # Tokens form a set, so each word counts at most once per row.
        for w in address_words.union(present_words):
            if w not in word_freq:
                word_freq[w] = defaultdict(int)
            word_freq[w][dist] += 1
        dist_cnt[dist] += 1
        known_district.append((idx, dist))

    # Convert raw counts into per-district relative frequencies. This must run
    # before dist_cnt itself is normalised below.
    for per_district in word_freq.values():
        for d in per_district:
            per_district[d] /= dist_cnt[d]

    total_cnt = sum(dist_cnt.values())
    for d in dist_cnt:
        dist_cnt[d] /= total_cnt

    return (word_freq, dist_cnt, known_district, unknown_district)

In [12]:
def estimate_district(df, unknown, word_prob, dist_prob):
    """Assign the best-scoring district to rows that could not be labelled directly.

    Each candidate district is scored as ``log(dist_prob[d])`` plus, for every
    token of the row, ``log(word_prob[token][d])`` when that value is non-zero
    and a fixed penalty otherwise. Districts for which no token has a non-zero
    value are not considered.

    Args:
        df: DataFrame with 'Present Address' and 'Present Village Town'
            columns, indexable by the labels in ``unknown``.
        unknown: Iterable of row labels to estimate.
        word_prob: Mapping token -> mapping district -> frequency. It is only
            read; no entries are added to it.
        dist_prob: Mapping district -> prior weight.

    Returns:
        ``(estimated, new_unknown)``: a list of ``(row label, district)`` for
        rows that received an estimate and a list of row labels for those that
        did not (including rows with no tokens).
    """
    miss_penalty = -100  # log-score charged for a token never seen with the district
    estimated, new_unknown = [], []
    for idx in unknown:
        address_words = create_word_vector_for_est(df.loc[idx, 'Present Address'])
        present_words = create_word_vector_for_est(df.loc[idx, 'Present Village Town'])
        words = address_words.union(present_words)
        dist, logit = "", -np.inf
        for d in dist_prob.keys():
            logp, hits = 0, 0
            for w in words:
                # .get() instead of [d]: indexing a defaultdict would insert a
                # zero entry for every (token, district) pair that is probed.
                p = word_prob[w].get(d, 0) if w in word_prob else 0
                if p == 0:
                    logp += miss_penalty
                else:
                    logp += np.log(p)
                    hits += 1
            if hits == 0:
                # No token supports this district at all.
                continue
            logp += np.log(dist_prob[d])
            if logp > logit:
                dist, logit = d, logp
        if dist == "":
            new_unknown.append(idx)
        else:
            estimated.append((idx, dist))
    return (estimated, new_unknown)

In [13]:
def add_new_column_district(df, known, estimated):
    """Produce one district value per row of ``df`` from the two assignment lists.

    The first element of each pair is matched against row positions
    ``0 .. len(df) - 1``. The pairs may come in any order; entries whose
    position falls outside that range are ignored, and when a position occurs
    in both lists the ``known`` value is used.

    Args:
        df: Object whose ``len`` gives the number of rows.
        known: Iterable of ``(position, district)`` pairs.
        estimated: Iterable of ``(position, district)`` pairs.

    Returns:
        A list of length ``len(df)`` holding the district for each position, or
        ``np.nan`` where neither list has an entry.
    """
    # Later updates win, so load the estimates first and let known values override.
    by_position = dict(estimated)
    by_position.update(known)
    return [by_position.get(pos, np.nan) for pos in range(len(df))]

In [14]:
# Label the rows whose text names a district and collect per-district word
# frequencies; rows with no district hit come back in unknown_district.
word_freq, dist_cnt, known_district, unknown_district = create_word_frequency(not_nan_df, district_map, district_pop)

In [15]:
# Fraction of the address-bearing rows that were labelled directly.
len(known_district) / len(not_nan_df)

0.2548661736314051

In [16]:
# Score the unlabelled rows against the word frequencies. hdf is passed because
# the labels in unknown_district are hdf row labels (not_nan_df is a filter of hdf).
# `est` holds (row label, district) pairs; `un` holds rows that got no estimate.
est, un = estimate_district(hdf, unknown_district, word_freq, dist_cnt)

In [17]:
# Number of rows that still have no district after estimation.
len(un)

5829

In [18]:
# One value per hdf row: the known or estimated district, NaN where neither exists.
districts = add_new_column_district(hdf, known_district, est)

In [19]:
# Attach the per-row result to the table as a new column.
hdf['estimated_district'] = districts

In [20]:
# Distinct values that were assigned (includes NaN for unassigned rows).
set(districts)

{'ariyalur',
 'chengalpet',
 'chennai',
 'coimbatore',
 'cuddalore',
 'dharmapuri',
 'dindigul',
 'erode',
 'kallakurichi',
 'kancheepuram',
 'kanyakumari',
 'karur',
 'krishnagiri',
 'madurai',
 'mayiladuthurai',
 'nagapattinam',
 'namakkal',
 nan,
 'nilgiris',
 'perambalur',
 'pudukottai',
 'ramanathapuram',
 'ranipet',
 'salem',
 'sivagangai',
 'tenkasi',
 'thanjavur',
 'theni',
 'thirunelveli',
 'thiruvallur',
 'thiruvarur',
 'tirupathur',
 'tiruppur',
 'tiruvannamalai',
 'trichirappalli',
 'tuticorin',
 'vellore',
 'viluppuram',
 'virudhunagar'}

In [22]:
# Spot check: source text of the first 50 rows that were assigned to 'tuticorin'.
hdf[hdf['estimated_district'] == 'tuticorin'][['NCDC District', 'Present Address', 'Present Village Town', 'Facility where Patient Admitted']].head(n=50)

Unnamed: 0,NCDC District,Present Address,Present Village Town,Facility where Patient Admitted
32270,CHENGALPATTU,THOOTHUKUDI,,
45070,CHENGALPATTU,THOOTHUKUDI,,
85443,CHENNAI,"2/100/2,SIVAKALAI MAIN ROAD,TUTY",,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
85448,CHENNAI,"2/100/2,SIVAKALAI MAIN ROAD,TUTY",,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
86392,CHENNAI,KAYALPATTINAM,TUTICORIN,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
86397,CHENNAI,KAYALPATTINAM,TUTICORIN,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
86838,CHENNAI,"5/250,deepan geetha nagar,muthiah puram",,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
86918,CHENNAI,"10/50,EAST ST,OTTAPIDARAM,TUTY",,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
86954,CHENNAI,"10/50,east st,ottapidaram tuty",,THOOTHUKUDI MEDICAL COLEGE HOSPITAL
87173,CHENNAI,"132, SEIDUNGANALLUR",TUTICORIN,THOOTHUKUDI MEDICAL COLEGE HOSPITAL


In [None]:
# Spot check: rows admitted at one specific facility, with the district assigned to them.
hdf[hdf['Facility where Patient Admitted'] == 'SR Hospital,Ariyalur'][['NCDC District', 'Present Address', 'Present Village Town', 'estimated_district']]

In [None]:
# List the column names available in the combined table.
hdf.columns

In [None]:
# Distinct values of the 'Present Village Town' column.
hdf['Present Village Town'].unique()

In [None]:
# Scratch check: three empty lists are all falsy, so the combined test passes
# and 'OK' is printed.
a, b, c = [], [], []
if not any([a, b, c]):
    print('OK')
