In [1]:
import re
from collections import defaultdict
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
# Disable column-width truncation so long address strings are shown in full
# when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [3]:
# Input locations. These are absolute paths on the author's machine and must be
# adjusted before the notebook is re-run elsewhere.
# hosp_dir: directory whose files are read with pd.read_excel in the loading cell.
hosp_dir = '/mnt/d/books/iitm/agentBased/data/tn/covid_war_room/positive_cases/'
# pat_file: path to a combined patient CSV; not referenced in the visible cells.
pat_file = '/mnt/d/books/iitm/agentBased/data/tn/covid_war_room/patient_data/combined_csv.csv'

In [4]:
# Load at most MAX_FILES Excel reports from hosp_dir into a list of DataFrames.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the choice of files reproducible across runs and machines.
MAX_FILES = 2  # number of report files to load
list_df = []
cnt = 0  # number of files loaded so far
for f in sorted(listdir(hosp_dir)):
    if cnt == MAX_FILES:
        break
    filepath = join(hosp_dir, f)
    # Skip sub-directories; only regular files are read.
    if isfile(filepath):
        list_df.append(pd.read_excel(filepath))
        cnt += 1

In [5]:
# Stack the loaded report frames into a single frame; ignore_index=True gives
# the result a fresh 0..n-1 index.
hdf = pd.concat(list_df, ignore_index=True)


100000

In [20]:
# Distinct values of the 'NCDC District' column in the loaded rows.
hdf['NCDC District'].unique()

array(['CHENGALPATTU', 'CHENNAI'], dtype=object)

In [6]:
# Keep only the rows that have a 'Present Address'; boolean-mask selection
# retains hdf's original index labels.
not_nan_df = hdf[hdf['Present Address'].notna()]
# Number of rows with a non-missing address.
len(not_nan_df)

94787

In [7]:
# Build two lookups from the comma-separated districts file:
#   district_map: every listed name (lower-cased) -> the first name on its line
#   district_pop: first name on the line -> integer parsed from the
#                 third-from-last field (presumably the population - confirm
#                 against the file format)
# The last three fields of each line are not treated as names.
district_path = '/mnt/d/books/iitm/agentBased/codes/covasim/models/data/tn_districts.txt'
district_map, district_pop = {}, {}
with open(district_path, 'r') as fp:
    for raw_line in fp:
        raw_line = raw_line.strip()
        if raw_line:  # blank lines are skipped
            fields = [part.strip().lower() for part in raw_line.split(',')]
            district_pop[fields[0]] = int(fields[-3])
            names = fields[:-3]
            canonical = names[0]
            # The canonical name maps to itself; every alias maps to it too.
            for alias in names:
                district_map[alias] = canonical

In [8]:
def create_word_vector(sen):
    """Turn a free-text address into a set of lower-cased word tokens.

    The text is split on commas. When there is more than one comma-separated
    part, the first part is dropped. In the remaining parts the characters
    , ; - _ ' " : . ? are replaced by spaces and the text is split on spaces.
    The result contains every word (unigram) plus every pair of consecutive
    words joined as ``'<w1>_<w2>'`` (bigram). Bigrams are formed over the
    whole word sequence, so they may span a comma boundary.

    Parameters
    ----------
    sen : object
        The address. Converted with ``str()``; ``None`` and float NaN are
        treated as missing.

    Returns
    -------
    set of str
        Unigram and bigram tokens; empty for missing or blank input.
    """
    # Missing values would otherwise be stringified and yield the bogus
    # tokens 'none' / 'nan'. (NaN is the only float that differs from itself.)
    if sen is None or (isinstance(sen, float) and sen != sen):
        return set()

    # str.split always returns at least one element, so no emptiness check
    # is needed here.
    parts = str(sen).strip().split(',')
    if len(parts) > 1:
        parts = parts[1:]
    parts = [re.sub(r"[,;\-_\'\":\.\?]", " ", x).strip() for x in parts]

    unigram = []
    for part in parts:
        unigram.extend(x.strip().lower() for x in part.split(' ') if x.strip())
    bigram = [f'{first}_{second}' for first, second in zip(unigram, unigram[1:])]
    return set(unigram + bigram)

In [9]:
def create_word_frequency(df, district_map, district_pop):
    """Label addresses that mention a district name and collect word statistics.

    For each row, the address tokens (from ``create_word_vector``) are matched
    against the keys of ``district_map``. Rows with no match are reported as
    unknown. Rows with a match are assigned one district:

    * if several candidates matched and the row's 'NCDC District', resolved
      through ``district_map``, is one of them, that district is used;
    * otherwise the candidate with the largest ``district_pop`` value is used
      (the first one encountered wins ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'Present Address' and 'NCDC District' columns.
    district_map : dict
        Lower-cased district name or alias -> canonical district name.
    district_pop : dict
        Canonical district name -> numeric weight used to break ties.

    Returns
    -------
    tuple
        ``(word_freq, dist_cnt, known_district, unknown_district)`` where
        ``word_freq[word][district]`` is the fraction of that district's
        labelled addresses containing ``word``, ``dist_cnt[district]`` is the
        district's share of all labelled addresses, ``known_district`` is a
        list of ``(index_label, district)`` and ``unknown_district`` is a list
        of index labels.
    """
    word_freq, dist_cnt = {}, defaultdict(int)
    known_district, unknown_district = [], []
    # Iterating the columns directly avoids building a Series per row.
    rows = zip(df.index, df['Present Address'], df['NCDC District'])
    for idx, address, ncdc_dist in rows:
        words = create_word_vector(address)
        possible_districts = [canon for name, canon in district_map.items() if name in words]
        if not possible_districts:
            unknown_district.append(idx)
            continue

        # Resolve the NCDC name to its canonical spelling so an alias still
        # matches the canonical names held in possible_districts.
        ncdc_canon = None
        if isinstance(ncdc_dist, str):
            ncdc_canon = district_map.get(ncdc_dist.strip().lower())

        dist = ""
        if len(possible_districts) > 1 and ncdc_canon in possible_districts:
            dist = ncdc_canon
        else:
            for d in possible_districts:
                if (dist == "") or (district_pop[dist] < district_pop[d]):
                    dist = d

        for w in words:
            if w not in word_freq:
                word_freq[w] = defaultdict(int)
            word_freq[w][dist] += 1
        dist_cnt[dist] += 1
        known_district.append((idx, dist))

    # Turn raw counts into per-district relative frequencies. This must run
    # before dist_cnt itself is normalised below.
    for per_district in word_freq.values():
        for d in per_district:
            per_district[d] /= dist_cnt[d]

    total_cnt = sum(dist_cnt.values())
    for k in dist_cnt:
        dist_cnt[k] /= total_cnt

    return (word_freq, dist_cnt, known_district, unknown_district)

In [10]:
def get_district(df, unknown, word_prob, dist_prob, unseen_logp=-100):
    """Estimate a district for addresses that had no direct district match.

    For every index label in ``unknown`` the address is tokenised with
    ``create_word_vector`` and each district in ``dist_prob`` is scored as
    ``log(dist_prob[d]) + sum(log(word_prob[w][d]))`` over the tokens. A token
    that is absent from ``word_prob`` or has zero frequency for the district
    contributes ``unseen_logp`` instead. Districts for which no token has a
    non-zero frequency are skipped; the highest-scoring remaining district
    wins (the first one encountered wins ties).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Present Address' column, indexed by the labels in
        ``unknown``.
    unknown : iterable
        Index labels of the rows to estimate.
    word_prob : dict
        word -> mapping of district -> relative frequency.
    dist_prob : dict
        district -> prior probability.
    unseen_logp : float, optional
        Log-score added for a token with no evidence for a district.

    Returns
    -------
    tuple
        ``(estimated, new_unknown)``: a list of ``(index_label, district)``
        and a list of index labels that could not be assigned.
    """
    estimated, new_unknown = [], []
    for idx in unknown:
        address = df.loc[idx, 'Present Address']
        words = create_word_vector(address)
        dist, logit = "", -np.inf
        for d, prior in dist_prob.items():
            logp, n_seen = 0.0, 0
            for w in words:
                # .get() avoids inserting zero entries into the caller's
                # defaultdicts, which plain indexing would do.
                p = word_prob[w].get(d, 0) if w in word_prob else 0
                if p == 0:
                    logp += unseen_logp
                else:
                    logp += np.log(p)
                    n_seen += 1
            # No token supports this district (also true for an empty token
            # set); counting is used instead of comparing float sums.
            if n_seen == 0:
                continue
            logp += np.log(prior)
            if logp > logit:
                dist, logit = d, logp
        if dist == "":
            new_unknown.append(idx)
        else:
            estimated.append((idx, dist))
    return (estimated, new_unknown)

In [11]:
def add_new_column_district(df, known, estimated):
    """Build a per-row list of district labels aligned with ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are looked up.
    known : iterable of (index_label, district)
        Directly matched districts; these take precedence.
    estimated : iterable of (index_label, district)
        Estimated districts for rows that had no direct match.

    Returns
    -------
    list
        Exactly ``len(df)`` entries in ``df.index`` order; rows found in
        neither input get ``np.nan``.
    """
    # A dictionary lookup does not depend on the inputs being sorted or on the
    # index being 0..n-1, and the result always has one entry per row.
    lookup = dict(estimated)
    lookup.update(known)  # known labels override estimates for the same row
    return [lookup.get(idx, np.nan) for idx in df.index]

In [12]:
# Label the addresses that mention a district name and build the word/district
# statistics from those labelled rows.
word_freq, dist_cnt, known_district, unknown_district = create_word_frequency(not_nan_df, district_map, district_pop)

In [25]:
# Fraction of non-missing addresses that were labelled by a direct
# district-name match.
len(known_district) / len(not_nan_df)

0.18232458037494592

In [13]:
# Estimate a district for the unmatched addresses. The index labels come from
# not_nan_df, which keeps hdf's labels, so looking them up in hdf is valid.
# `un` (rows that still could not be assigned) is not used in the visible cells.
est, un = get_district(hdf, unknown_district, word_freq, dist_cnt)

In [14]:
# One district label (or NaN) per row of hdf, combining the direct matches
# with the estimates.
districts = add_new_column_district(hdf, known_district, est)

In [15]:
# Adds the column to hdf in place; re-running this cell overwrites it.
hdf['estimated_district'] = districts

In [40]:
# Distinct assigned labels; NaN marks rows that received no district.
set(districts)

{'ariyalur',
 'chengalpet',
 'chennai',
 'coimbatore',
 'cuddalore',
 'dharmapuri',
 'dindigul',
 'erode',
 'kallakurichi',
 'kancheepuram',
 'kanyakumari',
 'karur',
 'krishnagiri',
 'madurai',
 'mayiladuthurai',
 'nagapattinam',
 'namakkal',
 nan,
 'nilgiris',
 'perambalur',
 'pudukottai',
 'ramanathapuram',
 'ranipet',
 'salem',
 'sivagangai',
 'tenkasi',
 'thanjavur',
 'theni',
 'thirunelveli',
 'thiruvallur',
 'thiruvarur',
 'tirupathur',
 'tiruppur',
 'tiruvannamalai',
 'tuticorin',
 'vellore',
 'viluppuram',
 'virudhunagar'}

In [23]:
# Spot-check the rows assigned to 'viluppuram' against their reported NCDC
# district and raw address.
# NOTE(review): the rendered output contains patient addresses - clear or
# redact it before sharing the notebook.
hdf[hdf['estimated_district'] == 'viluppuram'][['NCDC District', 'Present Address']]

Unnamed: 0,NCDC District,Present Address
283,CHENGALPATTU,"THERADI ST , KEZHMAVILANU THINDIVANAM , VILLUPURAM"
725,CHENGALPATTU,"98, PILLAYAR KOIL ST , PULLUR"
1416,CHENGALPATTU,VILLUPURAM THINDIVANAM
1502,CHENGALPATTU,VILLUPURAM
2051,CHENGALPATTU,BULDING WORKER THINDIVANAM
...,...,...
94492,CHENNAI,KOTTAKARAI VILLUPURAM TAMIL NADU
94522,CHENNAI,EZHIL NAGAR
97685,CHENNAI,"184, TVM PILLAYAR KOIL ST"
99019,CHENNAI,"35,EZHIL NAGAR. 600097CHENNAI, TAMILNADU, INDIAN"


In [42]:
# List the columns of hdf, including the newly added 'estimated_district'.
hdf.columns

Index(['NCDC State', 'NCDC District', 'Surveillance Id', 'Test Id (ICMR)',
       'Patient Age', 'Age In', 'Gender', 'Nationality',
       'Present Village Town', 'Present Address', 'Lab Name', 'Resp Inf',
       'Under Med Con', 'Travel History', 'Date Of Arrival In India',
       'Symptoms', 'Date Of Sample Tested', 'Date Of Onset Of Symptoms',
       'Patient Status Code', 'Facility where Patient Admitted', 'Camp Code',
       'State Code', 'District Code', 'Family Admitted Code', 'Outcome Code',
       'Migrated Country', 'Date Isolation', 'Total Contacts', 'Validated',
       'estimated_district'],
      dtype='object')

In [None]:
# Inspect the distinct values of the 'Date Isolation' column.
hdf['Date Isolation'].unique()