In [1]:
from sklearn.cluster import KMeans
import requests, pandas as pd
import numpy as np
import colorsys, os, math
from matplotlib import colors
import time, sqlite3, datetime
from sklearn import preprocessing

In [2]:
def parseDF(url, json_column, drop_columns, timeout=30):
    """Download the state-wise JSON feed and flatten it into one DataFrame.

    The payload is iterated as ``{state name: state record}``; each state
    record is turned into a DataFrame and the dictionaries stored under
    ``json_column`` are expanded into individual columns with
    ``pd.json_normalize``.

    Parameters
    ----------
    url : str
        Address of the JSON feed.
    json_column : str
        Column of every per-state frame that holds the nested dictionaries.
    drop_columns : list of str
        Columns removed from every per-state frame after the expansion.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per entry of the per-state frames, with the lower-cased
        ``statecode`` (set to the payload key of the state) and
        ``Constituency`` (the former row label) columns. Rows whose
        ``active`` value is negative, NaN or infinite are removed; the
        index keeps the gaps left by the removed rows.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a non-2xx HTTP status.
    """
    # fetch json; without a timeout a stalled connection blocks the caller
    # forever, and raise_for_status reports HTTP errors as what they are
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    allIndiaData = response.json()

    # extract data from json into dataframe
    stateCoronaDfs = []
    for statename, stateData in allIndiaData.items():
        stateCoronaDf = pd.DataFrame(stateData)

        # normalize json columns and split them into individual columns
        normalizedColumns = pd.json_normalize(stateCoronaDf[json_column].tolist())
        normalizedColumns.index = stateCoronaDf.index
        stateCoronaDf = stateCoronaDf.drop(columns=[json_column])
        stateCoronaDf[normalizedColumns.columns] = normalizedColumns[normalizedColumns.columns]
        if len(drop_columns):
            stateCoronaDf = stateCoronaDf.drop(columns=drop_columns)

        # item assignment creates the column when the record did not carry one;
        # attribute assignment would silently set a plain Python attribute
        stateCoronaDf['statecode'] = statename
        stateCoronaDfs.append(stateCoronaDf)

    # final dataframe
    coronaDf = pd.concat(stateCoronaDfs)
    coronaDf['Constituency'] = coronaDf.index
    coronaDf = coronaDf.reset_index(drop=True)

    coronaDf['Constituency'] = coronaDf['Constituency'].str.lower()
    coronaDf['statecode'] = coronaDf['statecode'].str.lower()

    # drop row if active case is negative, nan or infinity
    # (NaN fails the >= comparison, +/-inf fails isfinite)
    active = coronaDf['active']
    usable = (active >= 0) & np.isfinite(active)

    # return an independent frame so callers can add columns without
    # SettingWithCopyWarning
    return coronaDf[usable].copy()


def _get_best_k(Ks, Costs):
    
    # elbow-method for best K selection
    points=np.array(list(zip(Ks, Costs)))
    A=points[0]
    B=points[-1]    
    distFromLine_AB = [np.linalg.norm(np.cross(B-A, A-point))/np.linalg.norm(B-A) 
                 for point in points]
    
    return int(Ks[np.argmax(distFromLine_AB)])


def choose_colors(num_colors):
    """Build ``num_colors`` pastel colours with evenly spaced hues.

    Hues are ``0, 1/n, 2/n, ...`` of the colour wheel, lightness is fixed at
    0.85 and saturation is drawn from ``[0.90, 1.00)`` with ``np.random.rand``
    (one draw per colour, so the result depends on NumPy's global RNG state).

    Parameters
    ----------
    num_colors : int
        Number of colours to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_colors, 3)`` holding RGB triples produced by
        ``colorsys.hls_to_rgb``; shape ``(0, 3)`` when ``num_colors <= 0``.
    """
    count = int(num_colors)
    if count <= 0:
        return np.empty((0, 3))

    palette = []
    # an integer range guarantees exactly `count` hues; a float-step
    # np.arange(0, 360, 360/n) may emit one extra element due to rounding
    for step in range(count):
        hue = step / count
        lightness = 0.85
        saturation = (90 + np.random.rand() * 10) / 100.
        palette.append(colorsys.hls_to_rgb(hue, lightness, saturation))
    return np.array(palette)


def cluster_risk(risks, max_cluster, random_state=None):
    """Cluster one-dimensional values with K-Means and colour every sample.

    K-Means is fitted for every K from 2 up to ``max_cluster`` (capped at the
    number of samples), the K chosen by ``_get_best_k`` from the inertias is
    kept, and each sample receives the colour and the centre of its cluster.

    Parameters
    ----------
    risks : array-like
        Values to cluster; flattened into a single feature column.
    max_cluster : int
        Largest number of clusters to try.
    random_state : int or None, optional
        Forwarded to ``KMeans`` so a run can be made repeatable. ``None``
        (default) keeps the unseeded behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(colors, centers)``: one row of ``choose_colors`` output per sample
        and the matching row of ``cluster_centers_`` per sample.

    Raises
    ------
    ValueError
        If fewer than two clusters can be formed.
    """
    risks = np.array(risks).reshape(-1, 1)

    # dissimilarity would not be defined for a single cluster, thus, minimum
    # number of clusters should be 2; KMeans also rejects more clusters than
    # samples, so the upper end is capped
    largestK = min(int(max_cluster), len(risks))
    if largestK < 2:
        raise ValueError(
            "cluster_risk needs max_cluster >= 2 and at least two samples "
            f"(got max_cluster={max_cluster}, samples={len(risks)})")

    Ks = list(range(2, largestK + 1))
    Costs = []
    KMeansModels = {}
    for k in Ks:
        # n_init is pinned because its default differs between scikit-learn
        # releases; 10 restarts is the long-standing default
        classifier = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        classifier.fit(risks)
        Costs.append(classifier.inertia_)
        KMeansModels[k] = classifier

    k_optimum = _get_best_k(Ks, Costs)
    bestModel = KMeansModels[k_optimum]
    palette = choose_colors(k_optimum)
    labels = bestModel.predict(risks)

    return palette[labels], bestModel.cluster_centers_[labels]


def normalizeColumns(col):
    """Log-compress a column and squeeze it into the range [0.3, 0.9].

    The values go through ``log2(x + 1)``, then ``StandardScaler``, then
    ``MinMaxScaler(feature_range=(0.3, 0.9))``.

    Returns a two-dimensional array with a single column (one row per input
    value).
    """
    logColumn = np.log2(np.array(col) + 1).reshape(-1, 1)
    standardized = preprocessing.StandardScaler().fit_transform(logColumn)
    rescaler = preprocessing.MinMaxScaler(feature_range=(0.3, 0.9))
    return rescaler.fit_transform(standardized)


def rankClusters(arr):
    """Replace every value by its rank among the distinct values of ``arr``.

    The smallest distinct value gets rank 0, the next one rank 1, and so on
    (``np.unique`` returns the distinct values in ascending order). Used to
    assign a priority to cluster means of active cases.

    Returns a NumPy array with one rank per element of ``arr``.
    """
    rankOfCenter = {
        center: rank
        for rank, center in enumerate(np.unique(arr).tolist())
    }
    ranked = []
    for value in arr:
        ranked.append(rankOfCenter[value])
    return np.array(ranked)

def calcrisk(pop, active):
    """Risk score: active cases multiplied by ``log10(pop + 1)``."""
    populationWeight = np.log10(pop + 1)
    return active * populationWeight

def assignPriority(ranks, populations, actives):
    """Refine integer cluster ranks with a fractional, risk-based offset.

    The risk of every row is ``calcrisk(population, active)``. Within each
    rank the risks are min-max scaled to ``[0.0, 0.5]`` and added to the rank,
    so rows keep the ordering of their rank and are ordered by risk inside it.

    Parameters
    ----------
    ranks : sequence of int
        Cluster rank of every row.
    populations : sequence of numbers
        Population of every row.
    actives : sequence of numbers
        Active case count of every row.

    Returns
    -------
    list of float
        ``rank + scaled risk`` for every row, in input order.
    """
    allRows = pd.DataFrame({
        'rank': list(ranks),
        'risk': calcrisk(np.array(populations), np.array(actives)),
    })

    def _scaleWithinRank(riskOfRank):
        # one fit + one vectorised transform per rank instead of one
        # transform call per row
        scaler = preprocessing.MinMaxScaler(feature_range=(0.0, 0.5))
        return scaler.fit_transform(riskOfRank.to_numpy().reshape(-1, 1))[:, 0]

    # label-based access throughout: the row-wise apply indexed each labelled
    # row by position, which pandas 2.x deprecates
    offsets = allRows.groupby('rank')['risk'].transform(_scaleWithinRank)

    return (allRows['rank'] + offsets).tolist()


def getDistrictPopulationDf(censusFile):
    """Read the census CSV and lower-case its district and state names.

    ``censusFile`` is handed to ``pd.read_csv`` unchanged; the file must
    provide the 'District name' and 'State name' columns.
    """
    census = pd.read_csv(censusFile)
    for nameColumn in ('District name', 'State name'):
        census[nameColumn] = census[nameColumn].str.lower()
    return census


def fillpopulationData(coronaDf, censusDf):
    """Look up a population for every row of ``coronaDf``.

    Rows are matched to the census on ``(statecode, Constituency)`` against
    ``('State name', 'District name')``; the first census row wins when a pair
    occurs more than once. Rows without a match receive the census total of
    their state divided (integer division) by the number of unmatched rows of
    that state. Values that are still zero afterwards (e.g. the state is not
    in the census) are replaced by the truncated mean of all positive values.

    NOTE(review): an earlier comment spoke of the "remaining" population of
    the state, but the populations of matched districts are not subtracted
    from the state total before it is shared out - confirm the intent.

    Parameters
    ----------
    coronaDf : pandas.DataFrame
        Needs the 'statecode' and 'Constituency' columns.
    censusDf : pandas.DataFrame
        Needs the 'State name', 'District name' and 'Population' columns.

    Returns
    -------
    list
        One population per row of ``coronaDf``, in row order; an empty list
        for an empty ``coronaDf``.
    """
    if len(coronaDf) == 0:
        return []

    # (state, district) -> population, built once instead of filtering the
    # whole census frame for every row; setdefault keeps the first match
    keyedCensus = censusDf.dropna(subset=['State name', 'District name'])
    populationOf = {}
    for state, district, population in zip(keyedCensus['State name'],
                                           keyedCensus['District name'],
                                           keyedCensus['Population']):
        populationOf.setdefault((state, district), population)

    # get population (NaN where the census has no such district)
    states = coronaDf['statecode'].tolist()
    districts = coronaDf['Constituency'].tolist()
    populationWithNA = pd.Series(
        [populationOf.get(key, np.nan) for key in zip(states, districts)],
        index=coronaDf.index)
    isMissing = populationWithNA.isna()

    # share of the state total handed to every unmatched row of that state
    populationOfState = censusDf.groupby('State name')['Population'].sum().to_dict()
    nullCountOfState = coronaDf.loc[isMissing.to_numpy(), 'statecode'].value_counts().to_dict()
    meanvalToFillForState = {
        statename: populationOfState.get(statename, 0) // nullCount
        for statename, nullCount in nullCountOfState.items() if nullCount}

    populationwithZeroVal = pd.Series([
        meanvalToFillForState.get(state, 0) if missing else population
        for population, missing, state in zip(populationWithNA, isMissing, states)])

    # zeros fall back to the mean of the positive populations; with no positive
    # value there is nothing to fall back to, so the zeros are kept
    positive = populationwithZeroVal[populationwithZeroVal > 0]
    if positive.empty:
        return populationwithZeroVal.tolist()
    return populationwithZeroVal.replace(0, int(positive.mean())).tolist()


def dataProcessing(intervalInMinutes, database, max_cluster, 
                   API, censusFile, normalize_columns=()):
    """Poll the feed forever and publish the processed frame after every pass.

    Every pass downloads the data (``parseDF``), clusters the active case
    counts (``cluster_risk``), adds normalized columns, population, priority
    and colour columns, replaces table ``main`` of the SQLite file
    ``database``, stores a copy in the module-level ``globalDfhandler`` and
    tries to write ``debug.xlsx``. The function never returns; it ends only
    through an exception such as ``KeyboardInterrupt``.

    Parameters
    ----------
    intervalInMinutes : float
        Pause between two passes, in minutes.
    database : str
        Path of the SQLite file; its directory is created when missing.
    max_cluster : int
        Largest cluster count handed to ``cluster_risk``.
    API : str
        Address handed to ``parseDF``.
    censusFile : str
        Census CSV handed to ``getDistrictPopulationDf``.
    normalize_columns : iterable of str, optional
        Columns for which a ``'<col>-normalized'`` column is added.
    """
    global globalDfhandler

    # the census file is static input, so it is read once instead of per pass
    censusDf = getDistrictPopulationDf(censusFile)

    Pass = 0
    while True:
        # parse df
        coronaDf = parseDF(url=API, json_column='districtData',
                           drop_columns=['notes', 'delta.confirmed', 'delta.deceased', 'delta.recovered'])

        # clustering
        activeCases = coronaDf.active.to_numpy(dtype='float64')
        cololabels, centers = cluster_risk(activeCases, max_cluster=max_cluster)

        # add more columns
        for col in normalize_columns:
            coronaDf[f'{col}-normalized'] = normalizeColumns(coronaDf[col])

        # handle population data
        coronaDf['Population'] = fillpopulationData(coronaDf, censusDf)

        # assign priority
        ranks = rankClusters(centers.reshape(-1))
        coronaDf['priority'] = ranks
        coronaDf['tuned-priority'] = assignPriority(
            coronaDf['priority'].tolist(), coronaDf['Population'].tolist(), coronaDf['active'].tolist())

        # cluster-color: up to 0.7 darkness from the cluster priority plus up to
        # 0.3 * fractional part of the tuned priority
        darknessForCluster = preprocessing.MinMaxScaler(feature_range=(0, 0.7)
            ).fit_transform(np.array(coronaDf['priority']).reshape(-1, 1))[:, 0]
        darknessForSubcluster = (coronaDf['tuned-priority'].to_numpy() % 1) * 0.3
        # NOTE(review): choose_colors returns RGB rows, so column 1 is the green
        # channel; if the goal was to lower HLS lightness this needs a second look
        cololabels[:, 1] = 1 - darknessForCluster - darknessForSubcluster
        coronaDf['color'] = np.apply_along_axis(colors.to_hex, 1, cololabels)

        # a bare file name has an empty dirname, and os.makedirs('') raises
        databaseDir = os.path.dirname(database)
        if databaseDir and not os.path.isdir(databaseDir):
            os.makedirs(databaseDir)

        # close the connection even when the write fails
        cnx = sqlite3.connect(database)
        try:
            coronaDf.to_sql(name='main', con=cnx, if_exists='replace')
        finally:
            cnx.close()
        globalDfhandler = coronaDf.copy()

        # the Excel dump is best effort, but only ordinary errors are ignored:
        # a bare except would also swallow KeyboardInterrupt / SystemExit
        try:
            coronaDf.to_excel('debug.xlsx')
        except Exception as exc:
            print(f'debug.xlsx not written: {exc!r}')

        print(f'{datetime.datetime.now()}: {Pass+1}th updation done.')
        Pass = Pass + 1
        time.sleep(intervalInMinutes * 60)
    
#     plt.ylim(-10, 10)
#     for casecount, label in zip(activeCases, labels):
#         plt.scatter(casecount, 0.0, color=label)
#     plt.show()

In [3]:
if __name__ == '__main__':
    # Starts the polling loop with a 0.2 minute pause between passes; the call
    # does not return and is stopped by interrupting the kernel.
    dataProcessing(
        intervalInMinutes=0.2,
        database='../Database/corona.db',
        max_cluster=10,
        API='https://api.covid19india.org/state_district_wise.json',
        censusFile='../Census Data/census-2011.csv',
        normalize_columns=['active', 'deceased', 'recovered', 'confirmed'],
    )

2020-10-03 16:02:29.481828: 1th updation done.
2020-10-03 16:02:44.784638: 2th updation done.
2020-10-03 16:02:59.968238: 3th updation done.


KeyboardInterrupt: 

In [None]:
if __name__ == '__main__':
    # Plots the 'active-normalized' column of the frame that dataProcessing
    # publishes through the module-level `globalDfhandler`.
    # Hidden-state dependency: that global is only assigned at the end of a
    # completed dataProcessing pass, and the column only exists when 'active'
    # was listed in normalize_columns - on a kernel where neither happened this
    # cell fails.
#     globalDfhandler.plot(x=['tuned-priority'], y=['tuned-priority'], color=globalDfhandler.color, kind='scatter')
    globalDfhandler.plot(y=['active-normalized'])