In [None]:
import numpy as np 
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# **Basic data handling and overview**

In [None]:
# Read the district-level Maharashtra COVID-19 table and preview its first rows.
csv_path = '../input/latest-covid19-cases-maharashtra-india/Maharashtra Latest Covid Cases.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Turn the raw headers into snake_case identifiers, one step at a time:
# lower-case -> every non-letter becomes a space -> trim the ends -> spaces become underscores.
cleaned_names = data.columns.str.lower()
cleaned_names = cleaned_names.str.replace(r"[^a-zA-Z]", " ", regex=True)
cleaned_names = cleaned_names.str.strip()
data.columns = cleaned_names.str.replace(" ", "_")

# Promote the district name to the row index; `pop` removes the column from the
# frame and hands back the Series that becomes the new index.
data.index = data.pop("districts")

In [None]:
# Print a concise summary of the frame: column dtypes, non-null counts and memory usage.
data.info()

# **Check pairwise correlation of columns**

In [None]:
# Pairwise correlation between the columns of `data` (pandas' default method).
corr = data.corr()

# Take the tick labels from the correlation matrix's own axes so they always line
# up with the cells, and pin the colour range to [-1, 1] so that zero correlation
# sits at the midpoint of the diverging colour scale.
fig = go.Figure(data=go.Heatmap(z=corr.values, x=corr.columns, y=corr.index,
                                zmin=-1, zmax=1, colorscale="RdBu"))
fig.update_layout(title_text="Pairwise correlation of columns")
fig.show()

There is a high linear correlation between the following pairs:
- number of positive cases and number of deceased;
- number of positive cases and number of recovered;
- number of positive cases and number of active cases;
- number of recovered and number of deceased.


# **Plotting dependencies**

In [None]:
# Two stacked scatter plots against the number of positive cases:
# recovered counts in the top panel, deceased counts in the bottom panel.
fig = make_subplots(rows=2, cols=1)

panels = [(data.recovered, "Number of recovered"),
          (data.deceased, "Number of deceased")]
for row, (values, y_title) in enumerate(panels, start=1):
    # Each point is one row of `data`; the index value is attached as its hover text.
    fig.add_trace(go.Scatter(x=data.positive_cases, y=values,
                             mode="markers", text=data.index), row=row, col=1)
    fig.update_xaxes(title_text="Number of positive cases", row=row, col=1)
    fig.update_yaxes(title_text=y_title, row=row, col=1)

# The backslash is escaped explicitly: a bare "\d" is an invalid escape sequence
# that Python only tolerates with a warning. The displayed text is unchanged.
title = "Dependency between number of positive cases and number of recovered\\deceased"
fig.update_layout(showlegend=False, height=700, width=1000, title_text=title)
fig.show()

# **Number of positive cases / recovery rate / fatality rate plot**

In [None]:
# 3-D scatter with one point per row of `data`: positive cases against recovery
# rate and fatality rate. Points are coloured by fatality rate and labelled with
# the index value.
scatter_kwargs = dict(x='positive_cases', y='recovery_rate', z='fatality_rate',
                      color='fatality_rate', text=data.index)
fig = px.scatter_3d(data, **scatter_kwargs)

# Drop all outer margins so the 3-D scene fills the figure.
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig.show()

# **Clustering**

## Data preparations: scaling, dimensionality reduction

In [None]:
# Standardise every column of `data`.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Project the standardised data onto two principal components (used for plotting).
pca = PCA(n_components=2)
df_transformed = pca.fit_transform(data_scaled)

## Choose the number of clusters K for KMeans clustering

In [None]:
# Fit KMeans for every candidate cluster count and record two quality measures:
# the model's inertia (reported as SSE) and the silhouette score of its assignment.
sse_vals = []
silhouette_vals = []
n_clusters = np.arange(2, 10)
for k in n_clusters:
    # random_state pins the centroid initialisation so both curves are reproducible
    # on every run; n_init is set explicitly because the library default is not the
    # same in every scikit-learn release.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    sse_vals.append(kmeans.inertia_)
    silhouette_vals.append(silhouette_score(data_scaled, kmeans.predict(data_scaled)))

# The two metrics live on very different scales, so each gets its own y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=n_clusters, y=sse_vals, name="SSE"),
    secondary_y=False)

fig.add_trace(
    go.Scatter(x=n_clusters, y=silhouette_vals, name="Silhouette"),
    secondary_y=True)

fig.update_xaxes(title_text="Number of clusters")
fig.update_yaxes(title_text="SSE (inertia)", secondary_y=False)
fig.update_yaxes(title_text="Silhouette score", secondary_y=True)
fig.show()

## According to the SSE (inertia) curve and the silhouette score, the optimal number of clusters is 5.

In [None]:
# Final model with the chosen number of clusters. random_state makes the cluster
# labels (and therefore the colours below) identical on every run; n_init is set
# explicitly because the library default is not the same in every scikit-learn release.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(data_scaled)

# Positive-case counts are divided by this constant to get a usable marker size.
marker_scale = 8000

# Show the rows of `data` in the 2-D PCA projection: colour encodes the cluster
# label, marker size grows with the number of positive cases.
fig = go.Figure(data=go.Scatter(x=df_transformed[:, 0], y=df_transformed[:, 1],
                                mode='markers',
                                marker=dict(color=kmeans.labels_,
                                            size=data.positive_cases / marker_scale),
                                text=data.index))
fig.update_xaxes(title_text="new_feature_1")
fig.update_yaxes(title_text="new_feature_2")
fig.update_layout(height=600, width=900)
fig.show()