# India Rainfall Analysis

This dataset contains about 100 years of rainfall data: monthly rainfall in mm for all the states and union territories of India.

We could build a time-series prediction model, but given the small number of data points and the variability in the data, its accuracy would not be good enough.

Instead, I have worked out some simple analyses that dig out useful information about the patterns of rainfall in India.

In [1]:
import pandas as pd
# Load the rainfall CSV into a DataFrame; every later cell reads from `df`.
df=pd.read_csv("../input/rainfall-data-from-1901-to-2017-for-india/Rainfall_Data_LL.csv")

In [1]:
# Preview the first rows of the dataset to check the available columns.
df.head()

## Heatmap plot 
Locations closer to the Himalayas receive more rainfall than the more distant states.

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Outline map of India, drawn behind the scatter plots via plt.imshow(img, ...).
img=mpimg.imread('../input/india-background-map/india-outline-map_2.jpeg')

def vis(year, month, df):
    """Scatter one month's rainfall for a single year over the India map.

    Parameters
    ----------
    year : value matched against the ``YEAR`` column.
    month : name of the column used to colour the points (e.g. ``"JAN"``).
    df : DataFrame with ``YEAR``, ``Longitude``, ``Latitude`` and the
        requested month column.
    """
    rows_for_year = df[df["YEAR"] == year]
    ax = rows_for_year.plot(
        kind='scatter',
        x='Longitude',
        y='Latitude',
        c=month,
        cmap=plt.get_cmap('hot'),
        colorbar=True,
        s=90,
        alpha=1,
        label='rainfall',
        figsize=(10, 7),
    )
    # The extent pins the image to the longitude/latitude range of the axes,
    # so the points land on the matching places of the outline map.
    ax.imshow(img, extent=[67.4, 98, 6, 38], alpha=1)
    ax.set_ylabel("Latitude", fontsize=14)
    ax.set_xlabel("Longitude", fontsize=14)
    ax.legend()
    plt.show()


vis(1901, "JAN", df)

## Decrease or Increase in Rainfall over the years

The slope of the best-fit line gives the rate of change in annual rainfall (mm/year): a negative slope means rainfall is decreasing, a positive slope means it is increasing.

In [1]:
import seaborn as sns
def trend(df, subdivision):
    """Draw annual rainfall against year for one subdivision with a fitted line.

    Parameters
    ----------
    df : DataFrame with ``SUBDIVISION``, ``YEAR`` and ``ANNUAL`` columns.
    subdivision : value matched against the ``SUBDIVISION`` column.
    """
    # Note: this changes seaborn's global figure size for later plots as well.
    sns.set(rc={'figure.figsize': (11.7, 8.27)})
    yearly_totals = df.loc[df["SUBDIVISION"] == subdivision, ["YEAR", "ANNUAL"]]
    sns.regplot(data=yearly_totals, x="YEAR", y="ANNUAL")


trend(df, "Arunachal Pradesh")
    

## Rate for all Subdivisions


In [1]:
from scipy.optimize import curve_fit
def objective(x, a, b):
    """Straight-line model ``a * x + b`` used for the least-squares fit."""
    return a * x + b


def decrease(df, subdivision):
    """Return the fitted change in annual rainfall per year for a subdivision.

    A straight line is fitted to ``ANNUAL`` against the actual ``YEAR`` values
    (not the row position), so the slope stays in mm/year even when some years
    are missing or the rows are not consecutive.

    Parameters
    ----------
    df : DataFrame with ``SUBDIVISION``, ``YEAR`` and ``ANNUAL`` columns.
    subdivision : value matched against the ``SUBDIVISION`` column.

    Returns
    -------
    float
        Slope of the best-fit line; negative means rainfall is decreasing.
        ``nan`` when fewer than two usable (non-missing) rows exist, because
        a line cannot be fitted in that case.
    """
    # Rows with a missing year or total would make curve_fit raise.
    d = df.loc[df["SUBDIVISION"] == subdivision, ["YEAR", "ANNUAL"]].dropna()
    if len(d) < 2:
        return float("nan")

    years = d["YEAR"].to_numpy(dtype=float)
    totals = d["ANNUAL"].to_numpy(dtype=float)
    # Centring the years leaves the slope unchanged but keeps the fit
    # well-conditioned (raw years are ~2000 while the intercept column is 1).
    popt, _ = curve_fit(objective, years - years.mean(), totals)
    slope, _intercept = popt
    return float(slope)
    
def all_states(df):
    """Build a table of fitted rainfall slopes, one row per subdivision.

    Returns a DataFrame with columns ``SUBDIVISION`` and
    ``increase in rainfall (mm/year)``, in order of first appearance in ``df``.
    """
    rows = [[name, decrease(df, name)] for name in df.SUBDIVISION.unique()]
    return pd.DataFrame(rows, columns=["SUBDIVISION", "increase in rainfall (mm/year)"])

# Per-subdivision slope table; reused by the map and SVM cells further down.
Change_rainfall=all_states(df)
def highlight_greaterthan(s, column):
    """Return one CSS background style per cell of the row ``s``.

    The whole row is shaded green when any value selected by ``column`` is
    greater than zero, and red otherwise. Intended for ``Styler.apply`` with
    ``axis=1``.
    """
    green = 'background-color: #e6ffe6'
    red = 'background-color: #ffe6e6'
    # Wrapping in a Series lets the same check work for a single label
    # (scalar comparison) and for a list of labels (Series comparison).
    has_positive = pd.Series(s.loc[column] > 0).any()
    row_style = green if has_positive else red
    return [row_style] * len(s.index)
# Render the table with each row shaded by highlight_greaterthan (applied row-wise via axis=1).
display(Change_rainfall.style.apply(highlight_greaterthan, column=['increase in rainfall (mm/year)'], axis=1))

## plotting Increase/Decrease for all Subdivisions

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
def vis_change(df,Change_rainfall):
    """Plot every subdivision on the India map, coloured by rainfall trend.

    Subdivisions with a negative fitted slope are drawn in ``C0`` and all
    others in ``C1``. Each group is plotted with its own label so that the
    legend explains the colours.

    Parameters
    ----------
    df : DataFrame with ``SUBDIVISION``, ``Latitude`` and ``Longitude`` columns.
    Change_rainfall : DataFrame with ``SUBDIVISION`` and
        ``increase in rainfall (mm/year)`` columns.
    """
    rate_col = "increase in rainfall (mm/year)"

    # Reduce df to one coordinate row per subdivision before merging, so the
    # join does not build one row per (subdivision, year) only to drop them.
    coords = df[["SUBDIVISION", "Latitude", "Longitude"]].drop_duplicates(
        subset="SUBDIVISION", keep="last"
    )
    d = (
        Change_rainfall.merge(coords, on="SUBDIVISION", how="inner")
        .drop_duplicates(subset="SUBDIVISION", keep="last")
        .reset_index(drop=True)
    )

    decreasing = d[rate_col] < 0
    groups = [
        (decreasing, 'C0', 'decrease in rainfall (slope < 0)'),
        (~decreasing, 'C1', 'increase in rainfall (slope >= 0)'),
    ]

    fig, ax = plt.subplots(figsize=(10, 7))
    for mask, colour, label in groups:
        subset = d[mask]
        ax.scatter(subset["Longitude"], subset["Latitude"], c=colour, s=90, alpha=1, label=label)

    ax.imshow(img, extent=[67.4, 98, 6, 38], alpha=1)
    ax.set_ylabel("Latitude", fontsize=14)
    ax.set_xlabel("Longitude", fontsize=14)
    ax.legend()
    plt.show()

vis_change(df,Change_rainfall)

**Blue points are subdivisions with a negative slope (decrease in rainfall); orange points are subdivisions with a non-negative slope (increase in rainfall).**

## Using SVM to draw a decision boundary between the 2 classes

Fitting a polynomial decision boundary between states that have a positive slope and negative slope.

In [1]:
from sklearn.svm import SVC
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# One row per subdivision: its fitted rainfall slope plus its coordinates.
d = (
    Change_rainfall
    .merge(df[["SUBDIVISION", "Latitude", "Longitude"]], on="SUBDIVISION", how="inner")
    .drop_duplicates(subset="SUBDIVISION", keep="last")
    .reset_index(drop=True)
)
# Binary target: 0 where the slope is negative, 1 everywhere else.
d["class"] = (~(d["increase in rainfall (mm/year)"] < 0)).astype("int64")
# The two plotted features are the coordinates, longitude first.
X = d[["Longitude", "Latitude"]].to_numpy()
y = d["class"].to_numpy()

def make_meshgrid(x, y, h=.02):
    """Return a coordinate grid covering the fixed map extent.

    ``x`` and ``y`` are accepted but not used: the grid always spans
    longitude 67.4 to 98 and latitude 6 to 38, sampled every ``h`` units.

    Returns
    -------
    (xx, yy) : pair of 2-D arrays as produced by ``np.meshgrid``.
    """
    lon_axis = np.arange(67.4, 98, h)
    lat_axis = np.arange(6, 38, h)
    lon_grid, lat_grid = np.meshgrid(lon_axis, lat_axis)
    return lon_grid, lat_grid

def plot_contours(ax, clf, xx, yy, **params):
    """Draw the classifier's predictions over a grid as filled contours.

    Parameters
    ----------
    ax : object providing ``contourf`` (the axes to draw on).
    clf : object providing ``predict`` for an ``(n_points, 2)`` array.
    xx, yy : 2-D grid arrays of matching shape.
    **params : forwarded unchanged to ``ax.contourf``.

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # Flatten the grid into one (x, y) row per point for prediction.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predicted = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predicted, **params)

# Polynomial-kernel SVM separating subdivisions by the sign of their slope.
model = svm.SVC(kernel='poly')
clf = model.fit(X, y)

fig, ax = plt.subplots()
# The kernel is polynomial, so the title says so rather than "linear".
title = 'Decision surface of polynomial-kernel SVC'
# Set up the grid for plotting (it covers the fixed map extent).
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
ax.imshow(img, extent=[67.4, 98, 6, 38], alpha=1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
points = ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
# X holds (Longitude, Latitude), so label the axes accordingly.
ax.set_ylabel('Latitude')
ax.set_xlabel('Longitude')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
# Build the legend from the scatter's colour classes; a bare ax.legend()
# finds no labelled artists and draws nothing.
ax.legend(*points.legend_elements(), title='class (1 = increase, 0 = decrease)')
plt.show()

## Clustering to find monsoon months
We use k-means clustering to split the months of a year into 3 clusters. The cluster that contains the month with the highest rainfall represents the monsoon season.

In [1]:
from sklearn.cluster import KMeans
# Month columns are identified by their three-character names (e.g. "JAN").
# The former `df.loc[:, ~df.columns.str.contains("-")]` statement was removed:
# its result was never assigned or displayed, so it had no effect.
months=[x for x in df.columns if len(x)==3]

def class_names(d):
    """Replace numeric cluster ids in ``d["class"]`` with season names.

    The cluster containing the month with the highest rainfall becomes
    ``"monsoon"``. Every other month becomes ``"pre monsoon"`` or
    ``"post monsoon"`` depending on whether it falls before or after that
    peak month.

    Parameters
    ----------
    d : DataFrame with ``rainfall``, ``month`` and ``class`` columns.

    Returns
    -------
    The same DataFrame ``d``, with its ``class`` column replaced in place.
    """
    peak_row = d["rainfall"].idxmax()
    peak_cluster = d.at[peak_row, "class"]
    peak_month = d.at[peak_row, "month"]

    in_monsoon = d["class"] == peak_cluster
    # Build the names in a separate object-typed Series. Writing strings
    # straight into the integer cluster column relies on silent upcasting,
    # which pandas has deprecated.
    labels = d["class"].astype(object)
    labels[in_monsoon] = "monsoon"
    labels[~in_monsoon & (d["month"] < peak_month)] = "pre monsoon"
    labels[~in_monsoon & (d["month"] > peak_month)] = "post monsoon"
    d["class"] = labels
    return d
    
def month_trend(year,df,subdivision):
    """Cluster one year's monthly rainfall into three groups and plot them.

    The monthly values of the selected row are clustered with k-means on
    (rainfall, month number). ``class_names`` then renames the clusters and
    the result is drawn as a scatter plot coloured by class.

    Parameters
    ----------
    year : value matched against the ``YEAR`` column.
    df : DataFrame with ``YEAR``, ``SUBDIVISION`` and the month columns
        (columns whose name is three characters long).
    subdivision : value matched against the ``SUBDIVISION`` column.

    Raises
    ------
    ValueError
        If the year/subdivision pair does not select exactly one row.
    """
    months=[x for x in df.columns if len(x)==3]
    selected=df[(df["YEAR"]==year) & (df["SUBDIVISION"]==subdivision)]
    if len(selected)!=1:
        # Transposing anything but a single row cannot yield one "rainfall"
        # column; fail with a clear message instead of a length mismatch.
        raise ValueError(
            "expected exactly one row for subdivision {!r} in year {!r}, found {}".format(
                subdivision, year, len(selected)
            )
        )
    d=selected[months].T
    d.columns=["rainfall"]
    d["month"]=list(range(1,len(months)+1))
    # Fit and label on the same plain array: fitting on an ndarray and then
    # predicting on the DataFrame triggers sklearn's feature-name warning.
    # A fixed seed and explicit n_init make the clusters reproducible.
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
    d["class"]=kmeans.fit_predict(d[["rainfall","month"]].to_numpy())
    d=class_names(d)
    sns.scatterplot(data=d, x="month", y="rainfall",linewidth = 3,color='r',s=100,alpha=1,hue="class")

month_trend(2003,df,"Kerala")