# Import Libraries / Load Data

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from mpl_toolkits.basemap import Basemap
warnings.filterwarnings("ignore")

In [None]:
import os

# Print the full path of every file found anywhere under /kaggle/input.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Path of the rainfall CSV under the /kaggle/input directory.
DATA_PATH = "/kaggle/input/rainfall-data-from-1901-to-2017-for-india/Rainfall_Data_LL.csv"
# Read the whole file into a DataFrame; the cells below all work from `data`.
data = pd.read_csv(DATA_PATH)

# Basic Analysis

In [None]:
# Report the (rows, columns) size of the loaded frame.
print(f"Data Shape: {data.shape}")

In [None]:
# Preview the first rows to see which columns exist and what the values look like.
data.head()

In [None]:
# Count how many rows each SUBDIVISION value contributes.
data["SUBDIVISION"].value_counts()

In [None]:
# List the column names present in the loaded data.
data.columns

In [None]:
# Tally rows where JAN + FEB equals the provided "Jan-Feb" column (True) versus
# rows where it does not (False). The comparison is exact, so any difference in
# the floating-point values counts as a mismatch.
(data["JAN"] + data["FEB"]).eq(data["Jan-Feb"]).value_counts()

Here I wanted to know whether the "Jan-Feb" column is exactly the sum of the January and February columns. Interestingly, some rows meet this requirement while others do not. Let's dig deeper and examine the rows that do not match.

In [None]:
# Show the rows where JAN + FEB is not exactly equal to the "Jan-Feb" column.
data.loc[(data["JAN"] + data["FEB"]).ne(data["Jan-Feb"])]

Looking at these rows in more detail, I saw that this column is indeed the sum of the two months; the mismatches come from small differences in the decimal places. Even so, I decided not to use these pre-aggregated features in my analysis. Instead, I will add new features myself.

## Average Annual Precipitation by Region

In [None]:
# Calendar-month columns in order; a later cell reuses this list.
months = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

# Mean monthly rainfall for each row (one subdivision in one year).
# `mean(axis=1)` averages over the months that are present; dividing the row
# sum by a fixed 12 would count every missing month as zero rainfall and pull
# the average down. For rows with all twelve months the result is the same.
# NOTE: this adds the "annualMean" column to `data`; the next plot relies on it.
data["annualMean"] = data[months].mean(axis=1)

# Average the per-row value over all years for each subdivision.
groupedAnnualMean = data.groupby("SUBDIVISION", as_index=False)["annualMean"].mean()

plt.figure(figsize = (15,15))
sns.barplot(x = "SUBDIVISION",
           y = "annualMean",
           data = groupedAnnualMean,
           palette = "viridis")
plt.xlabel("Regions")
plt.ylabel("Value")
plt.title("Annual Mean by Regions")
# Subdivision names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.show()

# The aggregated frame is only needed for this figure.
del groupedAnnualMean
# Trailing semicolon keeps the integer returned by gc.collect() out of the cell output.
gc.collect();

## Annual Average Change by Region

In [None]:
# One line per subdivision showing how "annualMean" (added to `data` in the
# previous section) changes from year to year.
fig, ax = plt.subplots(figsize=(30, 20))
sns.lineplot(
    data=data,
    x="YEAR",
    y="annualMean",
    hue="SUBDIVISION",
    palette="viridis",
    ax=ax,
)
ax.set_xlabel("Years")
ax.set_ylabel("AnnualMeans")
plt.show()

## Rainfall by Regions and Months

In [None]:
# Re-list the columns; "annualMean" has been added to `data` since the first listing.
data.columns

In [None]:
# Mean rainfall per subdivision for each calendar month, reshaped to long form
# (one row per subdivision/month pair) so seaborn can draw one line per subdivision.
groupedByMonths = (
    data[["SUBDIVISION"] + months]
    .groupby("SUBDIVISION")
    .mean()
    .reset_index()
    .melt(
        id_vars=["SUBDIVISION"],
        value_vars=months,
        var_name="Months",
        value_name="MeanRaining",
    )
)

fig, ax = plt.subplots(figsize=(30, 20))
sns.lineplot(
    data=groupedByMonths,
    x="Months",
    y="MeanRaining",
    hue="SUBDIVISION",
    palette="viridis",
    legend=False,  # no legend is drawn for the per-subdivision lines
    ax=ax,
)
ax.set_xlabel("Months")
ax.set_ylabel("MonthlyMeans")
plt.show()

As seen in the visualization, India generally starts to receive heavy rain in May; the rain decreases in October, and in some regions it stops altogether.

## Total Rain in 2017

In [None]:
# Restrict the map to one year so it shows what the section heading promises.
# Without this filter every year of every subdivision is drawn on top of each
# other and the markers have to be made almost fully transparent to be readable.
RAIN_YEAR = 2017
subData = data.loc[data["YEAR"] == RAIN_YEAR, ["ANNUAL", "Latitude", "Longitude"]]

# Figure size is specified in pixels (2600 x 1800) and converted to inches via the DPI.
my_dpi = 96
plt.figure(figsize=(2600/my_dpi, 1800/my_dpi), dpi=my_dpi)

# Map window: longitude 67-90, latitude 8-40.
m = Basemap(llcrnrlon=67, llcrnrlat=8, urcrnrlon=90, urcrnrlat=40, resolution = None)  # India
m.shadedrelief()

# Marker area and colour both follow the ANNUAL value. `c` must be supplied for
# `cmap` to have any effect; a sequential colormap suits a continuous quantity.
points = m.scatter(subData['Longitude'], subData['Latitude'],
                   s=subData['ANNUAL'], c=subData['ANNUAL'],
                   alpha=0.6, cmap="viridis")
plt.colorbar(points, label="ANNUAL")
plt.title(f"Total Rain in {RAIN_YEAR}")
plt.show()

If we examine this map, we can conclude that southwestern India receives more precipitation than other regions.

This notebook is a first, high-level look at rainfall in India. I will add more in-depth analysis and visualizations in the future.