In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> In this notebook we will see an exploratory data analysis of the rainfall in Pakistan over roughly the last 115 years (1901–2016).

**Importing the libraries to be used in the EDA processing**

In [None]:
# linear algebra
import numpy as np 
import math

# data processing
import pandas as pd

# data visualization(for EDA)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet, then seaborn's global settings.
# NOTE(review): sns.set() presumably overrides parts of the ggplot style — confirm which look is intended.
plt.style.use('ggplot')
sns.set(color_codes=True)
import plotly.express as px
import cufflinks as cf
# NOTE(review): go_offline() followed by offline=False looks contradictory — confirm the intended cufflinks mode.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

import calendar 

**Loading the rainfall dataset**

In [None]:
# Read the rainfall CSV into a DataFrame and preview its first rows.
rainfall_csv_path = '../input/rainfall-in-pakistan/Rainfall_1901_2016_PAK.csv'
df = pd.read_csv(rainfall_csv_path)
df.head()


1. creating a Date column using the year and months column provided and setting it as the index column of our dataframe
2. re-naming the Rainfall - (MM) column and Year column

In [None]:
# Normalise the column names first (the raw header has a leading space in
# ' Year'). rename() ignores keys that are absent, so re-running this cell after
# the columns were already renamed no longer fails with a KeyError.
df = df.rename(columns={'Rainfall - (MM)': 'Rainfall', ' Year': 'Year'})

# Build a monthly period from the concatenated year and month name
# (e.g. "1901" + "January" parsed with '%Y%B') and use it as the index.
# 'M' is the documented upper-case alias for a monthly period.
df['Date'] = pd.to_datetime(df['Year'].astype(str) + df['Month'], format='%Y%B').dt.to_period('M')
df = df.set_index('Date')
df.head()

**printing the information of our dataframe**

In [None]:
# Call info() so the column / dtype / non-null summary is printed; the bare
# attribute `df.info` only displays the bound method's repr.
df.info()

**checking null values in the dataframe**

In [None]:
# Number of missing values in each column.
df.isna().sum()

There are no null values in the dataset, so there is no need for data cleaning or for handling missing values.

# **Exploratory Data Analysis**

**Year wise distribution**

In [None]:
# Mean of the monthly rainfall values for each year. 'Rainfall' is selected
# before aggregating so the text 'Month' column never reaches mean(); averaging
# the whole frame raises a TypeError on pandas >= 2.0.
yearly_mean = df.groupby('Year')['Rainfall'].mean().reset_index()

fig = px.line(yearly_mean, x='Year', y='Rainfall', title='Annual rainfall in Pakistan from 1901 to 2016')
fig.update_traces(mode='lines+markers', line=dict(color='Purple'))
fig.update(layout=dict(title=dict(x=0.5)))  # centre the title horizontally
fig.show()

From the above line graph, we can see that the highest average rainfall in Pakistan was recorded in the year 1944.

In [None]:
# 10-year rolling average of the yearly mean rainfall. 'Rainfall' is selected
# before aggregating so the text 'Month' column never reaches mean() (TypeError
# on pandas >= 2.0). The first nine years have no full window and are NaN.
rolling_mean = (
    df.groupby('Year')['Rainfall']
    .mean()
    .rolling(10)
    .mean()
    .reset_index()
)

fig = px.line(rolling_mean, x='Year', y='Rainfall', title='Rolling average of 10 years of Rainfall')
fig.update_traces(mode='lines+markers', line=dict(color='Grey'))
fig.update(layout=dict(title=dict(x=0.5)))  # centre the title horizontally
fig.show()

To analyse the long-term trend in rainfall, I took a 10-year rolling average.

**Month wise box-plot distribution**

In [None]:
# One wide panel with a box per month showing the spread of rainfall values.
f, ax = plt.subplots(1, 1, figsize=(30, 10))
title = ax.set_title('Rainfall by Month', fontsize=20)
title.set_position([0.5, 1.05])  # lift the title slightly above the axes

# x / y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
g = sns.boxplot(data=df, x="Month", y="Rainfall", ax=ax)

From the figure we can see that the majority of the rainfall is received in the months of July and August which is the monsoon season while in October and November the least Rainfall is recorded.

**Per-year monthly Rainfall statistics and heatmap**

In [None]:
# Month name -> month number (January=1 ... December=12). calendar.month_name[0]
# is an empty placeholder, hence the `if name` filter. The mapping is built once
# instead of once per row.
month_to_num = {name: num for num, name in enumerate(calendar.month_name) if name}

month_num = df['Month'].map(month_to_num)
unknown_months = df.loc[month_num.isna(), 'Month'].unique()
if len(unknown_months):
    # Keep failing loudly on an unexpected month name, as the per-row dict lookup did.
    raise KeyError(f"Unrecognised month name(s): {list(unknown_months)}")
df['Month_Num'] = month_num.astype(int)

# Years as rows, month numbers as columns. pivot() takes keyword arguments only
# from pandas 2.0 onwards.
data_pivot = df.pivot(index="Year", columns="Month_Num", values="Rainfall")
data_pivot

In [None]:
# Heat map of data_pivot, with the x tick labels replaced by month abbreviations.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 10))

title = ax.set_title('Rainfall Heat Map', fontsize=20)
title.set_position([0.5, 1.05])  # lift the title slightly above the axes

chart = sns.heatmap(data_pivot, ax=ax)

month_labels = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
chart.set_xticklabels(month_labels, rotation=45, horizontalalignment='right')

**Per-month yearly Rainfall statistics and heatmap**

In [None]:
# Month name -> month number (January=1 ... December=12). calendar.month_name[0]
# is an empty placeholder, hence the `if name` filter. The mapping is built once
# instead of once per row.
month_to_num = {name: num for num, name in enumerate(calendar.month_name) if name}

month_num = df['Month'].map(month_to_num)
unknown_months = df.loc[month_num.isna(), 'Month'].unique()
if len(unknown_months):
    # Keep failing loudly on an unexpected month name, as the per-row dict lookup did.
    raise KeyError(f"Unrecognised month name(s): {list(unknown_months)}")
df['Month_Num'] = month_num.astype(int)

# Month numbers as rows, years as columns. pivot() takes keyword arguments only
# from pandas 2.0 onwards.
data_pivot_2 = df.pivot(index="Month_Num", columns="Year", values="Rainfall")
data_pivot_2

In [None]:
# Heat map of data_pivot_2, with the y tick labels replaced by month abbreviations.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(30, 10))

title = ax.set_title('Rainfall Heat Map 2', fontsize=20)
title.set_position([0.5, 1.05])  # lift the title slightly above the axes

chart = sns.heatmap(data_pivot_2, ax=ax)

month_labels = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Assigned to a name so the returned list of Text objects is not echoed as cell output.
l = chart.set_yticklabels(month_labels, rotation=45, horizontalalignment='right')

From the above heatmap we can see that, in all years, the largest amount of rain was recorded in the months of July and August.

**Monthly Rainfall Distribution using Histograms**

In [None]:
# Rainfall histograms for every month, two months per figure (six figures in
# total). A loop replaces twelve copy-pasted stanzas; figure size, bin width and
# titles are unchanged. Requires the 'Month_Num' column (1-12) on df.
for first_month in range(1, 13, 2):
    f, ax = plt.subplots(1, 2, figsize=(30, 5))
    if first_month == 1:
        # Only the first figure carries the overall heading.
        f.suptitle('Distribution by Months', fontsize=14)

    for panel, month_num in zip(ax, (first_month, first_month + 1)):
        panel.set_title(calendar.month_name[month_num], fontsize=12)
        df_month = df[df["Month_Num"] == month_num]
        g = sns.histplot(data=df_month, x="Rainfall", binwidth=10, ax=panel)

**Season Wise Rainfall Distribution**

Pakistan has four seasons: winter from December through February; spring from March through May; the summer rainy season, or southwest monsoon period, from June through September; and the Autumn period is in October and November

In [None]:
# Season -> months: winter December-February, spring March-May, summer monsoon
# June-September, autumn October-November. May previously sat in the summer
# group, which contradicted this definition.
season_months = {
    'Winter': ['December', 'January', 'February'],
    'Spring': ['March', 'April', 'May'],
    'Summer': ['June', 'July', 'August', 'September'],
    'Autumn': ['October', 'November'],
}

# Mean rainfall per year for each season. 'Rainfall' is selected before
# aggregating so the text 'Month' column never reaches mean() (TypeError on
# pandas >= 2.0).
# NOTE: seasons are averaged within a calendar year, so 'Winter' combines
# January-February with the December of that same year.
seasonal_means = {
    season: df[df['Month'].isin(months)].groupby('Year')['Rainfall'].mean()
    for season, months in season_months.items()
}

# Keep the per-season names defined for any later cell that reads them.
winter = seasonal_means['Winter']
spring = seasonal_means['Spring']
summer = seasonal_means['Summer']
Autumn = seasonal_means['Autumn']

data = pd.DataFrame(seasonal_means)
data.plot(figsize=(17,8));
plt.title('Seasonal Rainfall in Pakistan from 1901 to 2016',fontsize=20);

 From the graph, it is clear that Pakistan received more rainfall in the summer and spring seasons.

In [None]:
# One bar per season: the mean of that season's yearly values, coloured by season.
season_names = data.columns
season_average = data.mean()

fig = px.bar(
    x=season_names,
    y=season_average,
    color=season_names,
    title='Season wise Rainfall in Pakistan',
)
fig.update(layout=dict(title=dict(x=0.5)))  # centre the title horizontally
fig.show()

> Here we can see that Pakistan receives the largest amount of rain in the summer season and the least in the autumn season.

In [None]:
# Total rainfall per calendar year, grouped on the year of the Date index.
# The groupby's own sum() is used: passing the builtin `sum` to agg() emits a
# FutureWarning on pandas >= 2.1.
yearly_total = df.groupby(df.index.year)['Rainfall'].sum()

print('The largest amount of rain was recorded in the following years')
yearly_total.nlargest(5)