In [94]:
import numpy as np 
import pandas as pd
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime 


In [95]:
# Load the raw temperature CSV into a DataFrame (no index column is designated here).
df = pd.read_csv(
    filepath_or_buffer="/content/Weather Data in India from 1901 to 2017.csv",
)

In [96]:
# A notebook cell only renders its *last* expression automatically, so a bare
# `df.head()` placed before `df.describe()` is computed and then thrown away.
# `display` (made available by the IPython kernel) renders the preview
# explicitly; the summary statistics remain the cell's final expression.
display(df.head())
df.describe()

Unnamed: 0.1,Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
count,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0
mean,58.0,1959.0,18.423248,20.137949,23.434188,26.514103,28.38641,28.30094,27.369231,26.940085,26.34265,24.742051,21.765726,19.173333
std,33.919021,33.919021,0.612963,0.900944,0.862825,0.75074,0.644678,0.460603,0.34592,0.348876,0.387789,0.563152,0.634183,0.635912
min,0.0,1901.0,17.25,17.79,21.78,24.84,26.97,27.33,26.48,26.21,25.47,23.52,20.59,17.98
25%,29.0,1930.0,18.07,19.47,22.84,26.0,27.95,28.02,27.15,26.73,26.11,24.39,21.32,18.78
50%,58.0,1959.0,18.35,19.99,23.33,26.54,28.34,28.25,27.34,26.9,26.31,24.67,21.72,19.18
75%,87.0,1988.0,18.72,20.55,23.93,26.91,28.82,28.58,27.53,27.15,26.5,24.92,22.15,19.55
max,116.0,2017.0,20.92,23.58,26.61,29.56,30.78,29.88,28.47,28.17,28.11,27.24,23.92,21.89


In [97]:
# Re-read the same CSV, this time treating its first column as the row index
# so that it is no longer loaded as an extra data column.
df = pd.read_csv(
    filepath_or_buffer="/content/Weather Data in India from 1901 to 2017.csv",
    index_col=0,
)

In [98]:
# Preview the first five rows of the re-loaded frame.
df.head(n=5)

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,1901,17.99,19.43,23.49,26.41,28.28,28.6,27.49,26.98,26.26,25.08,21.73,18.95
1,1902,19.0,20.39,24.1,26.54,28.68,28.44,27.29,27.05,25.95,24.37,21.33,18.78
2,1903,18.32,19.79,22.46,26.03,27.93,28.41,28.04,26.63,26.34,24.57,20.96,18.29
3,1904,17.77,19.39,22.95,26.73,27.83,27.85,26.84,26.73,25.84,24.36,21.07,18.84
4,1905,17.4,17.79,21.78,24.84,28.32,28.69,27.67,27.47,26.29,26.16,22.07,18.71


In [99]:
# Reshape from wide (one column per month) to long (one row per year/month).
# The month columns are named explicitly instead of using `df.columns[1:]`:
# later cells add derived columns to `df` ('Yearly Mean', 'Winter', ...), and a
# positional slice would melt those in as bogus "months" if this cell were re-run.
month_cols = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
df1 = pd.melt(df, id_vars='YEAR', value_vars=month_cols)
df1.head()

Unnamed: 0,YEAR,variable,value
0,1901,JAN,17.99
1,1902,JAN,19.0
2,1903,JAN,18.32
3,1904,JAN,17.77
4,1905,JAN,17.4


In [100]:
# Combine the month abbreviation and the year (e.g. "JAN 1901") and parse the
# whole column in one vectorised call. Compared with a row-wise
# `apply(datetime.strptime)` written back through `.loc[:, 'Date']`, this
# avoids a Python-level loop and guarantees the column ends up as a real
# datetime64 column rather than an object column holding datetime values.
df1['Date'] = pd.to_datetime(
    df1['variable'] + ' ' + df1['YEAR'].astype(str),
    format='%b %Y',
)
df1.head()

Unnamed: 0,YEAR,variable,value,Date
0,1901,JAN,17.99,1901-01-01
1,1902,JAN,19.0,1902-01-01
2,1903,JAN,18.32,1903-01-01
3,1904,JAN,17.77,1904-01-01
4,1905,JAN,17.4,1905-01-01


## Temperature through time.

<a class="anchor" id="2"></a>

In [101]:
# Give the long-format columns readable names and order the rows chronologically.
# A mapping-based rename is used instead of assigning a positional list to
# `df1.columns`: the positional form raises a length-mismatch error when this
# cell is re-run after a later cell has added the 'Temp Labels' column, whereas
# renaming by key is a harmless no-op the second time.
df1 = df1.rename(columns={'YEAR': 'Year', 'variable': 'Month', 'value': 'Temperature'})
df1 = df1.sort_values(by='Date')

# Start the y axis at 0 and leave one degree of headroom above the maximum.
fig = go.Figure(layout=go.Layout(yaxis=dict(range=[0, df1['Temperature'].max() + 1])))
fig.add_trace(go.Scatter(x=df1['Date'], y=df1['Temperature']))
fig.update_layout(title='Temperature Throught Timeline:',
                  xaxis_title='Time', yaxis_title='Temperature in Degrees')

# Date axis with two range-selector buttons (full range / one year to date)
# and a range slider underneath the plot.
fig.update_layout(xaxis=go.layout.XAxis(
    rangeselector=dict(
        buttons=[
            dict(label="Whole View", step="all"),
            dict(count=1, label="One Year View", step="year", stepmode="todate"),
        ]),
    rangeslider=dict(visible=True),
    type="date",
))
fig.show()

In [102]:
# One box per month showing the spread of that month's temperature across all years.
fig = px.box(data_frame=df1, x='Month', y='Temperature').update_layout(
    title='Warmest, Coldest and Median Monthly Tempratue.',
)
fig.show()

In [103]:
from sklearn.cluster import KMeans

# Elbow analysis: fit KMeans on the temperature values for k = 1..9 and record
# the within-cluster sum of squared distances (inertia) for each k.
sse = []
target = df1['Temperature'].to_numpy().reshape(-1, 1)  # KMeans expects a 2-D array
num_clusters = list(range(1, 10))

for k in num_clusters:
    # KMeans starts from random centroids; without a fixed seed the curve
    # changes from run to run. `n_init` is pinned explicitly because its
    # default differs between scikit-learn releases.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(target)
    sse.append(km.inertia_)

# Same series drawn twice: once as a line, once as markers.
fig = go.Figure(data=[
    go.Scatter(x=num_clusters, y=sse, mode='lines'),
    go.Scatter(x=num_clusters, y=sse, mode='markers')
])

fig.update_layout(title="Evaluation on number of clusters:",
                  xaxis_title="Number of Clusters:",
                  yaxis_title="Sum of Squared Distance",
                  showlegend=False)
fig.show()

- Using 3 clusters (k = 3) seems a good choice here.

In [104]:
# Fit the final 3-cluster model on the temperature values and store each
# reading's cluster id. The seed is fixed because cluster ids are assigned
# arbitrarily per run; without it the label numbers (and therefore the plot
# colours) change every time the cell is executed. `n_init` is pinned because
# its default differs between scikit-learn releases.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
km.fit(df1['Temperature'].to_numpy().reshape(-1, 1))
df1['Temp Labels'] = km.labels_

fig = px.scatter(df1, 'Date', 'Temperature', color='Temp Labels')
fig.update_layout(title="Temperature clusters.",
                  xaxis_title="Date", yaxis_title="Temperature")
fig.show()

In [105]:
# Distribution of all monthly temperature readings.
# `histnorm='density'` normalises the bars, so the y axis is a density and not
# a raw count; the axis label is corrected to say so.
fig = px.histogram(x=df1['Temperature'], nbins=200, histnorm='density')
fig.update_layout(title='Frequency chart of temperature readings:',
                  xaxis_title='Temperature', yaxis_title='Density')
# Show explicitly, like the other plotting cells, instead of relying on the
# cell's last-expression display.
fig.show()

## Yearly average temperature.

<a class="anchor" id="4"></a>

In [106]:
# Average the twelve monthly columns of each row into a yearly mean.
# The months are selected by name instead of `df.iloc[:, 1:]`: the positional
# slice also picks up 'Yearly Mean' itself (and the seasonal columns a later
# cell adds) whenever this cell is re-run, which silently skews the average.
month_cols = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
df['Yearly Mean'] = df[month_cols].mean(axis=1)

# Yearly mean drawn as a line plus markers (two traces over the same data).
fig = go.Figure(data=[
    go.Scatter(name='Yearly Temperatures', x=df['YEAR'], y=df['Yearly Mean'], mode='lines'),
    go.Scatter(name='Yearly Temperatures', x=df['YEAR'], y=df['Yearly Mean'], mode='markers')
])
fig.update_layout(title='Yearly Mean Temperature :',
                  xaxis_title='Time', yaxis_title='Temperature in Degrees')
fig.show()

# Same data as a scatter plot with a LOWESS trendline.
fig = px.scatter(df, x='YEAR', y='Yearly Mean', trendline='lowess')
fig.update_layout(title='Trendline Over The Years :',
                  xaxis_title='Time', yaxis_title='Temperature in Degrees')
fig.show()

## Monthly temperatures throughout history.

<a class="anchor" id="5"></a>

In [107]:
# One small line chart per month (four per row): temperature against year.
fig = px.line(
    data_frame=df1,
    x='Year',
    y='Temperature',
    facet_col='Month',
    facet_col_wrap=4,
).update_layout(title='Monthly temperature throught history:')
fig.show()

## Seasonal Analysis

<a class="anchor" id="6"></a>

In [108]:
# Season -> month columns that are averaged to form it. Every month is read
# from the same row, so 'Winter' combines DEC with the JAN and FEB of the same
# calendar year.
season_months = {
    'Winter': ['DEC', 'JAN', 'FEB'],
    'Summer': ['MAR', 'APR', 'MAY'],
    'Monsoon': ['JUN', 'JUL', 'AUG', 'SEP'],
    'Autumn': ['OCT', 'NOV'],
}
for season_name, month_list in season_months.items():
    df[season_name] = df[month_list].mean(axis=1)

# Long format: one row per (year, season) with the seasonal mean temperature.
seasonal_df = (
    df[['YEAR', *season_months]]
    .melt(id_vars='YEAR', value_vars=list(season_months),
          var_name='Season', value_name='Temperature')
    .rename(columns={'YEAR': 'Year'})
)

In [109]:
# One scatter panel per season (two per row), each with an OLS trendline.
fig = px.scatter(
    data_frame=seasonal_df,
    x='Year',
    y='Temperature',
    facet_col='Season',
    facet_col_wrap=2,
    trendline='ols',
).update_layout(title='Seasonal mean temperatures throught years:')
fig.show()

## Forecasting

<a class="anchor" id="7"></a>

In [110]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score 
from sklearn import tree

# Features: the year plus one-hot encoded month; target: the temperature.
df2 = df1[['Year', 'Month', 'Temperature']].copy()
df2 = pd.get_dummies(df2)
y = df2[['Temperature']]
x = df2.drop(columns='Temperature')

# Both the train/test split and the tree's tie-breaking are random. They are
# seeded so the reported R^2 is reproducible and comparable across runs.
dtr = DecisionTreeRegressor(random_state=42)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)
dtr.fit(train_x, train_y)
pred = dtr.predict(test_x)
# R^2 on the 30% hold-out rows (displayed as the cell output).
r2_score(test_y, pred)


0.9610398579790628

In [116]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score 
from sklearn.linear_model import LinearRegression

# Features: the year plus one-hot encoded month; target: the temperature.
df2 = df1[['Year', 'Month', 'Temperature']].copy()
df2 = pd.get_dummies(df2)
y = df2[['Temperature']]
x = df2.drop(columns='Temperature')

lr = LinearRegression()
# The split is seeded so the reported R^2 is reproducible and comparable
# across runs.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)
# Flatten the single-column target frame to 1-D so predictions are 1-D too.
lr.fit(train_x, train_y.values.ravel())
pred = lr.predict(test_x)
# R^2 on the 30% hold-out rows (displayed as the cell output).
r2_score(test_y, pred)



0.9791292104526365

In [115]:
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score 
from sklearn import linear_model

# Features: the year plus one-hot encoded month; target: the temperature.
df2 = df1[['Year', 'Month', 'Temperature']].copy()
df2 = pd.get_dummies(df2)
y = df2[['Temperature']]
x = df2.drop(columns='Temperature')

rcv = RidgeCV()
# The split is seeded so the reported R^2 is reproducible and comparable
# across runs.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)
# Flatten the single-column target frame to 1-D so predictions are 1-D too.
rcv.fit(train_x, train_y.values.ravel())
pred = rcv.predict(test_x)
# R^2 on the 30% hold-out rows (displayed as the cell output).
r2_score(test_y, pred)



0.9726979425904574

In [113]:
# Build the 2018 feature rows by taking the 2017 (Year, Month) pairs and
# setting the year to 2018. `.assign` returns a new frame; the previous
# `next_Year.Year.replace(..., inplace=True)` mutated a filtered copy through
# chained attribute access, which pandas warns about and which leaves the
# frame unchanged when Copy-on-Write is enabled.
next_Year = df1.loc[df1['Year'] == 2017, ['Year', 'Month']].assign(Year=2018)

# Remember the month of every row *before* one-hot encoding removes the column,
# so each prediction is labelled with the month it was actually computed for
# (instead of relying on `df1['Month'].unique()` happening to be in the same order).
months_2018 = next_Year['Month'].to_numpy()

next_Year = pd.get_dummies(next_Year)
# `lr` is the LinearRegression model fitted earlier in the notebook.
temp_2018 = lr.predict(next_Year)

temp_2018 = pd.DataFrame({'Month': months_2018, 'Temperature': temp_2018})
temp_2018['Year'] = 2018
temp_2018

Unnamed: 0,Month,Temperature,Year
0,JAN,18.952578,2018
1,FEB,20.757259,2018
2,MAR,23.931497,2018
3,APR,27.031065,2018
4,MAY,28.92184,2018
5,JUN,28.793355,2018
6,JUL,27.876338,2018
7,AUG,27.434676,2018
8,SEP,26.813531,2018
9,OCT,25.227798,2018


In [114]:
# Stack the 2018 predictions under the historical long-format data, then
# average the temperature per year.
combined_temps = pd.concat([df1, temp_2018], sort=False)
forecasted_temp = (
    combined_temps
    .groupby(by='Year')['Temperature']
    .mean()
    .reset_index()
)

# Draw the yearly means twice over the same data: first as a line, then as markers.
fig = go.Figure()
for trace_mode in ('lines', 'markers'):
    fig.add_trace(go.Scatter(
        name='Yearly Mean Temperature',
        x=forecasted_temp['Year'],
        y=forecasted_temp['Temperature'],
        mode=trace_mode,
    ))
fig.update_layout(
    title='Forecasted Temperature:',
    xaxis_title='Time',
    yaxis_title='Temperature in Degrees',
)
fig.show()