In [None]:

import numpy as np
import pandas as pd 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the remainder of the session so cell
# output stays compact.
# NOTE(review): a blanket "ignore" also hides deprecation and chained-assignment
# warnings raised by the libraries used in later cells — consider narrowing it.
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Read the weather observations CSV into the working DataFrame.
WEATHER_CSV_PATH = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(WEATHER_CSV_PATH)

In [None]:
# Show the first five rows to get a first look at the columns.
df.head(n=5)

# Task: Lowest minimum temperature

In [None]:
# Overall lowest MinTemp, then the five locations with the lowest
# per-location minimum.
lowest_min_temp = df['MinTemp'].min()
print("Minimum Temperature   ", lowest_min_temp, sep="")

min_temp_by_location = df.groupby('Location', sort=False)['MinTemp'].min()
min_temp_by_location.nsmallest()

# Task: Highest maximum temperature

In [None]:
# Overall highest MaxTemp, then the five locations with the highest
# per-location maximum.
highest_max_temp = df['MaxTemp'].max()
print("Maximum Temperature   ", highest_max_temp, sep="")

max_temp_by_location = df.groupby('Location', sort=False)['MaxTemp'].max()
max_temp_by_location.nlargest()

# Task : Largest amount of rainfall

In [None]:
# Overall highest Rainfall reading, then the five locations with the highest
# per-location maximum.
highest_rainfall = df['Rainfall'].max()
print("Highest Rainfall      ", highest_rainfall, sep="")

max_rainfall_by_location = df.groupby('Location', sort=False)['Rainfall'].max()
max_rainfall_by_location.nlargest()

In [None]:
# Fraction of missing values per column (0.0 = complete, 1.0 = entirely missing).
missing_fraction = df.isna().mean()
missing_fraction

# Data Cleaning

In [None]:
# Evaporation (~43%), Sunshine (~48%), Cloud9am (~38%) and Cloud3pm (~40%) have
# too many missing values to impute sensibly, so these features are dropped.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and the drop becomes a no-op instead of raising KeyError.
sparse_columns = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
df = df.drop(columns=sparse_columns, errors='ignore')

In [None]:
# Separate the categorical and the numerical features.
# The column list is written once so the two frames cannot drift apart, and
# .copy() gives df_cat its own data so that later column assignments act on an
# independent frame rather than on a selection of df.
categorical_columns = [
    'WindGustDir', 'WindDir9am', 'WindDir3pm',
    'RainToday', 'RainTomorrow', 'Date', 'Location',
]

df_cat = df[categorical_columns].copy()

df_num = df.drop(columns=categorical_columns)

In [None]:
# Wind speed, wind direction, temperature and pressure differ between locations,
# so missing categorical values are replaced with the most frequent value
# observed at the same Location (ties resolved by taking the largest mode).
#
# groupby(...).transform returns a result indexed exactly like df, so it can be
# assigned as a column directly. groupby(...).apply may prepend the group key to
# the index (default behaviour in pandas >= 2.0), which breaks that assignment.
for col in df_cat.columns:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column
    df_cat[col] = df.groupby('Location')[col].transform(
        lambda values: values.fillna(values.mode().max())
    )

In [None]:
# WindGustDir can still contain gaps: a location without any recorded value has
# no mode to fill from. Check the remaining fraction of missing values.
remaining_missing_cat = df_cat.isna().mean()
remaining_missing_cat

In [None]:
# Fill the WindGustDir values that are still missing with the mode of the
# complete dataset.
# The fill starts from df_cat (not df) so that values already imputed per
# location are kept; only the remaining gaps receive the dataset-wide fallback.
overall_gust_dir_mode = df['WindGustDir'].mode().max()
df_cat['WindGustDir'] = df_cat['WindGustDir'].fillna(overall_gust_dir_mode)

In [None]:
# Replace missing numerical values with the mean of the same Location,
# mirroring the per-location treatment of the categorical features.
#
# transform('mean') yields one value per row, aligned with df's index, so it can
# be handed straight to fillna. groupby(...).apply may return a group-keyed
# index (pandas >= 2.0) that cannot be assigned back as a column.
for col in df_num.columns:
    if not df[col].isnull().any():
        continue  # nothing to impute in this column
    location_mean = df.groupby('Location')[col].transform('mean')
    df_num[col] = df[col].fillna(location_mean)

In [None]:
# Same leftover-gap problem as in df_cat: whatever is still missing in these
# three columns is replaced with the mean of the whole dataset.
for leftover_col in ('WindGustSpeed', 'Pressure9am', 'Pressure3pm'):
    dataset_mean = df[leftover_col].mean()
    df_num[leftover_col] = df_num[leftover_col].fillna(dataset_mean)


In [None]:
# Encode the two rain flags as integers: 'Yes' -> 1, 'No' -> 0.
# Series.map turns any value that is not a key of the dict into NaN.
d = {'Yes': 1, 'No': 0}
for flag_col in ('RainTomorrow', 'RainToday'):
    df_cat[flag_col] = df_cat[flag_col].map(d)

In [None]:
# Frequency-encode the remaining categorical features: each category is
# replaced by the number of rows in which it occurs.
# .copy() gives df_cat2 its own data, so the column assignments below do not
# target a selection of df_cat.
freq_encoded_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'Location']
df_cat2 = df_cat[freq_encoded_columns].copy()

for col in freq_encoded_columns:
    df_cat2[col] = df_cat2[col].map(df_cat2[col].value_counts())

In [None]:
# Put the numerical and the frequency-encoded features side by side,
# matching rows on the index.
df_n = df_num.merge(df_cat2, left_index=True, right_index=True)

In [None]:
# Standardise all features with StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform fits the scaler and scales the data in one pass, so a separate
# fit() call beforehand is redundant.
# index=df_n.index keeps the scaled rows labelled like the source rows, which
# the index-based merge with df_cat relies on.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_n),
    columns=df_n.columns,
    index=df_n.index,
)
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the training features — consider fitting on
# the training portion only.

In [None]:
# Add the (unscaled) RainToday flag to the scaled features, matching on the index.
rain_today = df_cat['RainToday']
df_x = pd.merge(df_scaled, rain_today, left_index=True, right_index=True)

In [None]:
# One histogram per feature to inspect the distributions.
hist_options = {'bins': 50, 'figsize': (20, 10)}
df_x.hist(**hist_options)
plt.show()

In [None]:
# Annotated heatmap of the pairwise feature correlations, colour scale fixed to [-1, 1].
plt.figure(figsize=(20, 10))
feature_corr = df_x.corr()
heatmap = sns.heatmap(feature_corr, annot=True, vmin=-1, vmax=1)

In [None]:
# Temp9am (~89%) and Temp3pm (~98%) are highly correlated with MaxTemp, so they
# are removed from the feature set.
# A single non-inplace drop with errors='ignore' keeps the cell re-runnable:
# once the columns are gone, a second run is a no-op instead of a KeyError.
df_x = df_x.drop(columns=['Temp9am', 'Temp3pm'], errors='ignore')

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# The RainTomorrow target is imbalanced, so the split is stratified on it to
# keep the class ratio the same in the training and the test part.
target = df_cat['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(
    df_x,
    target,
    test_size=0.2,
    random_state=50,
    stratify=target,
)

In [None]:
# Bar chart of the RainTomorrow class counts: the classes are strongly
# imbalanced, which is why the training data is oversampled afterwards.
class_counts = df_cat['RainTomorrow'].value_counts()
class_counts.plot.barh()

In [None]:
# Oversample the training data with SMOTE and report the class counts
# before and after resampling.
from collections import Counter
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=37)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

counts_before = Counter(y_train)
counts_after = Counter(y_train_res)
print(f"Before {counts_before}")
print(f"After {counts_after}")

In [None]:
# XGBoost: train a gradient-boosted classifier on the oversampled training data
# and predict the held-out test rows.

from sklearn import datasets, linear_model, metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=10, random_state=37)
model.fit(X_train_res, y_train_res)

# An expression in the middle of a cell is never displayed, so the training
# accuracy is printed explicitly instead of being computed and discarded.
train_accuracy = model.score(X_train_res, y_train_res)
print('Training accuracy {:.2f}'.format(train_accuracy * 100))

y_pred = model.predict(X_test)

In [None]:
# Report test-set performance: confusion matrix, accuracy in percent and the
# per-class precision/recall/F1 table.
test_confusion = confusion_matrix(y_test, y_pred)
print(f'Confusion matrix \n {test_confusion}')

test_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f'Accuracy score {test_accuracy_pct:.2f}')

print(classification_report(y_test, y_pred))

# Task : Update data up to 2020

In [None]:
from fbprophet import Prophet
# NOTE(review): newer releases of this library are published as `prophet`;
# adjust the import if `fbprophet` is not installable in your environment.

m = Prophet()

# Build the training frame in one chain instead of assigning into, and renaming
# in place, a selection of df. Prophet reads the timestamp from a column named
# `ds` and the value to forecast from a column named `y`.
df_for = (
    df[['Date', 'Rainfall']]
    .rename(columns={'Date': 'ds', 'Rainfall': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)
# NOTE(review): df appears to hold one row per Date and Location, so each `ds`
# value may occur many times — confirm that fitting on un-aggregated rows is intended.
m.fit(df_for)

In [None]:
# Extend the model's timeline by 1285 additional periods and show the last rows.
# Presumably chosen so the horizon reaches the end of 2020 — confirm against the
# last date in the training data.
FORECAST_PERIODS = 1285
future = m.make_future_dataframe(periods=FORECAST_PERIODS)
future.tail()

In [None]:
# Predict over the extended timeline and show the point forecast with its
# lower/upper bounds for the last rows.
forecast = m.predict(future)
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast[forecast_columns].tail()

In [None]:
from fbprophet.plot import add_changepoints_to_plot

# Plot the forecast, then overlay the model's changepoints on the same axes.
fig = m.plot(forecast)
forecast_ax = fig.gca()
a = add_changepoints_to_plot(forecast_ax, m, forecast)