In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
# Load the weatherAUS.csv file from the Kaggle input directory into a DataFrame.
df=pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Keep a separate copy of the freshly loaded data; the cleaning cells below
# overwrite columns of df directly.
df_copy=df.copy()

In [None]:
# Report the size of the dataset: number of observations and number of columns.
n_rows, n_cols = df.shape
print('Rows:', n_rows)
print('Columns:', n_cols)

In [None]:
# Column names, non-null counts and dtypes.
df.info()

1. Categorical variables:
Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
2. Numerical variables:  MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RISK_MM

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

1. Right skewed: MinTemp,MaxTemp,Rainfall,Evaporation,WindGustSpeed,WindSpeed9am,RISK_MM
2. Left Skewed: Sunshine,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm
3. Slightly left skewed: Pressure9am
4. Slightly right skewed: Pressure3pm,Temp9am,Temp3pm
5. Standard deviation is the highest for Humidity3pm column. This factor to be considered while choosing the features for modelling.

In [None]:
# Summary (count, unique, top, freq) for the object-typed columns.
df.describe(include='object')

The most frequent location is 'Canberra', occurring 3418 times.
The most frequent value for the RainToday and RainTomorrow columns is 'No'.
The most frequent WindDir3pm is 'SE', occurring 10663 times.
The most frequent value for 'WindDir9am' is 'N'.

In [None]:
# Relative frequency of each 9am wind direction, drawn as a bar chart.
wind9am_share = df['WindDir9am'].value_counts(normalize=True)
wind9am_share.plot(kind='bar')

The most common wind direction at 9am is N, around 8% of the time.

In [None]:
# Relative frequency of each 3pm wind direction, drawn as a bar chart.
wind3pm_share = df['WindDir3pm'].value_counts(normalize=True)
wind3pm_share.plot(kind='bar')

The most common wind direction at 3pm is SE, about 8% of the time.

In [None]:
# Relative frequency of each wind-gust direction, drawn as a bar chart.
gust_dir_share = df['WindGustDir'].value_counts(normalize=True)
gust_dir_share.plot(kind='bar')

The most common direction of the strongest wind gust is W (West).

In [None]:
# Count the missing values in each column before any imputation.
df.isnull().sum()

In [None]:
# Missing value imputation
#
# Three strategies are used, applied in this order:
#   1. median fill for the numeric columns listed in median_cols,
#   2. forward fill (carry the previous row's value) for ffill_cols,
#   3. median fill for whatever is still missing in Evaporation / Sunshine
#      after the forward fill.
# Series.ffill() is used instead of fillna(method='ffill'): the `method`
# argument of fillna is deprecated in pandas 2.1 and removed in later releases.
median_cols = [
    'MinTemp', 'MaxTemp', 'Rainfall',
    'Humidity3pm', 'Humidity9am',
    'Temp9am', 'Temp3pm',
    'WindGustSpeed', 'WindSpeed3pm', 'WindSpeed9am',
]
for col in median_cols:
    df[col] = df[col].fillna(df[col].median())

ffill_cols = [
    'Evaporation', 'Sunshine',
    'WindDir9am', 'WindDir3pm',
    'Pressure3pm', 'Pressure9am',
    'Cloud3pm', 'Cloud9am',
]
for col in ffill_cols:
    df[col] = df[col].ffill()

# A forward fill cannot fill rows that have no earlier value, so these two
# columns get a median fallback.
for col in ['Evaporation', 'Sunshine']:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: fill with a fixed category.
df['RainToday'] = df['RainToday'].fillna('No')
df['WindGustDir'] = df['WindGustDir'].fillna('W')

In [None]:
# Re-count missing values per column to see what the imputation cell left unfilled.
df.isnull().sum()

In [None]:
# Fill whatever is still missing in Cloud3pm with the column median.
# NOTE(review): Cloud3pm was only forward-filled earlier, so the leftovers are
# presumably leading rows with no earlier value to carry forward — confirm
# against the missing-value count above.
df['Cloud3pm']=df['Cloud3pm'].fillna(df['Cloud3pm'].median())

In [None]:
# Final missing-value count after the extra Cloud3pm fill.
df.isnull().sum()

In [None]:
# Box plot of every numeric column (one subplot each, 9x2 grid) to look for outliers.
df.plot(kind='box',subplots=True,layout=(9,2),figsize=(16,12))

We will apply appropriate transforms to reduce the outliers.

# Treating outliers using Capping and Flooring technique

In [None]:
# Outlier fences for MaxTemp: Q1 - 1.5*IQR and Q3 + 1.5*IQR, with IQR = Q3 - Q1.
# The quartiles are hard-coded (presumably read off df.describe() — verify).
q1, q3 = 17.9, 28.2
iqr = q3 - q1
max_u = q3 + 1.5 * iqr
max_l = q1 - 1.5 * iqr
print('Upper Limit:', max_u)
print('Lower Limit:', max_l)

In [None]:
# Cap MaxTemp at the upper fence (43.65); the lower fence is applied in the next cell.
cap_hi = 43.65
df.loc[df['MaxTemp'] > cap_hi, 'MaxTemp'] = cap_hi

In [None]:
# Floor MaxTemp at the lower fence (2.45).
cap_lo = 2.45
df.loc[df['MaxTemp'] < cap_lo, 'MaxTemp'] = cap_lo

In [None]:
# Outlier fences for MinTemp from hard-coded quartiles (Q1 = 7.6, Q3 = 16.8).
q1, q3 = 7.6, 16.8
iqr = q3 - q1
min_u = q3 + 1.5 * iqr
min_l = q1 - 1.5 * iqr
print('Upper Limit:', min_u)
print('Lower Limit:', min_l)

In [None]:
# Clamp MinTemp to its fences: values above 30.6 become 30.6, values below -6.2 become -6.2.
cap_lo, cap_hi = -6.2, 30.6
df.loc[df['MinTemp'] > cap_hi, 'MinTemp'] = cap_hi
df.loc[df['MinTemp'] < cap_lo, 'MinTemp'] = cap_lo

In [None]:
# Outlier fences for Rainfall from hard-coded quartiles (Q1 = 0, Q3 = 0.8).
q1, q3 = 0, 0.8
iqr = q3 - q1
rain_u = q3 + 1.5 * iqr
rain_l = q1 - 1.5 * iqr
print('Upper Limit:', rain_u)
print('Lower Limit:', rain_l)

In [None]:
# Clamp Rainfall to its fences: values above 2.0 become 2.0, values below -1.2 become -1.2.
cap_lo, cap_hi = -1.2, 2.0
df.loc[df['Rainfall'] > cap_hi, 'Rainfall'] = cap_hi
df.loc[df['Rainfall'] < cap_lo, 'Rainfall'] = cap_lo

In [None]:
# Outlier fences for Evaporation from hard-coded quartiles (Q1 = 2.6, Q3 = 7.4).
q1, q3 = 2.6, 7.4
iqr = q3 - q1
ev_u = q3 + 1.5 * iqr
ev_l = q1 - 1.5 * iqr
print('Upper Limit:', ev_u)
print('Lower Limit:', ev_l)

In [None]:
# Clamp Evaporation to its fences: values above 14.6 become 14.6, values below
# -4.6 become -4.6. Both bounds are the Evaporation fences (Q3 = 7.4, Q1 = 2.6,
# IQR = 4.8); the lower bound must be -4.6, not the Rainfall fence of -1.2.
df.loc[df.Evaporation>14.6,'Evaporation'] = 14.6
df.loc[df.Evaporation<-4.6,'Evaporation'] = -4.6

In [None]:
# Outlier fences for WindGustSpeed from hard-coded quartiles (Q1 = 31, Q3 = 48).
q1, q3 = 31, 48
iqr = q3 - q1
wg_u = q3 + 1.5 * iqr
wg_l = q1 - 1.5 * iqr
print('Upper Limit:', wg_u)
print('Lower Limit:', wg_l)

In [None]:
# Clamp WindGustSpeed to its fences: values above 73.5 become 73.5, values below 5.5 become 5.5.
cap_lo, cap_hi = 5.5, 73.5
df.loc[df['WindGustSpeed'] > cap_hi, 'WindGustSpeed'] = cap_hi
df.loc[df['WindGustSpeed'] < cap_lo, 'WindGustSpeed'] = cap_lo

In [None]:
# Outlier fences for WindSpeed9am from hard-coded quartiles (Q1 = 7, Q3 = 19).
q1, q3 = 7, 19
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp WindSpeed9am to its fences: values above 37.0 become 37.0, values below -11.0 become -11.0.
cap_lo, cap_hi = -11.0, 37.0
df.loc[df['WindSpeed9am'] > cap_hi, 'WindSpeed9am'] = cap_hi
df.loc[df['WindSpeed9am'] < cap_lo, 'WindSpeed9am'] = cap_lo

In [None]:
# Outlier fences for WindSpeed3pm from hard-coded quartiles (Q1 = 13.0, Q3 = 24.0).
q1, q3 = 13.0, 24.0
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp WindSpeed3pm to its fences: values above 40.5 become 40.5, values below -3.5 become -3.5.
cap_lo, cap_hi = -3.5, 40.5
df.loc[df['WindSpeed3pm'] > cap_hi, 'WindSpeed3pm'] = cap_hi
df.loc[df['WindSpeed3pm'] < cap_lo, 'WindSpeed3pm'] = cap_lo

In [None]:
# Outlier fences for Humidity9am from hard-coded quartiles (Q1 = 57.0, Q3 = 83.0).
q1, q3 = 57.0, 83.0
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp Humidity9am to its fences: values above 122.0 become 122.0, values below 18.0 become 18.0.
cap_lo, cap_hi = 18.0, 122.0
df.loc[df['Humidity9am'] > cap_hi, 'Humidity9am'] = cap_hi
df.loc[df['Humidity9am'] < cap_lo, 'Humidity9am'] = cap_lo

In [None]:
# Outlier fences for Pressure9am from hard-coded quartiles (Q1 = 1012.9, Q3 = 1022.4).
q1, q3 = 1012.9, 1022.4
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp Pressure9am to its fences: values above 1036.65 become 1036.65, values below 998.65 become 998.65.
cap_lo, cap_hi = 998.65, 1036.65
df.loc[df['Pressure9am'] > cap_hi, 'Pressure9am'] = cap_hi
df.loc[df['Pressure9am'] < cap_lo, 'Pressure9am'] = cap_lo

In [None]:
# Outlier fences for Pressure3pm from hard-coded quartiles (Q1 = 1010.4, Q3 = 1020.0).
q1, q3 = 1010.4, 1020.0
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp Pressure3pm to its fences: values above 1034.4 become 1034.4, values below 996.0 become 996.0.
cap_lo, cap_hi = 996.0, 1034.4
df.loc[df['Pressure3pm'] > cap_hi, 'Pressure3pm'] = cap_hi
df.loc[df['Pressure3pm'] < cap_lo, 'Pressure3pm'] = cap_lo

In [None]:
# Outlier fences for Temp9am from hard-coded quartiles (Q1 = 12.3, Q3 = 21.6).
q1, q3 = 12.3, 21.6
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp Temp9am to its fences: values above 35.55 become 35.55, values below -1.65 become -1.65.
cap_lo, cap_hi = -1.65, 35.55
df.loc[df['Temp9am'] > cap_hi, 'Temp9am'] = cap_hi
df.loc[df['Temp9am'] < cap_lo, 'Temp9am'] = cap_lo

In [None]:
# Outlier fences for Temp3pm from hard-coded quartiles (Q1 = 16.6, Q3 = 26.4).
q1, q3 = 16.6, 26.4
iqr = q3 - q1
u = q3 + 1.5 * iqr
l = q1 - 1.5 * iqr
print('Upper Limit:', u)
print('Lower Limit:', l)

In [None]:
# Clamp Temp3pm to its fences: values above 41.1 become 41.1, values below 1.9 become 1.9.
cap_lo, cap_hi = 1.9, 41.1
df.loc[df['Temp3pm'] > cap_hi, 'Temp3pm'] = cap_hi
df.loc[df['Temp3pm'] < cap_lo, 'Temp3pm'] = cap_lo

In [None]:
# Replace RISK_MM with its Box-Cox transform. The input is shifted by +1
# (presumably so zero values become strictly positive, as Box-Cox requires).
# boxcox returns (transformed values, fitted lambda); only the values are kept.
from scipy.stats import boxcox
risk_transformed, _risk_lambda = boxcox(df.RISK_MM + 1)
l = list(risk_transformed)
df['RISK_MM'] = l

In [None]:
# Redraw the per-column box plots after capping/transforming, for comparison
# with the box plots drawn before the outlier treatment.
df.plot(kind='box',subplots=True,layout=(9,2),figsize=(16,12))

> We observe that the outliers have reduced significantly after treating them.

In [None]:
# Convert the target variable to integer labels: 'Yes' -> 1, 'No' -> 0.
# The result is assigned back to the column explicitly. Calling
# replace(..., inplace=True) on df['RainTomorrow'] acts on a temporary selection
# and, under pandas Copy-on-Write, leaves df unchanged — astype(int) would then
# fail on the remaining 'Yes'/'No' strings.
df['RainTomorrow'] = (
    df['RainTomorrow']
    .replace({'Yes': '1', 'No': '0'})
    .astype(int)
)

In [None]:
# Correlation heatmap of the numeric columns (including the 0/1 target).
# numeric_only=True is needed because df still contains string columns at this
# point (Date, Location, the wind directions, RainToday); since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of silently skipping them.
plt.figure(figsize=(12,10))
corr=df.corr(numeric_only=True)
sns.heatmap(corr,annot=True)

1. We observe that the highest correlation for our target variable exists with the 'RISK_MM' column.
2. The second highest correlation for our target variable exists with the 'Humidity3pm' column.
3. Highest negative correlation is observed with Temp3pm.

In [None]:
# Counts of RainTomorrow (0/1) for each RainToday value, drawn as grouped bars.
rain_table = pd.crosstab(df['RainToday'], df['RainTomorrow'])
rain_table.plot(kind='bar')

We observe a slight correlation between the variables 'RainToday' and 'RainTomorrow'.
If there is no RainToday there is a high chance that there will be no rain tomorrow also.

In [None]:
# RainTomorrow (0/1) plotted against the transformed RISK_MM values.
sns.scatterplot(data=df, x='RISK_MM', y='RainTomorrow')

As the RISK_MM increases the chances of getting rain also increases.

In [None]:
# RainTomorrow (0/1) plotted against 3pm humidity.
sns.scatterplot(data=df, x='Humidity3pm', y='RainTomorrow')

When humidity is very low, the chance of rain tomorrow is also very low.

In [None]:
# RainTomorrow (0/1) plotted against 9am cloud cover.
sns.scatterplot(data=df, x='Cloud9am', y='RainTomorrow')

In [None]:
# RainTomorrow (0/1) plotted against 3pm cloud cover.
sns.scatterplot(data=df, x='Cloud3pm', y='RainTomorrow')

We observe no relationship between them. We can consider dropping the columns Cloud9am and Cloud3pm.

So based on our observations we will drop the columns Cloud9am and Cloud3pm.
We will drop the Location and Date columns as well, as they are not very relevant for our analysis.

In [None]:
# Remove the two cloud-cover columns plus Date and Location from df (in place).
l = ['Cloud9am', 'Cloud3pm', 'Date', 'Location']
df.drop(columns=l, inplace=True)

In [None]:
# One-hot encode the remaining categorical columns; df is rebound to the encoded frame.
categorical_cols = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
# Keep a snapshot of the cleaned, one-hot-encoded dataframe.
df_new=df.copy()

In [None]:
# Define the feature matrix X and the target y.
# RISK_MM is excluded from the features along with the target. Per the dataset
# description it is the amount of rain recorded on the NEXT day, i.e. the
# quantity RainTomorrow is derived from, so keeping it would leak the answer
# into the model and make the scores meaningless.
X = df.drop(columns=['RainTomorrow', 'RISK_MM'])
y = df['RainTomorrow']

In [None]:
# Hold out 30% of the rows for testing; random_state=1 makes the split repeatable.
from sklearn.model_selection import train_test_split
split = train_test_split(X, y, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = split

In [None]:
#Model Building
# Fit a logistic regression with scikit-learn's default settings on the training split.
# NOTE(review): the features are not scaled and max_iter is left at its default,
# so the solver may stop before converging — check this cell's output for a
# ConvergenceWarning and consider scaling the features or raising max_iter.
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train,y_train)

In [None]:
# Predict on both splits and print the accuracy of each, to compare train vs test performance.
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,roc_auc_score,roc_curve
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
for label, truth, pred in (
    ('Train Accuracy', y_train, y_train_pred),
    ('Test Accuracy', y_test, y_test_pred),
):
    print(label, accuracy_score(truth, pred))

The model seems to perform well. There is no problem of overfitting also.

In [None]:
# Cohen's kappa between the true and the predicted labels of the test split.
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(y_test,y_test_pred)

- cohen_kappa_score is a metric that measures the agreement between the predicted values and the actual values.
- A Cohen Kappa Score of 1 indicates perfect agreement. But anything above 0 is a desirable value.


In [None]:
# Confusion matrix and derived rates for the TEST split.
# The matrix printed under the "(test)" label is the test-set matrix, i.e. the
# same one every metric below is computed from (not the training-set matrix).
cm = confusion_matrix(y_test, y_test_pred)
print('Confusion Matrix(test)', cm)

# scikit-learn layout for binary labels 0/1: rows = actual, columns = predicted.
tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tp = cm[1, 1]

accuracy = (tp + tn) / (tp + tn + fp + fn)
misclassification_error = 1 - accuracy
sensitivity = tp / float(tp + fn)   # true-positive rate (recall of class 1)
specificity = tn / float(tn + fp)   # true-negative rate

In [None]:
# Print the share of test rows that were classified incorrectly (1 - accuracy).
print('The misclassifiction error is',misclassification_error)

In [None]:
# Negative likelihood ratio: (1 - sensitivity) / specificity.
negative_lr = (1 - sensitivity) / specificity
print('Negative Liklihood ratio is', negative_lr)

In [None]:
# Positive likelihood ratio: sensitivity / (1 - specificity).
positive_lr = sensitivity / (1 - specificity)
print('Positive Liklihood ratio is', positive_lr)