In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![Rain In Australia](https://res-3.cloudinary.com/the-university-of-melbourne/image/upload/s---y185jhN--/c_limit,f_auto,q_75,w_1784/v1/pursuit-uploads/4be/f69/882/4bef6988217e9f3be436803052345f9b7fc2752087fa9e4d56c3dc600c07.jpg)

# Problem Statement
A dataset is provided to predict rain in Australia. Our aim in this notebook is to perform EDA on the dataset, do feature engineering and feature analysis, and create a machine learning model to predict whether it will rain tomorrow (`RainTomorrow`).

# Read Data

In [None]:
# Location of the raw weather observations inside the Kaggle input directory.
DATA_PATH = '../input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(DATA_PATH)

# Exploratory Data Analysis

Initial level analysis to understand the data and data type.

In [None]:
# Number of rows (observations) in the dataset.
df.shape[0]

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Summary statistics for the dataset's columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Names of the object-typed columns (Date is still a string at this point).
df.select_dtypes('object').columns

There are 7 categorical (object) columns. I believe the Date column must be converted to a datetime type for our detailed analysis.

In [None]:
# Names of the float64 columns.
df.select_dtypes('float64').columns

There are 16 numerical columns, all stored as `float64` (not integers).

# Plotting

In [None]:
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Parse the Date strings (YYYY-MM-DD) into datetime values so the .dt accessor can be used.
df['Date']=pd.to_datetime(df['Date'],format='%Y-%m-%d')

In [None]:
# Derive calendar features from the parsed Date column.
# Each pair is (new column name, datetime accessor attribute).
for part_column, dt_attribute in (('Year', 'year'), ('Month', 'month'), ('day', 'day')):
    df[part_column] = getattr(df['Date'].dt, dt_attribute)

# Explore Categorical Datatype

In [None]:
# Collect the names of all object-typed (categorical) columns as a plain list.
cat_columns = list(df.select_dtypes(include='object').columns)

In [None]:
# Display the categorical column names.
cat_columns

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow'] are the categorical columns. We need to convert these categorical features to numerical variables with a one-hot encoder or a binary encoding technique.

# Analysing Numerical features

In [None]:
# Collect the names of all numeric columns as a plain list.
num_cols = list(df.select_dtypes(include='number').columns)

In [None]:
# Display the numeric column names.
num_cols

In [None]:
# Exclude the derived calendar columns from the numeric feature list.
# Built as a new list (instead of list.remove) so the cell can be re-run
# without raising ValueError once the names are already gone.
num_cols = [col for col in num_cols if col not in ('Year', 'Month', 'day')]

In [None]:
# Annotated heatmap of the pairwise correlations between numeric features.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df[num_cols].corr(), annot=True, ax=ax)

In [None]:
# Correlation between MinTemp and Temp3pm.
# Series.corr replaces the 2x2 matrix plus `[1]` lookup: integer-position
# indexing on a label-indexed Series is deprecated in pandas 2.x.
df['MinTemp'].corr(df['Temp3pm'])

In [None]:
# Report every ordered pair of distinct numeric columns whose correlation is at
# least `treshold`, and collect the second column of each such pair.
# NOTE(review): only positive correlations are flagged; strongly negative pairs
# are ignored because the comparison does not take the absolute value.
treshold=0.68
corr_cols=[]
# Compute the full correlation matrix once instead of a 2x2 matrix per pair.
corr_matrix = df[num_cols].corr()
for i in num_cols:
    for j in num_cols:
        if i == j:
            continue
        # Label-based lookup; positional `[1]` on a labelled Series is deprecated.
        pair_corr = corr_matrix.loc[j, i]
        if pair_corr >= treshold:
            print("{} is highly coorelated with {} at {:.2f}".format(i,j,pair_corr))
            corr_cols.append(j)
        

Set of highly correlated features

In [None]:
# De-duplicate the collected column names. Sorting gives a stable order across
# kernel restarts (plain set iteration order of strings is not reproducible),
# so the pair plot below always lays out its panels the same way.
corr_cols=sorted(set(corr_cols))

Let's examine the linearity between the highly correlated variables.

In [None]:
# Scatter-plot matrix of the highly correlated columns, with histograms on the diagonal.
sns.pairplot(df[corr_cols], diag_kind='hist', kind='scatter')

Identify outliers

In [None]:
# Summary statistics of the numeric features, used to spot extreme min/max values.
df[num_cols].describe()

From the above table we can see that features such as Rainfall, Evaporation, WindGustSpeed, WindSpeed9am and WindSpeed3pm have extreme outliers.

In [None]:
# Interactive box plots of the features suspected to contain outliers.
outlier_candidates = ['Rainfall', 'Evaporation', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']
px.box(df, x=outlier_candidates)

It is clear that the above features have outliers. Let us use the interquartile range (IQR) method to handle them.

In [None]:
# Compute the IQR-based outlier fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for each
# feature that showed outliers, print them, and keep them in `outlier_fences`
# as {column: (lower_fence, upper_fence)} so they can be reused instead of
# being copied by hand from the printed output.
outlier_fences = {}
for i in ['Rainfall', 'Evaporation', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm']:
    # Fetch both quartiles in one pass rather than calling quantile() four times.
    q1, q3 = df[i].quantile([0.25, 0.75])
    IQR = q3 - q1
    lower_fence = q1 - (IQR * 1.5)
    upper_fence = q3 + (IQR * 1.5)
    outlier_fences[i] = (lower_fence, upper_fence)
    print("{} has the upper fence : {:0.2f} & lower fence : {:0.2f}".format(i,upper_fence,lower_fence))


# Feature Engineering
1. Handle outliers
2. Handle null values

Null Values in Numerical features

In [None]:
# Count of missing values in each numeric feature.
df[num_cols].isnull().sum()

In [None]:
# Fraction of missing values in each numeric feature (0 to 1, not a percentage).
df[num_cols].isnull().mean()

In this case the numerical features have outliers, so let us fill the null values in the numerical features with the median, which is less sensitive to outliers than the mean.

In [None]:
# Most frequent MinTemp value, shown for comparison with the median below.
df['MinTemp'].mode()[0]

In [None]:
# Median MinTemp value; the median is what the next cell uses for imputation.
df['MinTemp'].median()

In [None]:
# Fill missing values in every numeric feature with that feature's median.
# The result is assigned back to the column: `df[i].fillna(..., inplace=True)`
# works on an intermediate Series, which warns in pandas 2.x and leaves `df`
# untouched once Copy-on-Write is enabled.
# NOTE(review): the medians are computed on the full dataset before the
# train/test split, so test rows influence the imputation values.
for i in num_cols:
    df[i] = df[i].fillna(df[i].median())

In [None]:
# Re-check the missing-value counts of the numeric features after imputation.
df[num_cols].isnull().sum()

Null values in the numerical columns have been filled.

In [None]:
# Now the categorical features: fraction of missing values per column.
df[cat_columns].isnull().sum()/len(df)

In [None]:
# Same missing-value fraction as the previous cell (duplicate check).
df[cat_columns].isnull().sum()/len(df)

In [None]:
# Fill missing values in every categorical column with that column's most
# frequent value. The result is assigned back to the column:
# `df[i].fillna(..., inplace=True)` works on an intermediate Series, which warns
# in pandas 2.x and leaves `df` untouched once Copy-on-Write is enabled.
# NOTE(review): cat_columns includes the target 'RainTomorrow', so missing
# labels are imputed too; consider dropping those rows instead.
for i in cat_columns:
    df[i] = df[i].fillna(df[i].mode()[0])

In [None]:
# Re-check the missing-value counts of the categorical columns after imputation.
df[cat_columns].isnull().sum()

In [None]:
# Missing-value counts across all columns, to confirm nothing is left.
df.isnull().sum()

All null values in the dataset have been handled. The outlier fences computed earlier were:

1. Rainfall has the upper fence : 2.00 & lower fence : -1.20
2. Evaporation has the upper fence : 14.60 & lower fence : -4.60
3. WindGustSpeed has the upper fence : 73.50 & lower fence : 5.50
4. WindSpeed9am has the upper fence : 37.00 & lower fence : -11.00
5. WindSpeed3pm has the upper fence : 40.50 & lower fence : -3.50

In [None]:
# Work on a copy so the uncapped data in `df` stays available for the before/after comparison.
df1=df.copy()

In [None]:
# Cap each feature at its upper IQR fence (values taken from the fences listed
# above). Series.clip is vectorized and gives the same result as the previous
# per-element apply(lambda x: np.where(...)), without one Python call per row.
# Only the upper side is capped; the lower fences are not applied.
upper_fences = {
    'Rainfall': 2.00,
    'Evaporation': 14.60,
    'WindGustSpeed': 73.50,
    'WindSpeed9am': 37.00,
    'WindSpeed3pm': 40.50,
}
for capped_col, upper_limit in upper_fences.items():
    df1[capped_col] = df1[capped_col].clip(upper=upper_limit)

In [None]:
# Summary statistics after capping; the max of each capped feature should equal its upper fence.
df1.describe()

Before and after handling Outliers

In [None]:
# Box plots before capping (original data in `df`).
df.boxplot(column=['Rainfall','Evaporation','WindGustSpeed','WindSpeed9am','WindSpeed3pm'])

In [None]:
# Box plots after capping (capped data in `df1`).
df1.boxplot(column=['Rainfall','Evaporation','WindGustSpeed','WindSpeed9am','WindSpeed3pm'])

# Identify Target Variable

In [None]:
# Separate the target from the features. Date is dropped as well because its
# information is already carried by the Year/Month/day columns.
target_col = 'RainTomorrow'
y = df1[target_col]
X = df1.drop(columns=[target_col, 'Date'])

In [None]:
from sklearn.preprocessing import label_binarize, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Pipeline & Column Transformer
Create a column transformer to transform the categorical and numerical features.

In [None]:
# One-hot encoder for the categorical features. handle_unknown='ignore' encodes
# a category that was not seen during fit as all zeros instead of raising when
# the test set is transformed.
encode=OneHotEncoder(handle_unknown='ignore')
# Rescales each numerical feature to the [0, 1] range.
scaler = MinMaxScaler()
from sklearn.compose import make_column_transformer, ColumnTransformer

In [None]:
# Float-typed feature columns in the training set; used to build the num_cols list below.
X_train.select_dtypes('float').columns

In [None]:
# Numerical features that will be scaled by the column transformer.
num_cols = [
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'Evaporation',
    'Sunshine',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Cloud9am',
    'Cloud3pm',
    'Temp9am',
    'Temp3pm',
]

In [None]:
# Categorical features that will be one-hot encoded (the target RainTomorrow is excluded).
cat_cols=['Location','WindGustDir','WindDir9am','WindDir3pm','RainToday']

In [None]:
# One-hot encode the categorical features and min-max scale the numerical ones.
# Any column not listed (Year, Month, day) is passed through unchanged.
feature_transformers = [
    ('cat_feat', encode, cat_cols),
    ('num_feat', scaler, num_cols),
]
column_transformer1 = ColumnTransformer(
    transformers=feature_transformers,
    remainder='passthrough',
)

In [None]:
# Fit the transformer on the training features only and transform them.
# NOTE: X_train is rebound to the transformed output, so this cell cannot be
# re-run without first re-running the train/test split.
X_train=column_transformer1.fit_transform(X_train)

In [None]:
# Apply the already-fitted transformer to the test features (no refitting).
# NOTE: X_test is rebound to the transformed output, so this cell cannot be
# re-run without first re-running the train/test split.
X_test =column_transformer1.transform(X_test)

# Target Variable

Lets encode the Target variable

In [None]:
# Encode the target labels as integers: 'Yes' -> 1, 'No' -> 0.
target_mapping = {'Yes': 1, 'No': 0}
y_train, y_test = (labels.map(target_mapping) for labels in (y_train, y_test))

# Create & Train Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier. max_iter is raised from the default of 100
# because Year, Month and day are passed through unscaled, which makes the
# default solver likely to stop before converging.
model=LogisticRegression(max_iter=1000)

In [None]:
# Train the classifier on the transformed training features and encoded labels.
model.fit(X_train,y_train)

In [None]:
# Predict the class labels for the held-out test set.
y_pred = model.predict(X_test)

# Reports to test the model accuracy

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 score on the test set.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix on the test set (rows: true labels, columns: predicted labels).
# fmt='d' prints the cell counts as whole numbers instead of scientific notation.
sns.heatmap(confusion_matrix(y_test,y_pred), annot=True, fmt='d')

# Result
The model predicts rain with an accuracy score of about 83%; see the classification report above for per-class precision and recall.