In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing the Required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the weatherAUS CSV from the Kaggle input directory into `df`
# and display the first five rows as a quick sanity check.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
df.head()

In [None]:
# Checking the shape of the dataset: (number of rows, number of columns)
df.shape

So, there are about 145,000 rows (1.45 lakh) with 23 features.

In [None]:
# Checking the basic info of the dataset: column names, non-null counts and dtypes
df.info()

Except the date column, all other features are identified with their right datatype
Let's change the dtype of the date column.

In [None]:
# Convert the Date column to pandas datetime so the `.dt` accessor can be used on it
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Derive calendar features (year, month, day of month) from the parsed Date column
date_accessor = df['Date'].dt
df['Year'] = date_accessor.year
df['Month'] = date_accessor.month
df['Specific_Day'] = date_accessor.day

In [None]:
# Percentage of missing values per column, largest first
null_pct = df.isnull().sum() / len(df) * 100
null_pct.sort_values(ascending=False)

#### We can see that only 5 columns do not have null values. For the columns where null values account for less than 7% of the rows, the rows with nulls can be dropped; the null values in the remaining columns should be replaced with the median or the mode, depending on the datatype.

In [None]:
# Columns with fewer than 7% missing values: rows missing any of these are dropped
low_null_cols = [
    'MaxTemp', 'MinTemp', 'WindSpeed9am', 'Temp9am', 'Humidity9am', 'WindSpeed3pm',
    'Rainfall', 'RainToday', 'RainTomorrow', 'Temp3pm', 'WindDir3pm', 'Humidity3pm',
]
df = df.dropna(subset=low_null_cols)

In [None]:
# Replace missing values in the remaining categorical columns with each column's own mode
for cat_col in ('WindDir9am', 'WindGustDir'):
    df[cat_col] = df[cat_col].fillna(df[cat_col].mode()[0])

In [None]:
# Filling the null values of the numerical columns with the median.
# Since we do not know whether the features are skewed, the median is a safer choice than the mean.
# Each column is filled with ITS OWN median (previously Pressure9am was filled with the
# median of Pressure3pm by mistake).
median_fill_cols = ['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am',
                    'Pressure9am', 'Pressure3pm', 'WindGustSpeed']
for num_col in median_fill_cols:
    df[num_col] = df[num_col].fillna(df[num_col].median())

In [None]:
# Let us check that the null values are all replaced: every count should be 0
df.isnull().sum()

So, the null values are all replaced with appropriate values.

In [None]:
# Count, per column, the cells that hold a literal '?' placeholder instead of a real value
(df == '?').sum()

#### Distribution of the Target Variable

In [None]:
# Bar chart and pie chart of the RainTomorrow class frequencies, side by side
plt.rcParams['figure.figsize'] = 8,4
fig, (bar_ax, pie_ax) = plt.subplots(1, 2)
target_counts = df['RainTomorrow'].value_counts()
target_counts.plot(kind='bar', rot=0, ax=bar_ax, cmap='Set2')
target_counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, cmap='Set2', explode=[0,0.1])
plt.show()

#### So, there is clear imbalance in our dataset. 

In [None]:
# Separate views of the data by dtype: numeric columns and string (object) columns.
# These are taken at this point in the notebook; later changes to `df` are not reflected in them.
df_num = df.select_dtypes(include=np.number)
df_cat = df.select_dtypes(include='object')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) as returned by describe()
df.describe()

#### Inferences from the describe function:
* The mintemp varies between -8 and 33, and the mean and median are almost same, so the distribution is not skewed
* The maxtemp varies between -4 and 48, here the mean is slightly higher than the median, so it is slightly skewed
* The rainfall varies between 0 and 371, the mean is clearly higher than the median, so it is skewed. So, there are many days with no rainfall. The mean is low but the max is very high, which means that on most days no rain was observed, but when it does rain, it can rain heavily.
* The evaporation varies between 0 and 82, here the mean and median are slightly different, so the distribution is slightly skewed. It is same as observed from rainfall feature, from the mean and median, and also from 75% quantile it is very clear that, most of the days rainfall is not observed, but on days where it is observed, it is high.
* The sunshine varies between 0 and 14, the mean and median are almost equal so it follows a normal distribution. The max is not very high, so there were many sun shiny days.
* The windgustspeed is about a sudden burst of wind speed, it varies between 6 and 135, the mean and median are near so it is slightly skewed, and most of the days this value is less and only in rare cases it was very high.
* The windspeed9am varies between 0 and 87, and the mean and median are slightly different so the distribution is skewed. Similarly, the normal value and the max is very much different, so we can guess that on rainy days the windspeed is high.
* The windspeed3pm varies between 2 and 87, the mean and median are different, so it is skewed. The windspeed at 3pm is usually higher than that at 9am, we can observe that.
* The humidity9am varies between 0 and 100, the mean and median are different, so it is skewed. It is clearly left skewed.
* The humidity3pm varies between 0 and 100, the mean and median are different, so it is skewed. It is clearly left and right skewed.
* The Pressure9am varies between 980 and 1041, the mean and median are different, so it is skewed.
* The pressure3pm varies between 977 and 1039, the mean and median are different, so it is skewed. The pressure9am and pressure3pm are almost the same.
* The cloud9am and cloud3pm have almost identical values; both vary between 0 and 9, and their mean and median are not the same, so they are skewed.
* The temp9am is comparatively less than temp3pm, both are not much skewed.
* The year, month, day are distributed as usual.

### Univariate Analysis

In [None]:
# For the numerical columns: histogram with KDE, box plot and violin plot per feature
plt.rcParams['figure.figsize'] = 15,5
for col in df_num:
    fig, ax = plt.subplots(1, 3)
    print(col,':')
    # histplot(kde=True) replaces sns.distplot, which is deprecated
    sns.histplot(x=df_num[col], kde=True, stat='density', ax=ax[0], color='Green')
    # Data is passed explicitly as x=: a positional Series is interpreted differently
    # (or rejected) depending on the seaborn version
    sns.boxplot(x=df_num[col], ax=ax[1], color='Green')
    sns.violinplot(x=df_num[col], ax=ax[2], color='Green')
    plt.show()

#### Inferences from the univariate analysis of the numerical columns:
* The mintemp almost follows a normal distribution with outliers in both sides
* The maxtemp is left skewed more than right skewed
* Rainfall is clearly right skewed and as we saw from describe, the days when rain was there it was high.
* Similar to rainfall, evaporation is highly right skewed, whenever there is rain, it is having higher values
* Sunshine has outliers on both sides, which means there were only less days when the sunshine was warm, many days it was cloudy and equally many days it was a sunshiny day.
* Windgustspeed is right skewed,  as only on rainy days the windgustspeed was very high
* Windspeed9am and Windspeed3pm are right skewed too
* Humidity9am is left skewed and humidity3pm is normally distributed.
* Pressure9am has outliers in both sides and same applies for pressure3pm, the pressure is either too less or too high on rainy days
* Cloud9am and cloud3pm follows almost normal distribution
* Temp9am and temp3pm follows similar distribution with outliers on both sides
* Year, month and specific_day denote 3 columns of the date feature


In [None]:
# For the categorical variables: bar chart and pie chart of the level frequencies
plt.rcParams['figure.figsize'] = 12,5
for col in df_cat:
    fig, (bar_ax, pie_ax) = plt.subplots(1, 2)
    print(col,':')
    level_counts = df_cat[col].value_counts()
    level_counts.plot(kind='bar', rot=0, ax=bar_ax, cmap='summer')
    level_counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, cmap='Spectral')
    plt.show()

#### Inferences from the univariate analysis of the Categorical columns
* From the location column we can observe that almost all the locations contribute to the dataset equally
* The windgustdir is more from the West direction compared to all the other directions
* The winddir9am is more from the North direction compared to all the other directions
* The winddir3pm is more from the South-East direction followed almost equally by West and South directions
* As we guessed from univariate of numerical features, most of the days there was no rain only 22% times of the days, rain occurred.

### Bivariate Analysis

In [None]:
# Box and violin plots of every numerical feature, split by the RainTomorrow class
plt.rcParams['figure.figsize']=10,4
for col in df_num:
    fig,ax= plt.subplots(1,2)
    print(col,'Vs RainTomorrow\n')
    # x= / y= are given by keyword: recent seaborn releases no longer accept
    # two positional data arguments
    sns.boxplot(x=df['RainTomorrow'], y=df_num[col], ax=ax[0], palette='summer')
    sns.violinplot(x=df['RainTomorrow'], y=df_num[col], ax=ax[1], palette='summer')
    plt.show()

#### Inferences from bivariate analysis of numerical columns:
* Whenever the mintemp is very less and very high, those time, Rain is not observed on the next day
* Maxtemp doesn't seem to have much effect on the RainTomorrow, we can only tell that when the temperature is between 15-25, there are more chances of rain
* As we guessed, most of the times when rainfall value is very high, rain is observed on the following day
* Evaporation is lightly contradictory as, when evaporation value is at its maximum,  rain is not observed the next day, it does not have much effect on Rain tomorrow
* Sunshine is having little effect on raintomorrow as, when sunshine is less,  chances of getting raintomorrow is more
* Windgustspeed, if very high, then surely chances of raintomorrow
* Windspeed9am doesn't seem to have much effect on Raintomorrow as even if it is high, there is almost equal chance for it to rain
* Windspeed3pm if very high, then rain occurs
* More chances of rain when humidity is very high
* Pressure when very less, there are chances of getting rain
* When cloud value is very high, less chances of getting rain tomorrow
* Mostly rain occurs with normal temperature between 10-20 and equal chances of not getting rain
* Year, month and day doesn't seem to have much effect on rain, as it is raining uniformly

In [None]:
# Count plot of every categorical feature, with bars coloured by the RainTomorrow class
plt.rcParams['figure.figsize'] = 8,6
for col in df_cat:
    print(col,'Vs RainTomorrow\n')
    # The category is passed as x= by keyword so the call does not depend on how a
    # positional argument is interpreted by the installed seaborn version
    sns.countplot(x=df_cat[col], hue=df['RainTomorrow'], palette='rainbow')
    plt.show()

#### Inferences from the bivariate analysis of the categorical columns:
* Location does not seem to have much effect on Raintomorrow as only very few regions have very less chances of getting rain, all other regions are having good chances for rain
* Windgustdir seems to have a little effect on raintomorrow, when the value is very high then there is some chance for getting rain 
* Winddir9am is not having much effect on raintomorrow, in all directions the chance for rain is almost equal
* Winddir3pm is similar to winddir9am , all directions seem to have equal chances for getting rain
* RainToday is a very informative variable: if it rains today, there is a very high chance that it will also rain tomorrow; if it does not rain today, there is less chance of rain tomorrow

### Hypothesis Testing

##### For categorical Vs categorical, we perform chisquare test of independence
#### Null hypothesis: The feature does not have an effect on RainTomorrow
#### Alternate hypothesis: The feature has an effect on RainTomorrow

In [None]:
from scipy.stats import chi2_contingency
# Chi-square test of independence between each categorical column and RainTomorrow,
# computed on their contingency table
for col in df_cat:
    print('\n',col,'Vs RainTomorrow\n')
    contingency = pd.crosstab(df_cat[col], df['RainTomorrow'])
    print(chi2_contingency(contingency))

#### So, the p-value is less than alpha for all these columns, so we can conclude that all the categorical features have an effect on the RainTomorrow column

##### For numerical features with the target column, we perform normality test to decide upon the test to be performed.
#### Null hypothesis: Data is normally distributed
#### Alternate hypothesis: Data is not normally distributed

In [None]:
from scipy.stats import shapiro
# Shapiro-Wilk test on each numerical feature, separately for the 'Yes' and 'No' groups.
# NOTE(review): SciPy documents that the p-value may be inaccurate for samples larger
# than 5000 - confirm the group sizes before relying on these p-values.
rained = df['RainTomorrow']=='Yes'
stayed_dry = df['RainTomorrow']=='No'
for i in df_num:
    print(i,'Vs RainTomorrow')
    st1 = df.loc[rained, i]
    st2 = df.loc[stayed_dry, i]
    print(shapiro(st1))
    print(shapiro(st2))
    print('\n')

#### Everywhere, the p-value is less than alpha, so the data is not normally distributed. When the data is not normal, we have to perform the non-parametric Mann-Whitney U test.

#### Null hypothesis : The feature does not have an effect on Target variable
#### Alternate hypothesis: The feature has an effect on Target variable

In [None]:
from scipy.stats import mannwhitneyu
# Mann-Whitney U test per numerical feature, comparing the rainy and non-rainy groups
for i in df_num:
    z = df.loc[df['RainTomorrow']=='Yes', i]
    w = df.loc[df['RainTomorrow']=='No', i]
    p_value = mannwhitneyu(z,w)[1]
    print('%s with RainTomorrow, pvalue is:'%i, p_value)
    print('\n')

#### Here too the p-values are all less than alpha, so we can conclude that all the numerical columns have an effect on the target variable

### Transformation and Encoding

In [None]:
# Since the data is skewed, apply a power transform to every numeric column, one column at a time.
# NOTE(review): the transformer is fitted on the full dataset before the train/test split
# further down, so test rows influence the fitted parameters - consider fitting on the
# training split only.
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
numeric_columns = df.select_dtypes(include=np.number).columns
for i in numeric_columns:
    df[i] = pt.fit_transform(df[[i]])

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0
df['RainTomorrow'] = df['RainTomorrow'].map({'Yes':1,'No':0}).astype('int')

In [None]:
# Encoding the remaining categorical (object) features with label encoding.
# LabelEncoder expects a 1-D array of labels, so each column is passed as a Series
# (df[i]) rather than as a one-column DataFrame (df[[i]]).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in df.select_dtypes('object'):
    df[i] = le.fit_transform(df[i])

In [None]:
# The Date column is removed before modelling: Year, Month and Specific_Day already carry it
df = df.drop(columns='Date')

### Model building

In [None]:
# Separating the target (dependent variable) from the predictors (independent variables)
y = df['RainTomorrow']
X = df.drop(columns='RainTomorrow')

In [None]:
# Separating into train (70%) and test data; random_state is fixed so the split is reproducible
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,y,train_size=0.7,random_state=42)

In [None]:
# Scaling the features so the models are not biased towards large-valued columns.
# The scaler is fitted on the training data ONLY and the same fitted mean/std are then
# applied to the test data; re-fitting on the test set would scale the two sets
# differently and leak test statistics into the evaluation.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Rebuild DataFrames (same columns and index) instead of assigning into the split slices
xtrain = pd.DataFrame(sc.fit_transform(xtrain), columns=xtrain.columns, index=xtrain.index)
xtest = pd.DataFrame(sc.transform(xtest), columns=xtest.columns, index=xtest.index)

In [None]:
# Importing the library for balancing the target variable
import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# Balancing the train set using the SMOTE technique (the test set is left untouched).
# random_state is fixed (same seed as the train/test split) so the synthetic samples,
# and therefore every model score below, are reproducible across runs.
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(xtrain,ytrain)

In [None]:
# Importing the metrices
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit on the SMOTE-balanced training data, evaluate on the held-out test data
lr = LogisticRegression().fit(x_sm, y_sm)
ypred_lr = lr.predict(xtest)
train_score_lr = lr.score(x_sm, y_sm)
test_score_lr = lr.score(xtest, ytest)
print('Training Score', train_score_lr)
print('Testing Score', test_score_lr)
print('\n', classification_report(ytest, ypred_lr))
print(confusion_matrix(ytest, ypred_lr))

##### Checking for multicollinearity 

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# Variance inflation factor of every predictor in X, as a one-column table indexed by feature name
design_matrix = X.values
vif_values = [vif(design_matrix, col_idx) for col_idx in range(X.shape[1])]
VIF = pd.DataFrame({'VIF': vif_values}, index=X.columns)
VIF

#### Backward Feature elimination

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# Backward sequential selection with the logistic-regression model, keeping 20 features
sfs2 = sfs(
    lr,
    k_features=20,
    forward=False,
    verbose=False,
    scoring='neg_mean_squared_error',
).fit(x_sm, y_sm)
feat_names = list(sfs2.k_feature_names_)
print(feat_names)

##### Based on VIF and Backward elimination we can eliminate few features

In [None]:
# Remove the same set of features from the balanced training data and from the test data
eliminated_features = ['MaxTemp','Temp3pm','Year','Specific_Day']
x_sm = x_sm.drop(columns=eliminated_features)
xtest = xtest.drop(columns=eliminated_features)

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed so the fitted tree (and the scores below) are reproducible
dt = DecisionTreeClassifier(max_depth=5,criterion='entropy',min_samples_split=7,random_state=42)
dt = dt.fit(x_sm,y_sm)
ypred_dt = dt.predict(xtest)
# Accuracy on the balanced training data vs. the untouched test data
print('Training Score', dt.score(x_sm,y_sm))
print('Testing Score', dt.score(xtest,ytest))
print('\n',classification_report(ytest,ypred_dt))
print(confusion_matrix(ytest,ypred_dt))

In [None]:
plt.rcParams['figure.figsize'] = 6,4
from sklearn.metrics import roc_auc_score,roc_curve
# ROC curve and AUC of the decision tree, from its predicted probability of class 1
y_prob_dt = dt.predict_proba(xtest)[:,1]
auc_dt = roc_auc_score(ytest, y_prob_dt)
roc_points_dt = roc_curve(ytest, y_prob_dt)
fpr_dt, tpr_dt, thr_dt = roc_points_dt
print('AUC:', auc_dt)
# Dashed diagonal drawn as a reference line
plt.plot([0,1], [0,1], linestyle='--')
plt.plot(fpr_dt, tpr_dt)
plt.show()

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the forest, its scores and the feature importances
# reported later are reproducible across runs
rf = RandomForestClassifier(n_estimators=200,max_depth=8,min_samples_split=7,random_state=42)
rf = rf.fit(x_sm,y_sm)
ypred_rf = rf.predict(xtest)
# Accuracy on the balanced training data vs. the untouched test data
print('Training Score',rf.score(x_sm,y_sm))
print('Testing Score', rf.score(xtest,ytest))
print('\n',classification_report(ytest,ypred_rf))
print(confusion_matrix(ytest,ypred_rf))

In [None]:
# ROC curve and AUC of the random forest, from its predicted probability of class 1
y_prob_rf = rf.predict_proba(xtest)[:,1]
auc_rf = roc_auc_score(ytest, y_prob_rf)
roc_points_rf = roc_curve(ytest, y_prob_rf)
fpr_rf, tpr_rf, thr_rf = roc_points_rf
print('AUC:', auc_rf)
# Dashed diagonal drawn as a reference line
plt.plot([0,1], [0,1], linestyle='--')
plt.plot(fpr_rf, tpr_rf)
plt.show()

#### Ada Boost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# random_state is fixed so the boosted ensemble and its scores are reproducible
adb = AdaBoostClassifier(n_estimators=50,random_state=42)
adb = adb.fit(x_sm,y_sm)
ypred_abd = adb.predict(xtest)
# Accuracy on the balanced training data vs. the untouched test data
print('Training Score',adb.score(x_sm,y_sm))
print('Testing Score', adb.score(xtest,ytest))
print('\n',classification_report(ytest,ypred_abd))
print(confusion_matrix(ytest,ypred_abd))

In [None]:
# ROC curve and AUC of the AdaBoost model, from its predicted probability of class 1
y_prob_adb = adb.predict_proba(xtest)[:,1]
auc_adb = roc_auc_score(ytest, y_prob_adb)
roc_points_adb = roc_curve(ytest, y_prob_adb)
fpr_adb, tpr_adb, thr_adb = roc_points_adb
print('AUC:', auc_adb)
# Dashed diagonal drawn as a reference line
plt.plot([0,1], [0,1], linestyle='--')
plt.plot(fpr_adb, tpr_adb)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# random_state is fixed so the boosted trees and their scores are reproducible
gb = GradientBoostingClassifier(n_estimators=50,max_depth=4,random_state=42)
gb = gb.fit(x_sm,y_sm)
ypred_gb = gb.predict(xtest)
# Accuracy on the balanced training data vs. the untouched test data
print('Training Score',gb.score(x_sm,y_sm))
print('Testing Score',gb.score(xtest,ytest))
print('\n', classification_report(ytest,ypred_gb))
print(confusion_matrix(ytest,ypred_gb))

In [None]:
# ROC curve and AUC of the gradient boosting model, from its predicted probability of class 1
y_prob_gb = gb.predict_proba(xtest)[:,1]
auc_gb = roc_auc_score(ytest, y_prob_gb)
roc_points_gb = roc_curve(ytest, y_prob_gb)
fpr_gb, tpr_gb, thr_gb = roc_points_gb
print('AUC:', auc_gb)
# Dashed diagonal drawn as a reference line
plt.plot([0,1], [0,1], linestyle='--')
plt.plot(fpr_gb, tpr_gb)
plt.show()

### Random Forest is the best model from the above models, checking the feature importance based on RF model

In [None]:
# Annotated heatmap of the random forest confusion matrix on the test set
cm = confusion_matrix(ytest, ypred_rf)
heat_ax = sns.heatmap(cm, annot=True, fmt='g', cmap='coolwarm_r')
heat_ax.set_xlabel('Predicted')
heat_ax.set_ylabel('Actual')
heat_ax.set_title('Random Forest Confusion Matrix')
plt.yticks(rotation=0)
plt.show()

In [None]:
from sklearn.metrics import precision_score,recall_score,f1_score
# Test-set metrics of the random forest, printed one per line
rf_metrics = [
    ('Precision Score:', precision_score(ytest, ypred_rf)),
    ('Recall Score   :', recall_score(ytest, ypred_rf)),
    ('F1 Score       :', f1_score(ytest, ypred_rf)),
    ('Accuracy score :', accuracy_score(ytest, ypred_rf)),
    ('ROC AUC Score  :', roc_auc_score(ytest, y_prob_rf)),
]
for metric_label, metric_value in rf_metrics:
    print(metric_label, metric_value)

In [None]:
# Table of the random forest feature importances, most important feature first
important_features = (
    pd.DataFrame({'Features': x_sm.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
important_features

In [None]:
# Horizontal bar chart of the feature importances, in the order of the sorted table
plt.rcParams['figure.figsize'] = 8,5
importance_ax = sns.barplot(data=important_features, x='Importance', y='Features')

# Title and axis labels, all at font size 15
importance_ax.set_title('Feature Importance', fontsize=15)
importance_ax.set_xlabel('Importance', fontsize=15)
importance_ax.set_ylabel('Features', fontsize=15)
plt.show()

* Humidity3pm has the most influence on the RainTomorrow feature
* So, this should be considered for predicting if it will rain tomorrow
* Followed by the features, Cloud3pm, Rainfall , Sunshine and RainToday have more effect on the RainTomorrow feature.
#### The model is 76% accurate and the number of false negatives is also very low. So, this model has good potential.