In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")
df.head()

## Data Cleaning

### Handling Missing Values in Categorical Columns

In [None]:
categorical = [var for var in df.columns if df[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :', categorical)

In [None]:
cat1 = [var for var in categorical if df[var].isnull().sum()!=0]
print(df[cat1].isnull().sum())

In [None]:
for var in categorical:
    print(var + ' conatins '+str(len(df[var].unique()))+ " labels ")

### Splitting the Date column into respective 'Year','Month' & 'Day'.**

In [None]:
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

df.drop('Date',axis=1,inplace=True)

In [None]:
categorical = [var for var in df.columns if df[var].dtype=='O']
print("There are {} categorical variables : ".format(len(categorical)))
print(categorical)

### Replacing the missing categorical values by the most frequent value in respective columns. 

In [None]:
for var in categorical:
    df[var].fillna(df[var].mode()[0],inplace=True)

In [None]:
numerical = [var for var in df.columns if df[var].dtype!='O']
print(numerical)

In [None]:
num1 = df[numerical].isnull().sum()
num1 = num1[num1!=0]
num1

### Replacing the missing numercial values by the mean of their respective columns.

In [None]:
for col in num1.index:
    col_mean = df[col].mean()
    df[col].fillna(col_mean,inplace=True)

In [None]:
le = LabelEncoder()
new_df = df
for col in categorical:
    new_df[col] = le.fit_transform(df[col])
col_names = new_df.columns

In [None]:
new_df.head()

## Feature Scaling using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
ss = MinMaxScaler()
new_df = ss.fit_transform(new_df)
new_df = pd.DataFrame(new_df,columns = col_names )

In [None]:
new_df.describe()

In [None]:
# new_df.to_csv("weatherCleaned.csv")

## Data Visualization

### Heatmap of correlation among the columns of data.

In [None]:
correlation = new_df.corr()
plt.figure(figsize=(16,12))
plt.title('Correlation Heatmap of Rain in Australia Dataset')
ax = sns.heatmap(correlation, square=True, annot=True, fmt='.2f', linecolor='white',cmap='viridis')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=30)           
plt.show()

In [None]:
y = new_df.RainTomorrow
X = new_df.drop('RainTomorrow',axis=1)

In [None]:
results = []

## Splitting into training and testing sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42,shuffle=True)

## Applying various classifying algorithms on the training set and predicting the RainTomorrow using training set.

### 1.1 Gaussian Naive Bayes

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
gnb.score(X_test,y_test)

In [None]:
print(accuracy_score(y_test,y_pred))
print(cross_val_score(gnb,X_train,y_train,cv=3))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
results.append(accuracy_score(y_test,y_pred))
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,annot_kws={"size": 12},cmap='viridis',fmt="d")

## Observations :
>> ### GaussianNB implements gaussian naive bayes algorithm for classification.
>> ### It assumes the maximum likelihood of the features to be Gaussian and classifies the dataset accordingly.
>> ### The confusion matrix depicts that 2861 are False Positives and 6 are False Negatives.
>> ### Thus, Gaussian Naive Bayes algorithm is able to predict rain tommorrow with accuracy of 94.95%. 

## 1.2 Decision Trees

In [None]:
dtc = DecisionTreeClassifier(max_depth=10, min_samples_split=2,random_state=42)
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
dtc.score(X_test,y_test)

In [None]:
print(accuracy_score(y_test,y_pred))
print(cross_val_score(dtc,X_train,y_train,cv=3))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
results.append(accuracy_score(y_test,y_pred))
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,annot_kws={"size": 12},cmap='viridis',fmt="d")

## Observations :

>> ### Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. 
>> ### The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
>> ### DecisionTreeClassifier is capable of both binary classification and multiclass classification.
>> ### The confusion matrix shows that there are 0 FP or FN.
>> ### Hence, the DecisionTreeClassifier is able to predict rain tomorrow with an impressive accuracy of 100%.

## 1.3 Support Vector Machines

In [None]:
svc = LinearSVC(random_state=42)
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)
svc.score(X_test,y_test)
print(cross_val_score(svc,X_train,y_train,cv=3))

In [None]:
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
results.append(accuracy_score(y_test,y_pred))
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,annot_kws={"size": 12},cmap='viridis',fmt="d")

## Observations :
>> ### Support vector machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection.
>> ###  LinearSVC is another implementation of Support Vector Classification for the case of a linear kernel.
>> ### This class supports both dense and sparse input and the multiclass support is handled according to a one-vs-the-rest scheme.
>> ### From the confusion matrix it is evident that 18 are FP and 2963 are FN.
>> ### Therby, the LinearSVC is able to predict rain tomorrow with 94.75% accuracy.



## 1.4 Random Forest

In [None]:
rfc = RandomForestClassifier(n_estimators=200,max_depth=10, random_state=42)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
rfc.score(X_test,y_test)

In [None]:
print(accuracy_score(y_test,y_pred))
print(cross_val_score(rfc,X_train,y_train,cv=3))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
results.append(accuracy_score(y_test,y_pred))
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,annot_kws={"size": 12},cmap='viridis',fmt="d")

## Observations :
>> ### In random forests (RandomForestClassifier and RandomForestRegressor classes), each tree in the ensemble is built from a sample drawn with replacement.
>> ### A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
>> ### The confusion matrix depicts that there are only 4 FN.
>> ### Hence, the RandomForestClassifier is able to predict rain tomorrow with 99.99% accuracy.

## Comaprison of Various Classifying algorithms

In [None]:
names = ["Naive Bayes","Decision Tree","Linear SVM","Random Forest",]
results

In [None]:
sns.barplot(names,results)

# Conclusion :

>> ### The Decison Tree Algorithm outperforms other algorithms in terms of precison,accuracy and recall.
>> ### Also,LinearSVM is the lowest in terms of accuracy.
>> ### Gaussian Naive Bayes performs well in case of binary classification.
>> ### Thus, Random Forest and Decision Trees are best suited for binary classification problems.