In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing and understanding data

In [None]:
# Load the weatherAUS CSV from the Kaggle input directory into a DataFrame.
# Every later cell reads or modifies this `train_data` frame.
train_data = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
train_data.head()

In [None]:
# List the column names available in the dataset.
train_data.columns

In [None]:
# (rows, columns) of the loaded frame.
train_data.shape

In [None]:
# Per-column dtype and non-null counts; used to spot columns with missing data.
train_data.info()

In [None]:
# Parse the 'Date' strings into datetime64 values. Doing this before the
# numeric/object column split below means 'Date' is no longer an object column.
parsed_dates = pd.to_datetime(train_data['Date'])
train_data = train_data.assign(Date=parsed_dates)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Visualise where values are missing: each cell of the boolean mask is drawn
# as a heatmap tile, so columns with many gaps stand out.
fig, ax = plt.subplots(figsize=(15,6))
sns.heatmap(train_data.isnull(), ax=ax)

In [None]:
# Share of missing values per column, in percent. The mean of the boolean
# null-mask equals (number of nulls) / (number of rows).
train_data.isnull().mean() * 100

# Data cleaning

In [None]:
# Partition the columns by dtype: object columns go to `category`, everything
# else (floats and the datetime 'Date' column) goes to `contin`.
category = [col for col in train_data.columns if train_data[col].dtype == 'object']
contin = [col for col in train_data.columns if train_data[col].dtype != 'object']

In [None]:
# Impute every non-object column that has missing values with that column's mean.
#
# The original condition was `.isnull().any` without parentheses: a bound method
# object, which is always truthy, so the check never filtered anything. It is now
# actually called. The resulting data is unchanged, because fillna on a column
# without nulls is a no-op.
#
# NOTE(review): the means are computed on the full dataset before the train/test
# split, so test-set information leaks into the imputed values.
for item in contin:
    if train_data[item].isnull().any():
        train_data[item] = train_data[item].fillna(train_data[item].mean())

In [None]:
# Frequency of each distinct 'RainToday' value (value_counts skips NaN by default).
train_data['RainToday'].value_counts()

In [None]:
# Binary-encode 'RainToday': 'Yes' -> 1, 'No' -> 0. Values missing from the
# mapping (including NaN) come out as NaN, so re-running this cell on already
# encoded data would blank the column.
yes_no_codes = {'Yes': 1,'No': 0}
rain_today_encoded = train_data['RainToday'].map(yes_no_codes)
train_data['RainToday'] = rain_today_encoded

In [None]:
# Check the distinct values left in 'RainToday' after the Yes/No encoding.
train_data['RainToday'].unique()

In [None]:
# Binary-encode the target 'RainTomorrow': 'Yes' -> 1, 'No' -> 0. Anything not
# in the mapping (including NaN) becomes NaN.
target_codes = {'Yes': 1,'No': 0}
rain_tomorrow_encoded = train_data['RainTomorrow'].map(target_codes)
train_data['RainTomorrow'] = rain_tomorrow_encoded

In [None]:
# Fill the gaps in every column listed in `category` with that column's most
# frequent value (the first entry returned by mode()).
#
# NOTE(review): `category` was built while 'RainToday' and 'RainTomorrow' were
# still object columns, so both are imputed here too, including the prediction
# target. Confirm that filling the target with its mode is intended.
for col in category:
    most_frequent = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(most_frequent)

In [None]:
# Replace any remaining missing target values with 0 (the "no rain" class).
# The result is assigned back rather than filled with inplace=True on the column
# selection: that chained in-place form is deprecated and, under pandas
# Copy-on-Write, acts on a temporary and leaves `train_data` untouched.
train_data['RainTomorrow'] = train_data['RainTomorrow'].fillna(0)

In [None]:
# Number of missing values left in each column after the imputation steps.
train_data.isnull().sum()

In [None]:
# Label-encode the object columns: set up a single LabelEncoder instance that
# the encoding loop in the next cell re-fits for every column.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [None]:
# Replace each column in `category` with integer codes. The shared encoder is
# re-fitted on every iteration, so afterwards `le` only holds the classes of
# the last column processed.
for col in category:
    encoded_values = le.fit_transform(train_data[col])
    train_data[col] = encoded_values

In [None]:
# Preview the frame again after imputation and label encoding.
train_data.head()

# Let's visualize some trends in the dataset

In [None]:
# Annotated correlation heatmap of the features.
# The frame still contains the datetime 'Date' column at this point, so the
# correlation is computed on the numeric columns only. Older pandas dropped
# non-numeric columns from corr() silently; newer releases no longer default to
# numeric-only, so selecting the columns explicitly behaves the same on both.
plt.figure(figsize=(13,7))

sns.heatmap(train_data.select_dtypes(include='number').corr(),annot=True)

In [None]:
# Joint distribution of MinTemp vs Temp9am, coloured by RainToday.
# jointplot is a figure-level function that creates its own (square) figure, so
# a preceding plt.figure(figsize=...) only produced an extra empty figure. The
# size is set through `height` instead.
sns.jointplot(x='MinTemp',y='Temp9am',kind='scatter',data=train_data,palette='viridis',hue='RainToday',height=7)

In [None]:
# Filled 2-D kernel density estimate of MaxTemp vs Temp3pm, one density per
# RainToday class.
# NOTE(review): weights=1 looks like a constant weight applied to every row,
# which should be equivalent to leaving `weights` out — confirm and drop if so.
sns.kdeplot(x='MaxTemp',y='Temp3pm',data=train_data,fill=True,weights=1,hue='RainToday')

In [None]:
# Remove the columns judged redundant (the two temperature readings explored in
# the plots above) or not used as features ('Date', 'Location').
columns_to_drop = ['Temp3pm','Temp9am','Date','Location']
train_data = train_data.drop(columns=columns_to_drop)

In [None]:
# Separate the prediction target from the feature matrix.
target_col = 'RainTomorrow'
y = train_data[target_col]
X = train_data.drop(columns=[target_col])

# Model building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 400 trees. The forest is seeded (same value as the
# train/test split) so that Restart & Run All reproduces the same model and
# metrics; without a random_state every run gives slightly different results.
rfc = RandomForestClassifier(n_estimators=400, random_state=101)

In [None]:
# Train the forest on the training split.
rfc.fit(X_train,y_train)

In [None]:
# Predicted RainTomorrow class for every row of the held-out split.
pred = rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Confusion matrix on the held-out split (rows: true class, columns: predicted class).
print(confusion_matrix(y_test,pred))

In [None]:
# Per-class precision, recall, F1 and support on the held-out split.
print(classification_report(y_test,pred))