In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file beneath the Kaggle input directory and echo its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploring Data

In [None]:
# Read weatherAUS.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
# Rows without a 'RainTomorrow' value carry no usable label for the classifier;
# drop them instead of inventing a label during the imputation below.
df = df.dropna(subset=['RainTomorrow'])
# Fill the remaining gaps column by column with that column's most frequent
# value (the first row returned by df.mode()).
df = df.fillna(df.mode().iloc[0])

In [None]:
df.shape

In [None]:
df.describe().transpose()

# Preprocessing

In [None]:
from sklearn import preprocessing
# Replace each listed column with integer codes. A single encoder instance is
# re-fitted for every column in turn, so after the loop it is fitted on the
# last column only.
le = preprocessing.LabelEncoder()
for encoded_column in ('WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow'):
    df[encoded_column] = le.fit_transform(df[encoded_column])

In [None]:
df.dtypes

# Visualizing Data

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt 
# Plot a histogram for every numeric column. pandas lays out one subplot per
# column itself; handing it a single pre-made Axes makes it clear that figure
# and emit a UserWarning, so only the figure size is passed.
df.hist(figsize=(20, 15))
plt.show()

In [None]:
# Remove the 'Date' and 'Location' columns from the frame.
df = df.drop(columns=['Date', 'Location'])

In [None]:
# Remember the column labels so they can be reattached after scaling.
colum_name = df.columns

## Standardizing the Data

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column (fit and transform in one pass) and wrap the
# resulting array in a new DataFrame; column labels are reattached afterwards.
# NOTE(review): the scaler is fit on all rows before any train/test split, so
# held-out rows influence the scaling statistics.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)
df_scale = pd.DataFrame(scaled_values)

In [None]:
# Restore the column labels: the frame built from the scaler output has none.
df_scale.columns = colum_name
# The scaler was fit on the whole frame, target included, so 'RainTomorrow' now
# holds scaled floats; re-encode it so the target is integer class labels again.
df_scale['RainTomorrow'] = le.fit_transform(df_scale['RainTomorrow'])

In [None]:
df_scale.head()

In [None]:
# Separate the target column from the feature matrix.
target_column = 'RainTomorrow'
y = df_scale[target_column]
X = df_scale.drop(columns=[target_column])

In [None]:
X.head()

In [None]:
from yellowbrick.target import FeatureCorrelation
# Plain Python list of the feature column labels, used to label the plot.
feature_names = X.columns.tolist()

### Feature Correlation 

In [None]:
# Visualise how strongly each feature correlates with the target.
visualizer = FeatureCorrelation(labels=feature_names)
visualizer.fit(X, y)
# show() is the current Yellowbrick name for rendering; poof() is the deprecated alias.
visualizer.show()

In [None]:
# Box plot of every column of the scaled frame, drawn before any outlier rows
# are removed.
boxplot = df_scale.boxplot(figsize=(40, 12))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Model 1: Before Removing Outliers

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest trained on the scaled data with outliers still present.
# A fixed seed (the same value used for the train/test split) makes the fitted
# forest, and therefore the reported accuracy, reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_s, y_train_s)

In [None]:
y_pred_s = model.predict(X_test_s)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test_s, y_pred=y_pred_s)

In [None]:
def drop_outliers(df, field_name):
    """Remove, in place, the rows of ``df`` whose ``field_name`` value is an IQR outlier.

    A value counts as an outlier when it falls outside the interval
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where ``Q1`` and ``Q3`` are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is modified in place; rows are removed by index label.
    field_name : str
        Name of the numeric column whose values decide which rows are removed.

    Returns
    -------
    None
    """
    values = df[field_name]
    # Both quartiles are computed once, before any row is removed, so the two
    # fences describe the same distribution.
    q1, q3 = np.percentile(values, [25, 75])
    whisker = 1.5 * (q3 - q1)
    # The lower fence must subtract the whisker from Q1; cutting at Q1 itself
    # would throw away roughly a quarter of the rows on every call.
    lower_fence = q1 - whisker
    upper_fence = q3 + whisker
    is_outlier = (values < lower_fence) | (values > upper_fence)
    df.drop(values[is_outlier].index, inplace=True)

In [None]:
df_scale.columns

In [None]:
# Filter the scaled frame one column at a time, in this order; each call
# removes rows in place, so later columns are evaluated on the rows that
# survived the earlier ones.
outlier_columns = [
    'MinTemp',
    'Rainfall',
    'Evaporation',
    'WindGustSpeed',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
]
for outlier_column in outlier_columns:
    drop_outliers(df_scale, outlier_column)

In [None]:
# Same box plot of the scaled frame, redrawn now that drop_outliers has been
# applied to it.
boxplot = df_scale.boxplot(figsize=(40, 12))

In [None]:
df_scale.shape

In [None]:
# Rebuild the feature matrix and target from the outlier-filtered frame.
target_column = 'RainTomorrow'
y = df_scale[target_column]
X = df_scale.drop(columns=[target_column])

In [None]:
# Hold out 20% of the filtered rows for evaluation. The seed is fixed (same
# value as the first split) so the partition, and the accuracy computed from
# it, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Model 2: After Removing Outliers

In [None]:
# Random forest trained on the outlier-filtered data. The seed is fixed so the
# fitted forest is reproducible and comparable between runs.
model_2 = RandomForestClassifier(random_state=42)
model_2.fit(X_train, y_train)

In [None]:
y_pred = model_2.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

## Model 3: Drop Less-Correlated Features

In [None]:
# Feature matrix without the target and without the five listed feature
# columns; the target is taken unchanged from the same frame.
excluded_columns = ['RainTomorrow', 'MaxTemp', 'Pressure9am', 'Pressure3pm', 'Sunshine', 'Temp3pm']
X_less_corr = df_scale.drop(columns=excluded_columns)
y_less_corr = df_scale['RainTomorrow']

In [None]:
# Hold out 20% of the reduced-feature data for evaluation. The seed is fixed
# (same value as the first split) so the partition is the same on every run.
# Note that this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X_less_corr, y_less_corr, test_size=0.2, random_state=42
)

In [None]:
# Random forest trained on the reduced feature set. The seed is fixed so the
# fitted forest is reproducible and comparable between runs.
model_3 = RandomForestClassifier(random_state=42)
model_3.fit(X_train, y_train)

In [None]:
y_lcorr_pred = model_3.predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_lcorr_pred)