In [None]:
import os
# Print the full path of every file found below the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the weatherAUS CSV from the Kaggle input folder and display the resulting frame.
weather_csv_path = '../input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(filepath_or_buffer=weather_csv_path)
df

# Exploratory Data Analysis (EDA)

In [None]:
# Structural overview: (rows, columns) first, then per-column dtype and non-null counts.
print(df.shape)
print('=' * 50)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
df.info()

In [None]:
# Summary statistics for all columns; include='all' also reports the non-numeric ones.
df.describe(include='all')

## Checking missing values

In [None]:
# Number of missing (null) entries in each column.
df.isnull().sum()

In [None]:
# Per-column table of missing values: absolute count and share of all rows (in %),
# both sorted from most to least missing.
null_mask = df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent1 = null_mask.sum() / null_mask.count() * 100
percent2 = percent1.sort_values(ascending=False)
missing_data = pd.concat({'Total': total, 'Percentage Null %': percent2}, axis=1)
missing_data

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## Visualising the Data Distribution

In [None]:
# Separate the columns handled as categorical (df_cat) from the remaining ones (df_num).
eda_categorical_columns = [
    'Date', 'Location', 'WindGustDir', 'WindDir9am',
    'WindDir3pm', 'RainToday', 'RainTomorrow',
]
df_cat = df.loc[:, eda_categorical_columns]
df_num = df.drop(columns=eda_categorical_columns)

In [None]:
# Visualise the pairwise correlation between the numeric columns.
plt.figure(figsize=(16,8))
# Restrict to numeric dtypes explicitly: since pandas 2.0, DataFrame.corr() no
# longer silently skips text columns (Date, Location, wind directions, ...) and
# raises on them instead. On older pandas the selected columns are the same ones
# corr() used to keep, so the plot is unchanged there.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Histogram grid for the columns of df (one panel per column), without grid lines.
df.hist(figsize=(20,10), grid=False)

In [None]:
# One count plot per column of df_cat, laid out on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
panels = axes.flatten()

# The grid is kept under its own name (`axes`/`panels`) so the loop variable `ax`
# no longer rebinds the name of the array being iterated.
# The column is passed as x=...: seaborn >= 0.12 treats the first positional
# argument of countplot as `data`, so the old positional call fails there.
for cat, ax in zip(df_cat, panels):
    sns.countplot(x=cat, data=df, ax=ax)

# Hide the grid cells that received no plot (fewer columns than panels).
for unused_panel in panels[df_cat.shape[1]:]:
    unused_panel.set_visible(False)

In [None]:
# Scatter-plot matrix over the columns of df.
# NOTE(review): this draws every row of df for every column pair, which is
# presumably very slow on the full dataset -- consider plotting a sample; confirm.
sns.pairplot(df)

In [None]:
# Outlier check: box plots of df's columns drawn side by side on one wide axes.
outlier_fig, outlier_ax = plt.subplots(figsize=(20, 10))
df.boxplot(ax=outlier_ax)

In [None]:
import datetime

# Rainfall plus calendar keys derived from Date; used by the yearly/monthly trend plots.
# .copy() makes df_rain an independent frame, so adding columns to it no longer
# triggers SettingWithCopyWarning on a slice of df.
df_rain = df[['Date', 'Rainfall']].copy()
# Parse the date strings once, vectorised, instead of calling strptime()
# row by row (and twice per row). The resulting strings are identical.
rain_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df_rain['Year'] = rain_dates.dt.strftime('%Y')      # e.g. '2010'
df_rain['Month'] = rain_dates.dt.strftime('%Y-%m')  # e.g. '2010-07'

In [None]:
# Line chart of the mean rainfall per year over the 10-year period.
yearly_fig, yearly_ax = plt.subplots(figsize=(20, 10))
yearly_mean_rainfall = df_rain.groupby('Year')['Rainfall'].mean()
yearly_mean_rainfall.plot(ax=yearly_ax, grid=True)
plt.show()

In [None]:
# Line chart of the mean rainfall per month (year-month key) over the 10-year period.
monthly_mean_rainfall = df_rain.groupby('Month')['Rainfall'].mean()
monthly_fig, monthly_ax = plt.subplots(figsize=(20, 10))
monthly_mean_rainfall.plot(grid=True, ax=monthly_ax)
plt.show()

In [None]:
# Mean rainfall for each location in Australia, as horizontal bars sorted ascending.
location_mean_rainfall = df.groupby('Location')['Rainfall'].mean().sort_values()
location_fig, location_ax = plt.subplots(figsize=(20, 10))
location_mean_rainfall.plot.barh(ax=location_ax, grid=True)
plt.show()

# Data Preprocessing

## Handling missing values

In [None]:
# Drop the columns with more than 35% missing values.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns are already gone from df.
df = df.drop(columns=['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], errors='ignore')

In [None]:
# Create one frame per data type: categorical columns (df_cat) and the rest (df_num).
categorical_cols = ['WindGustDir','WindDir9am','WindDir3pm','RainToday','RainTomorrow','Date','Location']
# .copy() gives df_cat its own data. The following cells assign columns into
# df_cat; on a plain slice of df that raises SettingWithCopyWarning.
df_cat = df[categorical_cols].copy()
df_num = df.drop(columns=categorical_cols)

In [None]:
# Impute missing values in the categorical columns with the most frequent value
# observed at the same Location.

for col in df_cat.columns.values:
    if df[col].isnull().sum() == 0:
        continue
    # transform() returns a Series aligned to df's original index. The previous
    # groupby(...).apply(...) prepends the group key to the index in pandas >= 2.0,
    # which makes the assignment back into df_cat fail.
    # mode() can return several values on ties; max() picks one deterministically.
    # If a Location has no observed value at all, the mode is empty, max() is NaN
    # and that Location's entries stay missing.
    df_cat[col] = df.groupby('Location')[col].transform(lambda s: s.fillna(s.mode().max()))

In [None]:
# Remaining missing values per categorical column after the per-Location imputation.
df_cat.isnull().sum()

In [None]:
# Fallback for WindGustDir: whatever the per-Location pass could not fill
# (presumably Locations with no recorded gust direction at all) gets the overall mode.
# The fill must start from df_cat['WindGustDir']. Starting from the raw
# df['WindGustDir'] discarded the per-Location imputation and replaced it with a
# purely global fill for every missing value.
df_cat['WindGustDir'] = df_cat['WindGustDir'].fillna(df['WindGustDir'].mode().max())

In [None]:
# Impute missing values in the numeric columns with the mean of the same Location.

for col in df_num.columns.values:
    if df[col].isnull().sum() == 0:
        continue
    # transform('mean') broadcasts each Location's mean back onto df's original
    # index, so the result aligns row by row. The previous groupby(...).apply(...)
    # returns a group-keyed MultiIndex in pandas >= 2.0 and breaks the assignment.
    # Locations whose values are all missing have a NaN mean and stay NaN here.
    location_mean = df.groupby('Location')[col].transform('mean')
    df_num[col] = df[col].fillna(location_mean)

In [None]:
# Columns that still contain nulls after the per-Location pass: fill what is left
# with the overall mean of the corresponding column in df.
for leftover_col in ('WindGustSpeed', 'Pressure9am', 'Pressure3pm'):
    overall_mean = df[leftover_col].mean()
    df_num[leftover_col] = df_num[leftover_col].fillna(overall_mean)

In [None]:
# Remaining missing values per numeric column after imputation.
df_num.isnull().sum()

In [None]:
# Replace the Yes/No values of RainToday and RainTomorrow with numeric 1 and 0.
d = dict(Yes=1, No=0)
for yes_no_col in ('RainToday', 'RainTomorrow'):
    df_cat[yes_no_col] = df_cat[yes_no_col].map(d)

In [None]:
# Frequency-encode the remaining categorical columns: every category is replaced
# by the number of rows in which it occurs (its value_counts() entry).
# .copy() gives df_cat2 its own data, so the assignments below no longer raise
# SettingWithCopyWarning on a slice of df_cat.
df_cat2 = df_cat[['WindGustDir','WindDir9am','WindDir3pm','Location']].copy()

# One loop instead of four copy-pasted lines; the counts are computed per column
# before that column is overwritten.
for col in df_cat2.columns:
    df_cat2[col] = df_cat2[col].map(df_cat2[col].value_counts())

In [None]:
# Put the numeric and the frequency-encoded columns side by side, matching rows on the index.
df_new = df_num.merge(df_cat2, left_index=True, right_index=True)
df_new

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column of df_new.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# statistics of the later test rows leak into the features -- consider fitting on
# the training portion only.
scaler = StandardScaler()
# fit_transform() already fits the scaler, so the separate fit() call was redundant.
# Passing index=df_new.index keeps the row labels; the next step merges on the
# index, which would misalign rows if df_new did not have the default 0..n-1 index.
df_scaler = pd.DataFrame(scaler.fit_transform(df_new), columns=df_new.columns, index=df_new.index)

In [None]:
# Add the RainToday column (taken unscaled from df_cat) to the scaled features, matching rows on the index.
df_x = df_scaler.merge(df_cat['RainToday'], left_index=True, right_index=True)

In [None]:
# Histogram of every column of the feature frame df_x (50 bins each, no grid lines).
df_x.hist(bins=50, figsize=(20,10), grid=False)
plt.show()

In [None]:
# Correlation heatmap of df's numeric columns.
plt.figure(figsize=(20,10))
# Select numeric dtypes explicitly: df still holds text columns, and since
# pandas 2.0 DataFrame.corr() raises on them instead of skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Remove the Temp9am and Temp3pm columns from the feature frame, one after the other.
for temp_col in ('Temp9am', 'Temp3pm'):
    df_x.drop(columns=temp_col, inplace=True)

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
features, target = df_x, df_cat['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=50,
)

In [None]:
# Oversampled

from imblearn.over_sampling import SMOTE
from collections import Counter

# Oversample the training split with SMOTE; the test split is left untouched.
sm = SMOTE(random_state=37)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Class counts of the training labels before ("Sebelum") and after ("Sesudah") resampling.
counts_before = Counter(y_train)
counts_after = Counter(y_train_res)
print(f'Sebelum {counts_before}')
print(f'Sesudah {counts_after}')

# Modelling

In [None]:
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree trained on the SMOTE-resampled training data.
# random_state is fixed because scikit-learn permutes the features at every split,
# so ties between equally good splits could otherwise resolve differently from
# run to run, changing the tree and the reported scores.
model_tree = DecisionTreeClassifier(max_depth=3, random_state=50)
model_tree.fit(X_train_res, y_train_res)

# Predictions for the untouched test split.
y_dt_pred = model_tree.predict(X_test)

# Accuracy on the resampled training data (not a held-out score).
model_tree.score(X_train_res, y_train_res)

In [None]:
# Test-set evaluation of the decision tree: confusion matrix, accuracy in percent,
# and the per-class precision/recall/F1 report.
dt_confusion = confusion_matrix(y_test, y_dt_pred)
dt_accuracy_pct = accuracy_score(y_test, y_dt_pred) * 100
print(f'Confusion Matrix \n {dt_confusion}')
print(f'Accuracy Score {dt_accuracy_pct:.2f}')
print(classification_report(y_test, y_dt_pred))