In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
 
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Dataset

In [None]:
df = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')
df.head()

# Dataset Information

In [None]:
df.info()

### Shape

In [None]:
df.shape

### Duplicate Row Counts

In [None]:
df.duplicated().sum()

#### Descriptive Statistics

In [None]:
df.describe()

# EDA

## Pointer EDA

1. Bagaimana perbandingan antara jumlah hari hujan dan hari tidak hujan?
2. Lokasi mana saja yang sering mengalami hujan?
3. Bulan apa saja yang sering mengalami hujan?
4. Arah angin mana yang paling mempengaruhi hujan?
5. Bagaimana distribusi data dari tiap feature?
6. Bagaimana hubungan antara temperatur, tekanan, dan kelembapan terhadap prediksi hujan?

### 1. Target Feature Count Comparison

In [None]:
# Side-by-side view of the target balance: absolute counts (left) and shares (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# value_counts() drops NaN and sorts by frequency; taking the wedge labels from its
# index keeps every label attached to the class it actually counts, instead of
# relying on a hard-coded ['No', 'Yes'] order.
target_counts = df['RainTomorrow'].value_counts()
labels = list(target_counts.index)

sns.countplot(x='RainTomorrow', data=df, palette='pastel', ax=ax[0])

target_counts.plot.pie(autopct="%1.2f%%", ax=ax[1], colors=['#66b3ff', '#ffcc99'], labels=labels, explode=(0, 0.1), startangle=90)

plt.show()

### 2. Total hari hujan pada lokasi

In [None]:
# Per-location count of rows whose RainTomorrow label is 'Yes', largest first
loc_rain_freq = (
    df.loc[df['RainTomorrow'] == 'Yes']
    .groupby('Location')
    .count()
    .reset_index()
    .sort_values(by=['RainTomorrow'], ascending=False)
)

sns.set_theme(style="whitegrid")
# Only the five locations with the highest counts are drawn
ax = sns.barplot(data=loc_rain_freq.head(), x="RainTomorrow", y="Location", color="b")
ax.set_xlabel('Total Rainy Days')

### 3. Total hujan di tiap bulan

In [None]:
df['DateTime'] = pd.to_datetime(df['Date'].copy())
df['DateTime']

In [None]:
df['Year'] = df['DateTime'].dt.year
df['Month'] = df['DateTime'].dt.month.apply(str)
df['Day'] = df['DateTime'].dt.day

In [None]:
# Per-month count of rows whose RainTomorrow label is 'Yes', largest first
month_rain_freq = (
    df.loc[df['RainTomorrow'] == 'Yes']
    .groupby('Month')
    .count()
    .reset_index()
    .sort_values(by=['RainTomorrow'], ascending=False)
)

sns.set_theme(style="whitegrid")
# Only the five months with the highest counts are drawn
ax = sns.barplot(data=month_rain_freq.head(), x='RainTomorrow', y="Month", color="b")
ax.set_xlabel('Total Rainy Days')

### 4. Arah angin

In [None]:
# Per-gust-direction count of rows whose RainTomorrow label is 'Yes', largest first
wdir_rain_freq = (
    df.loc[df['RainTomorrow'] == 'Yes']
    .groupby('WindGustDir')
    .count()
    .reset_index()
    .sort_values(by=['RainTomorrow'], ascending=False)
)

sns.set_theme(style="whitegrid")
# Only the five directions with the highest counts are drawn
ax = sns.barplot(data=wdir_rain_freq.head(), x='RainTomorrow', y="WindGustDir", color="b")
ax.set_xlabel('Total Rainy Days')

### 5. Distribusi Data

##### Numerical Column

In [None]:
# Used to pick out the numerical columns for the steps that follow
def num_col(data):
    """Return the names of the columns of `data` whose dtype is not `object`.

    The names are returned as a list, in the frame's column order.
    """
    return [name for name in data.columns if data[name].dtype != object]

num_label = num_col(df)
num_label.remove('DateTime')
num_label.remove('Year')
num_label.remove('Day')
num_label

In [None]:
# One histogram per numerical column, coloured by the RainTomorrow label,
# laid out on an 8 x 2 grid of subplots.
plt.figure(figsize=(25, 30))
position = 0
for column in df[num_label].columns:
    position += 1
    plt.subplot(8, 2, position)
    sns.histplot(df, x=column, hue='RainTomorrow', bins=100)
plt.show()

##### Visualisasi Distribusi Data

In [None]:
# NOTE(review): this cell repeats the histogram grid of the previous code cell verbatim.
# It draws one histogram per numerical column, split by RainTomorrow, on an 8 x 2 grid.
plt.figure(figsize=(25,30))
for i,col in enumerate(df[num_label].columns,1):
    plt.subplot(8,2,i)
    sns.histplot(df,x=col, hue='RainTomorrow', bins=100)
plt.show()

### 6. Hubungan antar Feature

In [None]:
# Numerical columns plus the target, so the pair plot can colour points by RainTomorrow.
df_num = pd.concat([df[num_label],df['RainTomorrow']],axis = 1)
# Only the positional slice of rows 45000-45999 is plotted, with a regression fit per panel.
# NOTE(review): presumably a subsample to keep the pair plot fast — confirm the slice is representative.
sns.pairplot( df_num[45000:46000], kind= 'reg', hue= 'RainTomorrow', palette = 'husl' )

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(df[num_label].corr(), annot=True)

# Feature Engineering

### Compiled Dataset Information

In [None]:
def col_info(data):
    """Build a per-column summary of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of `data` with the columns "Variable",
        "Data Total" (non-null count), "Unique Value Total",
        "Missing Data Total", "Missing Data %" and "Data Type".
    """
    # The percentage must be relative to the frame being summarised, not to
    # whatever global `df` happens to exist in the notebook.
    n_rows = len(data)

    records = []
    for col in data.columns:
        series = data[col]
        missing = series.isnull().sum()
        records.append({
            "Variable": col,
            "Data Total": series.count(),
            "Unique Value Total": series.nunique(),
            "Missing Data Total": missing,
            # A frame with columns but no rows has nothing missing.
            "Missing Data %": missing / n_rows * 100 if n_rows else 0.0,
            "Data Type": series.dtype,
        })

    return pd.DataFrame(
        records,
        columns=["Variable", "Data Total", "Unique Value Total",
                 "Missing Data Total", "Missing Data %", "Data Type"],
    )

col_info(df)

### Missing Value Handling

In [None]:
df1 = df.copy()

In [None]:
plt.style.use('ggplot')
plt.figure(figsize=(10,6))
sns.heatmap(df1.isnull(),yticklabels=False,cbar=False)
plt.show()

#### Missing Value Imputation

##### Numerical by Median

In [None]:
df1.isnull().sum()

In [None]:
# Fill the gaps of every numerical column with that column's median.
# The result is assigned back to df1[col]: calling fillna(inplace=True) on the
# df1[col] selection is chained in-place assignment, which emits a FutureWarning
# on recent pandas and leaves df1 unchanged once copy-on-write is enabled.
for col in num_label:
    num_median = df1[col].median()
    df1[col] = df1[col].fillna(num_median)

In [None]:
df1.isnull().sum()

##### Categorical by Mode

In [None]:
# Used to pick out the categorical columns for the steps that follow
def cat_col(data):
    """Return the names of the columns of `data` whose dtype is `object`.

    The names are returned as a list, in the frame's column order.
    """
    return [name for name in data.columns if data[name].dtype == object]

cat_label = cat_col(df1)
cat_label

In [None]:
# Fill the gaps of every categorical column with that column's most frequent value.
# The result is assigned back to df1[col]: calling fillna(inplace=True) on the
# df1[col] selection is chained in-place assignment, which emits a FutureWarning
# on recent pandas and leaves df1 unchanged once copy-on-write is enabled.
# NOTE(review): cat_label is built from every object column of df1, so this also
# fills missing values of the target column if it has any — confirm that is intended.
for col in cat_label:
    cat_mode = df1[col].mode()[0]
    df1[col] = df1[col].fillna(cat_mode)

In [None]:
df1.isnull().sum()

#### Visualization after Missing Value Handling

In [None]:
plt.style.use('ggplot')
plt.figure(figsize=(10,6))
sns.heatmap(df1.isnull(),yticklabels=False,cbar=False)
plt.show()

### Outlier Detection

In [None]:
plt.figure(figsize=(20,50))
for i,col in enumerate(df[num_label].columns,1):
    plt.subplot(8,2,i)
    sns.boxplot(data=df,x=col)
plt.show()

In [None]:
def range_out(data, factor=3, clip_at_zero=True):
    """Compute IQR-based outlier fences for a single numeric column.

    Parameters
    ----------
    data : pandas.Series
        Numeric column; its ``name`` is reported back under ``'var'``.
    factor : float, default 3
        Multiple of the IQR placed below Q1 / above Q3 to form the fences.
    clip_at_zero : bool, default True
        When True (the original behaviour) a negative minimum and a negative
        lower fence are reported as 0. Pass False for columns that can
        legitimately be negative so the real values are reported.

    Returns
    -------
    dict
        Keys ``'var'``, ``'iqr'``, ``'min'``, ``'max'``, ``'lower_fence'``
        and ``'upper_fence'``.
    """
    # Each quantile is computed once and reused for the IQR and the fences.
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    Min = data.min()
    Max = data.max()

    Lower_fence = Q1 - (IQR * factor)
    Upper_fence = Q3 + (IQR * factor)

    if clip_at_zero:
        if Min < 0:
            Min = 0
        if Lower_fence < 0:
            Lower_fence = 0

    return {'var': data.name, 'iqr': IQR, 'min': Min, 'max': Max,
            'lower_fence': Lower_fence, 'upper_fence': Upper_fence}

# Run range_out once per numerical column, then pivot the per-column
# dictionaries into one list per reported quantity.
fence_summaries = [range_out(df[column]) for column in df[num_label].columns]

var_name = [summary['var'] for summary in fence_summaries]
iqr = [summary['iqr'] for summary in fence_summaries]
min_ = [summary['min'] for summary in fence_summaries]
max_ = [summary['max'] for summary in fence_summaries]
out_lower = [summary['lower_fence'] for summary in fence_summaries]
out_upper = [summary['upper_fence'] for summary in fence_summaries]

# One row per column, in the order of num_label
out_info = {
    'Column': var_name,
    'IQR': iqr,
    'Min': min_,
    'Max': max_,
    'out_lower': out_lower,
    'out_upper': out_upper,
}

outlier = pd.DataFrame(out_info)

outlier

### New Feature

In [None]:
df1['AvgTemp'] = (df1['MaxTemp'] + df1['MinTemp'])/2
df1['AvgWindSpd'] = (df1['WindGustSpeed'] + df1['WindSpeed9am'] + df1['WindSpeed3pm'])/3
df1['AvgHum'] = (df1['Humidity3pm'] + df1['Humidity9am'])/2
df1

In [None]:
# Remove the raw date columns and the 9am/3pm and min/max measurements from df1.
df1.drop(
    columns=[
        'Date', 'DateTime',
        'MinTemp', 'MaxTemp',
        'WindSpeed9am', 'WindSpeed3pm',
        'Humidity3pm', 'Humidity9am',
        'Pressure9am', 'Pressure3pm',
        'Temp9am', 'Temp3pm',
    ],
    inplace=True,
)

In [None]:
# Month was stored as a string; convert it to a number so that it is no longer
# picked up as a categorical (object) column below.
df1['Month'] = df1['Month'].astype(float)

# Refresh the list of categorical columns for the current df1. This reuses the
# cat_col helper defined in the "Categorical by Mode" section instead of
# redefining an identical copy of it here.
cat_label = cat_col(df1)
cat_label

### Encode Categorical Data

In [None]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

In [None]:
for col in cat_label:
    df1[col] = le.fit_transform(df1[col])

In [None]:
df1

### Standard Scaler

In [None]:
X = df1.copy().drop('RainTomorrow', axis=1)
y = df1[['RainTomorrow']]

In [None]:
X_col = X.columns
y_col = y.columns

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [None]:
# Fit the StandardScaler on X and replace X with the scaled values (a NumPy array).
# NOTE(review): the scaler is fitted on all rows here, before the train/test split
# performed further down, so test-set statistics leak into the scaling — consider
# fitting on the training rows only.
X = scaler.fit_transform(X)

In [None]:
X = pd.DataFrame(X, columns=X_col)

In [None]:
X.describe()

In [None]:
plt.figure(figsize=(40,40))
for i,col in enumerate(X.columns,1):
    plt.subplot(9,2,i)
    sns.histplot(X,x=col, bins=100)
plt.show()

### Split Data Train & Test

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
X_train.shape

In [None]:
y_train.shape

### Oversampling SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority samples are identical on every run.
smt = SMOTE(random_state=42)

# Oversample the training split only; the test split is left untouched.
# NOTE(review): the model cells below call fit(X_train, y_train), so the
# resampled X_train_sm / y_train_sm are currently not used for training.
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

In [None]:
X_train_sm.shape

In [None]:
y_train_sm.shape

# Model Training

## Decision Tree

### Training

In [None]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

### Score

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# plot_confusion_matrix was removed in scikit-learn 1.2. Import it only where it
# still exists so the metrics above remain importable on newer versions.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

# scikit-learn lays the matrix out as cm[actual, predicted] with the classes in
# sorted order, so index 0 is the negative class (0) and index 1 the positive class (1).
print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# Rows of cm are the actual classes and columns the predicted classes, both in
# sorted order (0 then 1); label the axes accordingly.
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear', random_state=44)

logreg.fit(X_train, y_train)

In [None]:
y_pred_test = logreg.predict(X_test)

y_pred_test

In [None]:
logreg.score(X_train, y_train)

In [None]:
logreg.score(X_test, y_test)

In [None]:
cm = confusion_matrix(y_test, y_pred_test)

print('Confusion matrix\n\n', cm)

# scikit-learn lays the matrix out as cm[actual, predicted] with the classes in
# sorted order, so index 0 is the negative class (0) and index 1 the positive class (1).
print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# Rows of cm are the actual classes and columns the predicted classes, both in
# sorted order (0 then 1); label the axes accordingly.
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
print(classification_report(y_test, y_pred_test))

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors = 3)

neigh.fit(X_train,y_train)

In [None]:
y_predict = neigh.predict(X_test)

#### Score

In [None]:
cm = confusion_matrix(y_test, y_predict)

print('Confusion matrix\n\n', cm)

# scikit-learn lays the matrix out as cm[actual, predicted] with the classes in
# sorted order, so index 0 is the negative class (0) and index 1 the positive class (1).
print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# Rows of cm are the actual classes and columns the predicted classes, both in
# sorted order (0 then 1); label the axes accordingly.
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
print(classification_report(y_test, y_predict))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported scores are reproducible across runs,
# matching the seeded DecisionTree and LogisticRegression models above.
clf = RandomForestClassifier(random_state=42)

In [None]:
clf = clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

### Score

In [None]:
cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

# scikit-learn lays the matrix out as cm[actual, predicted] with the classes in
# sorted order, so index 0 is the negative class (0) and index 1 the positive class (1).
print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
# Rows of cm are the actual classes and columns the predicted classes, both in
# sorted order (0 then 1); label the axes accordingly.
cm_matrix = pd.DataFrame(data=cm, columns=['Predict Negative:0', 'Predict Positive:1'],
                                 index=['Actual Negative:0', 'Actual Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# Share of the most frequent RainTomorrow class among the non-missing labels,
# i.e. the accuracy of always predicting the majority class.
# value_counts() is indexed by the class labels, so the first (largest) count is
# selected by position with .iloc[0]; a bare [0] relies on a deprecated
# integer-key fallback.
df['RainTomorrow'].value_counts().iloc[0]/df['RainTomorrow'].count()