In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

import matplotlib.pyplot as plt
import seaborn as sns

# Let wide frames render (almost) all of their columns instead of being elided with "..."
pd.set_option("display.max_columns", 999)

# Import Datasets

In [2]:
# Load the weather CSV; the "Date" column is parsed into datetimes while reading
df = pd.read_csv(
    "datasets/weatherAUS.csv",
    parse_dates=["Date"],
)

Lakukan ekstraksi tahun, bulan, dan hari dari kolom Date

In [3]:
# Derive calendar features (year, month, day) from "Date", then drop the raw
# timestamp column.
#
# The guard keeps the cell re-runnable: once "Date" has been dropped, running
# the cell a second time would otherwise fail with a KeyError on df["Date"].
if "Date" in df.columns:
    # Build the DatetimeIndex once and reuse it for all three components
    date_index = pd.DatetimeIndex(df["Date"])

    df = df.assign(
        year=date_index.year,    # calendar year extracted from Date
        month=date_index.month,  # month number extracted from Date
        day=date_index.day,      # day of the month extracted from Date
    ).drop(columns="Date")       # Date itself is no longer needed

# show the first rows of the dataframe
df.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow,year,month,day
0,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No,2008,12,1
1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No,2008,12,2
2,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No,2008,12,3
3,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No,2008,12,4
4,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No,2008,12,5


# Cek data yang hilang (missing values)

In [None]:
# Inspect missing values in df with jcopml's helper.
# NOTE(review): presumably this draws a missing-value plot and, because of
# return_df=True, also returns a per-column summary table that the notebook
# then displays — confirm against the jcopml documentation.
plot_missing_value(df, return_df=True)

# Cek kolom-kolom dalam data

In [None]:
# Print a concise summary of df: column names, non-null counts and dtypes
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

# Visualize
### Data Target

In [None]:
# Class balance of the target column "RainTomorrow".
fig, ax = plt.subplots(figsize=(8, 5))

# Name the column via x= and pass the frame via data=. Passing the Series
# positionally is deprecated in seaborn 0.11, and from 0.12 on the only
# positional argument is `data`, so the Series would no longer be used as x.
sns.countplot(x="RainTomorrow", data=df, ax=ax)
ax.set_title("RainTomorrow class counts")

plt.show()

In [None]:
# Histogram + KDE for every numeric measurement column, three plots per row.
#
# The grid is sized from the actual column list instead of a hard-coded
# range(1, 18) / 6x3 layout, which silently capped the figure at the first 17
# numeric columns.
# NOTE(review): year/month/day are excluded explicitly on the assumption that
# the old cap of 17 was meant to leave them out — confirm, or empty this set
# to plot them too.
date_part_columns = {"year", "month", "day"}

# select_dtypes lists the numeric columns directly, without computing the
# summary statistics that df.describe() would produce just to read .columns
numeric_columns = [
    column
    for column in df.select_dtypes(include="number").columns
    if column not in date_part_columns
]

n_cols = 3
n_rows = max(1, (len(numeric_columns) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)

# one subplot per column
for ax, column in zip(axes.ravel(), numeric_columns):
    # sns.distplot is deprecated; histplot with kde=True and stat="density"
    # is the replacement for a density-scaled histogram with a KDE overlay
    sns.histplot(x=df[column], bins=80, kde=True, stat="density", ax=ax)

# Hide grid slots that have no column to show
for unused_ax in axes.ravel()[len(numeric_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()