## Tugas Akhir Microcredential Associate Data Scientist

### ITS-05 kelompok 3

#### Preface

![index.png](attachment:index.png)

### Import Necessary Library and Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import missingno as mno

In [2]:
# import 3 periode of season
# 2018-10 ~ 2019-03
# 2019-10 ~ 2020-03
# 2020-10 ~ 2021-03

ignore_data = ["8888"] # temporary, kalau ada ide buat ngolah nilai "8888", command ini bisa dihilangkan

df = pd.read_excel('dataset_iklim_cilacap_before_cleaning.xlsx', na_values = ignore_data)

In [3]:
# Quick data check

print(f"DataFrame shape : {df.shape}\n")
print(f"DataFrame columns: \n{df.columns}\n")
print(f"Columns type: {df.dtypes}")

df.head(3)

DataFrame shape : (547, 11)

DataFrame columns: 
Index(['Tanggal', 'Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss', 'ff_x', 'ddd_x',
       'ff_avg', 'ddd_car'],
      dtype='object')

Columns type: Tanggal     object
Tn         float64
Tx         float64
Tavg       float64
RH_avg     float64
RR         float64
ss         float64
ff_x       float64
ddd_x      float64
ff_avg     float64
ddd_car     object
dtype: object


Unnamed: 0,Tanggal,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car
0,01-10-2018,23.0,30.4,27.2,83.0,,3.2,5.0,140.0,2.0,E
1,02-10-2018,26.0,,27.7,79.0,0.0,,4.0,130.0,2.0,N
2,03-10-2018,25.0,30.4,27.2,80.0,,8.9,7.0,150.0,3.0,E


In [5]:
# Quick data prep

# kita ingin mengolah dataset ini di domain timeseries
from datetime import datetime
df['datetime'] = [datetime.strptime(x,'%d-%m-%Y') for x in df['Tanggal']] # change string 'Tanngal' to timeseries format

# change datetime as index; 
df = df.set_index('datetime')
df.drop('Tanggal', axis=1, inplace=True)

# change 'ddd_car' to categorical
df['ddd_car'] = pd.Categorical(df.ddd_car)

# renaming columns
df.rename(columns={"RR": "curah_hujan", "Tavg": "temp_avg", "RH_avg": "humid_avg", 
                   "ss": "sun_expo", "ff_x": "wind_max", "ff_avg": "wind_avg", "Tn": "temp_min", 
                   "Tx": "temp_max", "ddd_car": "wind_dir", "ddd_x": "wind_dir_max"}, inplace=True)



# Menambahkan kolom 'Year', 'Month', 'Weekday' untuk mempermudah visualisasi dan statiscical need
df['Year'] = df.index.year
df['Month'] = df.index.month
df['Weekday'] = df.index.weekday

In [6]:
# Dividing 1 dataset to 3 periode (just for easier visualization & statistical description)
# 2018-10 ~ 2019-03
# 2019-10 ~ 2020-03
# 2020-10 ~ 2021-03

df_periode1 = df.loc['2018-10-01':'2019-03-01']
df_periode2 = df.loc['2019-10-01':'2020-03-01']
df_periode3 = df.loc['2020-10-01':'2021-03-01']

In [None]:
df_periode1.head(3)

### Exploratory Data Analysis

In [None]:
#### Curah Hujan

In [None]:
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14,12))

axes[0].plot(df_periode1['curah_hujan'], linestyle=':')
axes[0].set_title('Periode 2018-10 ~ 2019-03')

axes[1].plot(df_periode2['curah_hujan'], linestyle=':')
axes[1].set_title('Periode 2019-10 ~ 2020-03')

axes[2].plot(df_periode3['curah_hujan'], linestyle=':')
axes[2].set_title('Periode 2020-10 ~ 2021-03')

plt.tight_layout()

##### Curah hujan per periode

In [None]:
# Note : Urutan bulan boxplots dibawah dimulai dari yang terkecil ke terbesar. Keep that in mind when analyzing.
# Mau diubah ke 10-11-12-01-02-03 tapi masih belum bisa

In [None]:
df_periode1.boxplot(column=['curah_hujan'], by='Month', figsize=(8, 4))
plt.title('Curah Hujan periode 2018-10 ~ 2019-03')
plt.tight_layout()

df_periode2.boxplot(column=['curah_hujan'], by='Month', figsize=(8, 4))
plt.title('Curah Hujan periode 2019-10 ~ 2020-03')
plt.tight_layout()

df_periode3.boxplot(column=['curah_hujan'], by='Month', figsize=(8, 4))
plt.title('Curah Hujan periode 2020-10 ~ 2021-03')
plt.tight_layout()


In [None]:
def pp(judul, *args):
    sns.pairplot(df, x_vars=args, y_vars=['curah_hujan'], height=4, aspect=1, kind='scatter', plot_kws={'alpha':0.4})
    plt.suptitle(judul)
    plt.tight_layout()
    plt.show()
    
pp('Suhu vs Curah Hujan', 'temp_min', 'temp_max', 'temp_avg')
pp('Kecepatan Angin vs Curah Hujan', 'wind_max', 'wind_avg')
pp('Arah mata angin vs Curah Hujan', 'wind_dir_max')
pp('Kelembapan vs Curah Hujan', 'humid_avg')
pp('Lamanya penyinaran matahari vs Curah Hujan', 'sun_expo')

### Data Preparation (data cleaning)

### Before Modeling

#### Train-Validation Split

In [7]:
# just a temporary dropna, soalnya belum di cleaning
print(f"Before dropna : {df.shape}")
df.dropna(axis=0, inplace=True)
print(f"After dropna : {df.shape}")

Before dropna : (547, 13)
After dropna : (312, 13)


In [None]:
# Here we use 8 features, temporary

X = df.loc[:, ['temp_min', 'temp_max', 'temp_avg', 'humid_avg', 'wind_max','wind_avg', 'wind_dir_max', 'sun_expo']]
y = df.loc[:, ['curah_hujan']]

In [8]:
# COBA

X = df.loc[:, ['temp_min', 'temp_max', 'temp_avg', 'humid_avg', 'wind_max','wind_avg', 'sun_expo']]
y = df.loc[:, ['curah_hujan']]

#### Feature Scaling

Kenapa saya melakukan Feature scaling setelah train-test split?

"The fit method is calculating the mean and variance of each of the features present in our data. 
The transform method is transforming all the features using the respective mean and variance.

Now, we want scaling to be applied to our test data too and at the same time do not want to be biased with our model. 
We want our test data to be a completely new and a surprise set for our model. 
The transform method helps us in this case."

sumber : https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe

### Feature Engineering

In [None]:
# Feature Selection

In [15]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
y = y.values.ravel()
model.fit(X, y)


ValueError: Unknown label type: 'continuous'

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y.values.ravel())


In [None]:
# melakukan plot dari feature importances
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

### Modeling

#### Regresi Linear (without feature scaling)

Untuk melakukan regresi linear, kita perlu mengetahui independent variable mana yang paling berpengaruh ke dependent variabel (curah_hujan)

In [None]:
plt.figure(figsize=(9, 9))
sns.heatmap(pd.concat([X, y], axis=1).corr() ,annot=True ,fmt=".2f").set_title("Korelasi Heatmap Calon Variabel X")
plt.show()

Dari hasil visualisasi diatas, dapat diketahui humid_avg memiliki korelasi yang tinggi terhadap kolom curah_hujan / variabel dependent sehingga kita mengambil fitur/kolom humid_avg untuk di training

    Independent variabel(X) adalah humid_avg.
    Dependent variabel(y) adalah curah_hujan.



In [None]:
# Train-Validation Split
from sklearn.model_selection import train_test_split

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X.iloc[:, 3].values.reshape(-1, 1), y.values.reshape(-1,1), train_size=0.7, random_state = 100)

In [None]:
from sklearn import linear_model

regressor = linear_model.LinearRegression()
regressor.fit(X_train_reg, y_train_reg)

In [None]:
print(regressor.coef_)
print(regressor.intercept_)

In [None]:
regressor.score(X_test_reg, y_test_reg)

Model kita mendapatkan accuracy score sebesar 19.08%

In [None]:
y_prediksi = regressor.predict(X_test_reg)

plt.scatter(X_test_reg, y_test_reg)
plt.plot(X_test_reg, y_prediksi, c='r')
plt.xlabel('Kelembapan rata-rata (%)')
plt.ylabel('Curah hujan (mm)')
plt.title('Plot Kelembapan rata-rata vs Curah Hujan')

###### Decision Tree (without feature scaling)

In [None]:
X = df.loc[:, ['humid_avg']].values
y = df.loc[:, ['curah_hujan']].values

In [None]:
# Train-Validation Split
# Using average humidity
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state = 100)

In [None]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

In [None]:
# Visualization
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='blue', alpha=0.3)
plt.plot(X_grid, regressor.predict(X_grid), color='red')
plt.title('Kelembapan rata-rata vs Curah hujan')
plt.xlabel('Kelembapan rata rata (%)')
plt.ylabel('Curah hujan (mm)')

In [None]:
#from sklearn import tree

plt.figure(figsize=(20, 20))
tree.plot_tree(regressor, filled = True)
plt.show()

##### Random Forest