In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under that mount.
from google.colab import drive
drive.mount('/content/drive')

# Add the "Colab Notebooks" Drive folder to the module search path.
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Name of the regency (kabupaten) being analysed.
# NOTE(review): not referenced by any visible cell — confirm it is used later or remove it.
kabupaten_search = 'tegal'

In [None]:
# Load the Kabupaten Tegal dataset from Google Drive.
# The cells below rely on its YEAR / MO / DY columns and on the T2M_*, RH2M,
# PRECTOTCORR and WS10M_* columns (presumably daily weather variables — confirm the source).
df_kab = pd.read_csv("/content/drive/MyDrive/Project Capstone_IL/Time-Series/kab_tegal (1).csv")

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_kab.head()

In [None]:
# Assemble one datetime column from the separate year / month / day columns.
df_kab['Date'] = pd.to_datetime(
    df_kab[['YEAR', 'MO', 'DY']].rename(
        columns={'YEAR': 'year', 'MO': 'month', 'DY': 'day'}
    )
)

In [None]:
# Preview the first rows to check the newly added 'Date' column.
df_kab.head()

In [None]:
# Print the name of every column that contains at least one missing value.
# The column index is filtered with a boolean mask instead of looping over a
# hard-coded count, so all columns are checked whatever the width of the frame
# (and no positional lookup on a label-indexed Series is needed).
for column_name in df_kab.columns[df_kab.isnull().any()]:
    print(column_name)

In [None]:
# Number of missing values in each column.
df_kab.isnull().sum()

In [None]:
# (rows, columns) before rows with missing values are removed.
df_kab.shape

In [None]:
# Summary statistics of df_kab.
df_kab.describe()

In [None]:
# Columns having less than 1% missing values can simply have their rows ignored.
# A single dropna over the listed columns keeps exactly the rows in which all of
# them are present — the same result as filtering on each column in turn.
df_kab = df_kab.dropna(
    subset=[
        'T2M_MAX',
        'T2M_MIN',
        'T2M_RANGE',
        'RH2M',
        'PRECTOTCORR',
        'WS10M_MAX',
        'WS10M_MIN',
        'WS10M_RANGE',
    ]
)

In [None]:
# (rows, columns) after rows with missing values were removed.
df_kab.shape

In [None]:
# Column dtypes and non-null counts after the missing-value filter.
df_kab.info()

In [None]:
# Drop the year / month / day columns (already combined into 'Date') and the two *_RANGE columns.
drop_cols = ["YEAR","MO","DY","T2M_RANGE","WS10M_RANGE"]
# Reassign instead of dropping in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
df_kab = df_kab.drop(columns=drop_cols, errors="ignore")

In [None]:
# Confirm which columns remain after the drop.
df_kab.info()

In [None]:
# Preview the first rows of the cleaned frame.
df_kab.head()

In [None]:
# (rows, columns) of the cleaned frame that goes into the merge.
df_kab.shape

## Merge Dataset

In [None]:
# Load the price ("harga") dataset for Kabupaten Tegal from Google Drive; it must contain a 'Date' column.
df_target = pd.read_csv("/content/drive/MyDrive/Project Capstone_IL/Time-Series/harga Kabupaten Tegal.csv")

In [None]:
# Preview the first rows of the target frame as loaded.
df_target.head()

In [None]:
# Column dtypes before 'Date' is parsed to datetime.
df_target.info()

In [None]:
# Parse 'Date' to datetime so it has the same dtype as df_kab['Date'] for the merge.
# NOTE(review): the format is inferred by pandas; if the CSV stores day-first dates
# (e.g. dd/mm/yyyy), pass format= or dayfirst=True — confirm against the file.
df_target['Date'] = pd.to_datetime(df_target['Date'])

In [None]:
# Check the dtype of 'Date' after the conversion.
df_target.info()

In [None]:
# Preview the first rows with the parsed 'Date' column.
df_target.head()

In [None]:
# (rows, columns) of the target frame before merging.
df_target.shape

In [None]:
# Join the target frame with the kabupaten frame on their shared 'Date' column.
# Naming the key explicitly keeps the join from silently changing if the two
# frames ever share another column name. The inner join keeps only dates that
# are present in both frames.
df = pd.merge(df_target, df_kab, on='Date', how='inner')

In [None]:
# (rows, columns) kept by the inner join.
df.shape

In [None]:
# Preview the first rows of the merged frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Work on a copy so that changes made to `data` in the cells below leave the merged frame `df` untouched.
data = df.copy()

In [None]:
#data.set_index("Date",inplace=True)

In [None]:
# Preview the last rows of the working copy.
data.tail()

In [None]:
# Write the merged frame to 'data_bawang.csv' (relative path, row index omitted).
df.to_csv('data_bawang.csv', index=False)

# **Exploratory Data Analysis**

### **Statistik Deskriptif**

In [None]:
# Descriptive statistics; left as the last expression so the notebook renders
# the result as a formatted table instead of plain printed text.
data.describe()

### **Distribusi Setiap Variabel**

In [None]:
# Histogram (with KDE) of every column after the first one.
plot_columns = list(data.columns[1:])
n_grid_cols = 4
# Ceiling division sizes the grid to the number of plots; the minimum of 2 rows
# keeps the 2x4 layout when there are at most 8 plots, while more columns no
# longer overflow a fixed grid.
n_grid_rows = max(2, -(-len(plot_columns) // n_grid_cols))
plt.figure(figsize=(12, 4 * n_grid_rows))
# Number the panels by enumeration rather than by column position lookup, which
# does not return a plain integer when column names are duplicated.
for panel_no, column in enumerate(plot_columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, panel_no)
    sns.histplot(data[column], bins=20, kde=True)
    plt.title(f'Distribution of {column}')
plt.tight_layout()
plt.show()

### **Korelasi antar variabel**

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns (such as the datetime 'Date' column) are excluded
# explicitly: since pandas 2.0, DataFrame.corr() no longer drops them by default.
correlation_matrix = data.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

### **Grafik Harga Bawang Terhadap Waktu**

In [None]:
# Move 'Date' into the index so the price series is plotted against time.
# The guard makes the cell re-runnable: after the first run 'Date' is the index,
# not a column, and an unguarded data['Date'] lookup would raise KeyError.
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data = data.set_index('Date')
plt.figure(figsize=(12, 6))
plt.plot(data['value_L'], label='Harga Bawang', color='blue')
plt.title('Harga Bawang Over Time')
plt.xlabel('Date')
plt.ylabel('Harga Bawang')
plt.legend()
plt.show()

### **Distribusi Frekuensi Harga Bawang**

In [None]:
# Frequency distribution of value_L (20 bins with a KDE overlay).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data["value_L"], bins=20, kde=True, ax=ax)
ax.set_title("Distribusi Data value_L")
ax.set_xlabel("value_L")
ax.set_ylabel("Frekuensi")
plt.show()

In [None]:
# Horizontal box plot of value_L.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=data, x="value_L", ax=ax)
ax.set_title("Box Plot value_L")
ax.set_xlabel("value_L")
plt.show()

### **Rata-rata Harga Bawang Berdasarkan Bulan dan Tahun**

In [None]:
# Derive calendar year and month for the monthly / yearly aggregations.
# The dates are taken from the 'Date' column when it exists and from the index
# otherwise, because an earlier cell moves 'Date' into the index with set_index
# and a plain data["Date"] lookup then raises KeyError.
if "Date" in data.columns:
    date_values = pd.DatetimeIndex(data["Date"])
elif isinstance(data.index, pd.DatetimeIndex):
    date_values = data.index
else:
    # Neither a 'Date' column nor a datetime index: fail the same way a missing column would.
    raise KeyError("Date")
data["Year"] = date_values.year
data["Month"] = date_values.month

In [None]:
# Mean value_L for every (Year, Month) pair, with the group keys kept as ordinary columns.
monthly_data = data.groupby(["Year", "Month"], as_index=False)["value_L"].mean()

In [None]:
# Mean value_L for every Year, with the group key kept as an ordinary column.
annual_data = data.groupby("Year", as_index=False)["value_L"].mean()

In [None]:
# Monthly mean price, one line per year.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=monthly_data, x="Month", y="value_L", hue="Year", ax=ax)
ax.set_title("Rata-rata Harga per Bulan")
ax.set_xlabel("Bulan")
ax.set_ylabel("Rata-rata Harga")
ax.legend(title="Tahun")
plt.show()

In [None]:
# Yearly mean price.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=annual_data, x="Year", y="value_L", ax=ax)
ax.set_title("Rata-rata Harga per Tahun")
ax.set_xlabel("Tahun")
ax.set_ylabel("Rata-rata Harga")
plt.show()

## **Cek Dickey Fuller**

Cek Dickey-Fuller adalah alat statistik yang digunakan untuk menguji hipotesis tentang stasioneritas dalam deret waktu. Hasil dari tes ini membantu kita membuat keputusan tentang apakah perlu melakukan differencing (pembedaan, yaitu mengurangkan setiap nilai dengan nilai sebelumnya) atau transformasi lainnya pada data sebelum menerapkan model peramalan.

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the first difference of every numeric column.
# First-order differencing is applied to get p-value < 0.05.
# Non-numeric columns (e.g. the datetime 'Date' column) are skipped: their
# difference is not a numeric series the test can be run on.
for col in df.select_dtypes(include='number').columns:
    # diff() leaves NaN in the first row; dropna removes it together with any
    # other missing values that would otherwise break the test.
    differenced = df[col].diff().dropna()
    ad_fuller_result = adfuller(differenced)
    print(col)
    print(f'ADF Statistic: {ad_fuller_result[0]}')
    print(f'p-value: {ad_fuller_result[1]}')
    print('-----------------------------------')

    value_L
    Nilai ADF Statistic adalah -28.928312803158903.
    Nilai p-value adalah 0.0.

Dalam kasus ini, p-value sangat rendah (0.0), lebih rendah dari tingkat signifikansi yang umumnya digunakan (biasanya 0.05). Oleh karena itu, terdapat cukup bukti statistik untuk menolak hipotesis nol (bahwa deret memiliki unit root). Karena pengujian dilakukan pada data yang sudah di-differencing orde pertama, dapat disimpulkan bahwa **"value_L" adalah deret waktu yang stasioner setelah differencing orde pertama.**