Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

Proses Scraping Data

In [2]:
# Collect one results table per year into `all_df` (best-effort: a year whose
# page cannot be fetched or parsed is reported and skipped, not fatal).
all_df = []

# Yearly result pages from 1993 up to and including 2025 are requested.
for year in range(1993, 2026):
    url = f'https://lottery.hk/en/mark-six/results/{year}'

    try:
        # read_html returns a list with every table parsed from the page;
        # only the first one is kept.
        data = pd.read_html(url)[0]
    except Exception as e:
        # Name the year in the message so a failed download can be traced
        # and retried; the loop then carries on with the next year.
        print(f'Error collecting data {year}: {e}')
        continue

    all_df.append(data)
    print(f'Success Collected {data.shape[0]} Data {year}')

print('Scrapping done!')


Success Collected 113 Data 1993
Success Collected 113 Data 1994
Success Collected 113 Data 1995
Success Collected 118 Data 1996
Success Collected 116 Data 1997
Success Collected 126 Data 1998
Success Collected 122 Data 1999
Success Collected 124 Data 2000
Success Collected 122 Data 2001
Success Collected 125 Data 2002
Success Collected 129 Data 2003
Success Collected 152 Data 2004
Success Collected 167 Data 2005
Success Collected 166 Data 2006
Success Collected 164 Data 2007
Success Collected 161 Data 2008
Success Collected 166 Data 2009
Success Collected 164 Data 2010
Success Collected 166 Data 2011
Success Collected 164 Data 2012
Success Collected 164 Data 2013
Success Collected 164 Data 2014
Success Collected 164 Data 2015
Success Collected 163 Data 2016
Success Collected 165 Data 2017
Success Collected 161 Data 2018
Success Collected 156 Data 2019
Success Collected 36 Data 2020
Success Collected 135 Data 2021
Success Collected 123 Data 2022
Success Collected 158 Data 2023
Success C

Penggabungan Data Hasil Scraping

In [3]:
# Stack the per-year tables into a single frame with a fresh 0..n-1 index.
# When nothing was collected only a notice is printed and `df` is not created.
if not all_df:
    print("Tidak ada data yang tersedia")
else:
    df = pd.concat(all_df, ignore_index=True)
    print(df.shape)

(4668, 4)


In [4]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,Draw Number,Draw Date,Balls Drawn,Details
0,December 1993,December 1993,December 1993,December 1993
1,93/101,30/12/1993,14 16 23 26 32 33 10,
2,93/100,28/12/1993,3 5 12 27 28 39 9,
3,93/099,23/12/1993,2 16 17 21 24 32 20,
4,93/098,16/12/1993,7 17 18 20 24 28 32,


Proses Pemilihan Kolom pada Dataframe

In [5]:
# Rebuild the frame with exactly these four columns, in this order.
# NOTE(review): the preview output shows the scraped column as 'Details', so
# 'Detail' presumably comes back as an all-NaN column here — confirm this is
# intended (the column is dropped again a few cells later).
wanted_columns = ["Draw Number", "Draw Date", "Balls Drawn", "Detail"]
df = pd.DataFrame(data=df, columns=wanted_columns)

Pengubahan Struktur pada Kolom Date  

In [6]:
# Parse the day/month/year strings into datetimes; values that do not match
# the format become NaT instead of raising (errors='coerce').
draw_dates = pd.to_datetime(df['Draw Date'], errors='coerce', format='%d/%m/%Y')
df['Draw Date'] = draw_dates

Proses Menghilangkan Baris Tidak Penting

In [7]:
# Discard rows whose 'Balls Drawn' cell contains a month name (any casing).
# Missing cells count as "no match" (na=False), so such rows are retained.
month_pattern = 'January|February|March|April|May|June|July|August|September|October|November|December'
has_month_name = df['Balls Drawn'].str.contains(month_pattern, case=False, na=False)

# Keep the remaining rows and remove the two columns that are not used afterwards.
df = df.loc[~has_month_name].drop(columns=['Detail', 'Draw Number'])

One-Hot Encoding pada Fitur Balls Drawn

In [8]:
# Turn each whitespace-separated draw string into a list of ints.
df['Balls_List'] = df['Balls Drawn'].map(lambda draw: [int(token) for token in draw.split()])

# Multi-hot encode over the fixed label set 1..49: one 0/1 column per number.
mlb = MultiLabelBinarizer(classes=range(1, 50))
balls_encoded = mlb.fit_transform(df['Balls_List'])

onehot_names = [f'num_{i}' for i in range(1, 50)]
balls_onehot = pd.DataFrame(balls_encoded, index=df.index, columns=onehot_names)

# Keep the raw draw string next to its encoding.
balls_onehot['ball_draws'] = df['Balls Drawn']

# Only the date column is carried over from the old frame; the rest comes
# from the encoded block.
date_only = df[['Draw Date']]
df = pd.concat([date_only, balls_onehot], axis=1)


Pemisahan Special Number & Regular Number

In [9]:
# Tokenise each draw string on whitespace, one token per column.
split_cols = df['ball_draws'].str.split(expand=True)

# The right-most column is taken as the special number; the remaining tokens
# are joined back together with single spaces, skipping missing entries.
regular_tokens = split_cols.iloc[:, :-1]
df['special_num'] = split_cols.iloc[:, -1]
df['ball_draws'] = regular_tokens.apply(lambda row: ' '.join(row.dropna()), axis=1)

Pengubahan Nama Draw Date

In [10]:
# Rename 'Draw Date' to 'date' and convert the special number to a numeric
# dtype, in a single chained expression.
df = (
    df.rename(columns={'Draw Date': 'date'})
      .assign(special_num=lambda frame: pd.to_numeric(frame['special_num']))
)

Proses Sorting Berdasarkan Date

In [11]:
# Order the draws from oldest to newest.
df = df.sort_values(by='date', ascending=True)

Feature Extraction pada Kolom Date

In [12]:
# Calendar parts read from the datetime accessor of the 'date' column.
date_parts = df['date'].dt
df['month'] = date_parts.month
df['day_of_week'] = date_parts.dayofweek
df['day_of_year'] = date_parts.dayofyear

# Sin/cos transform of the month (period 12).
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Sin/cos transform of the day of the week (period 7).
weekday_angle = 2 * np.pi * df['day_of_week'] / 7
df['day_sin'] = np.sin(weekday_angle)
df['day_cos'] = np.cos(weekday_angle)

In [13]:
# The raw month / weekday integers are removed now that their sin/cos
# encodings exist.
df = df.drop(columns=['month', 'day_of_week'])

Penyetelan Ulang Index Dataset

In [14]:
# Use the draw date as the row index (the 'date' column is moved into the index).
df = df.set_index('date')

Feature Extraction untuk Menentukan Frekuensi Bola yang Keluar

In [15]:
# Multi-hot encoding of every number in a draw (regular numbers + special number).
main_balls_list = df['ball_draws'].str.split(' ').map(lambda tokens: [int(tok) for tok in tokens])
special_ball_list = df['special_num'].map(lambda value: [int(value)])
# Adding two Series of lists concatenates the lists element-wise.
all_numbers_list = main_balls_list + special_ball_list

mlb = MultiLabelBinarizer(classes=range(1, 50))
y_array = mlb.fit_transform(all_numbers_list)
y_binary = pd.DataFrame(y_array, index=df.index, columns=mlb.classes_)

# Short-term trend: per-number totals over a 10-row window, moved down one row
# by shift(1) so a row only sees earlier rows; missing values are filled with 0.
hits_last10 = y_binary.rolling(window=10).sum()
short_term = hits_last10.shift(1).fillna(0)
short_term.columns = [f'b{i}_last10' for i in range(1, 50)]

# Medium-term trend: same construction with a 50-row window.
hits_last50 = y_binary.rolling(window=50).sum()
medium_term = hits_last50.shift(1).fillna(0)
medium_term.columns = [f'b{i}_last50' for i in range(1, 50)]

# Attach both feature blocks to the main frame via the shared index.
df = df.join(short_term)
df = df.join(medium_term)

# The rolling sums come back as floats; store them as integers.
freq_cols = [name for name in df.columns if 'last' in name]
df[freq_cols] = df[freq_cols].astype(int)

print(f"Dimensi: {df.shape}")
df.head()

Dimensi: (4279, 154)


Unnamed: 0_level_0,num_1,num_2,num_3,num_4,num_5,num_6,num_7,num_8,num_9,num_10,...,b40_last50,b41_last50,b42_last50,b43_last50,b44_last50,b45_last50,b46_last50,b47_last50,b48_last50,b49_last50
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-01-05,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1993-01-07,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1993-01-12,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1993-01-14,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1993-01-19,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Melihat Kolom

In [16]:
# Display the column labels of the current frame.
df.columns

Index(['num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7', 'num_8',
       'num_9', 'num_10',
       ...
       'b40_last50', 'b41_last50', 'b42_last50', 'b43_last50', 'b44_last50',
       'b45_last50', 'b46_last50', 'b47_last50', 'b48_last50', 'b49_last50'],
      dtype='object', length=154)

Penyetelan pada Kolom ball_draws dan special_num untuk Diposisikan di Bagian Paling Kiri

In [17]:
# Columns that should appear first, in this order.
cols_to_move = ['ball_draws', 'special_num']

# Every other column keeps its current relative position behind them.
all_cols = df.columns.tolist()
other_cols = list(filter(lambda name: name not in cols_to_move, all_cols))
new_order = [*cols_to_move, *other_cols]

df = df[new_order]

In [18]:
# Features (X): cyclical date encodings + rolling frequency counts.
# Names are matched on their fixed prefix/suffix instead of a bare substring,
# so an unrelated column that merely contains "sin", "cos" or "last" somewhere
# in its name is not picked up by accident.
feature_cols = [
    c for c in df.columns
    if '_last' in c or c.endswith(('_sin', '_cos'))
]

# Target (y): the one-hot columns num_1 .. num_49.
# startswith keeps columns such as 'special_num' out even if they are renamed
# in a way that would contain "num_".
target_cols = [c for c in df.columns if c.startswith('num_')]

print(f"Jumlah Fitur: {len(feature_cols)}")
print(f"Jumlah Target: {len(target_cols)}")

# Date string used as the chronological train/test boundary.
cutoff_date = '2024-01-01'

Jumlah Fitur: 102
Jumlah Target: 49


In [19]:
# Chronological split on the date index: rows strictly before the cutoff go to
# training, rows on or after it go to testing.
before_cutoff = df.index < cutoff_date
from_cutoff = df.index >= cutoff_date
train_df = df[before_cutoff]
test_df = df[from_cutoff]

# Feature / target views of each partition.
X_train, y_train = train_df[feature_cols], train_df[target_cols]
X_test, y_test = test_df[feature_cols], test_df[target_cols]

print(f"Shape Training: {X_train.shape}")
print(f"Shape Testing: {X_test.shape}")


Shape Training: (4014, 102)
Shape Testing: (265, 102)


Export to CSV

In [20]:
# Write both partitions to CSV in the working directory (index included).
for frame, path in ((train_df, 'lottery_train.csv'), (test_df, 'lottery_test.csv')):
    frame.to_csv(path)