In [None]:
# Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Download the combined accident records (CSV) from the raw GitHub URL; needs network access.
url = 'https://raw.githubusercontent.com/rawisara01/DSI314/main/Gradient%20boosting%20classifier/combined_files.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
df

In [None]:
# Drop the columns that this analysis does not use.
unused_columns = [
    'Unnamed: 0',
    'ปีที่เกิดเหตุ',
    'ลักษณะการเกิดอุบัติเหตุ',
    'จำนวนรถที่เกิดเหตุ (รวมคันที่ 1)',
    'จังหวัด',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Rename the Thai column headers to short English identifiers used by the rest of the notebook.
thai_to_english = {
    'วันที่เกิดเหตุ': 'date_acc',
    'เวลา': 'time_acc',
    'สายทาง': 'route_acc',
    'ประเภทรถ': 'type_acc',
    'บริเวณที่เกิดเหตุ/ลักษณะทาง': 'scene/road surface',
    'จำนวนผู้เสียชีวิต': 'num_death',
    'รวมจำนวนผู้บาดเจ็บ': 'num_injured',
    'สภาพอากาศ': 'climate_acc',
}
df.rename(columns=thai_to_english, inplace=True)

In [None]:
df.info()

# Data Preprocessing

In [None]:
# Break each compound text column into its parts, then discard the original column:
#   date_acc            -> day_acc, month_acc, years_acc   (split on '/')
#   time_acc            -> hours_acc, minutes_acc          (split on ':')
#   scene/road surface  -> scene_acc, road_surface         (split on '+')
split_specs = [
    ('date_acc', '/', ['day_acc', 'month_acc', 'years_acc']),
    ('time_acc', ':', ['hours_acc', 'minutes_acc']),
    ('scene/road surface', '+', ['scene_acc', 'road_surface']),
]
for source_col, separator, part_cols in split_specs:
    df[part_cols] = df[source_col].str.split(separator, expand=True)
    df = df.drop(columns=[source_col])
df

In [None]:
df.info()

In [None]:
# Cast the hour and minute strings to integers so they can be compared numerically.
clock_cols = ['hours_acc', 'minutes_acc']
df[clock_cols] = df[clock_cols].astype(str).astype(int)

In [None]:
# Conditions for the time-of-day buckets: five consecutive windows over the hour of the accident
# (0-4, 5-9, 10-14, 15-19, 20-24; both ends inclusive).
conditions_time = [
    df['hours_acc'].between(0, 4),
    df['hours_acc'].between(5, 9),
    df['hours_acc'].between(10, 14),
    df['hours_acc'].between(15, 19),
    df['hours_acc'].between(20, 24),
]

# Bucket labels, one per condition above.
values_time = ['0', '1', '2', '3', '4']

# Build the time_acc column.
# The default is given explicitly as a string: np.select's implicit default is the integer 0,
# which cannot be combined with string choices on recent NumPy releases (TypeError).
# '0' matches the value older NumPy versions produced for rows outside every window.
df['time_acc'] = np.select(conditions_time, values_time, default='0')

In [None]:
# Conditions for the severity level, evaluated in order:
#   1. nobody injured and nobody killed
#   2. at least one injury, no deaths
#   3. at least one death
conditions_acc = [
    (df['num_injured'] <= 0) & (df['num_death'] <= 0),
    (df['num_injured'] > 0) & (df['num_death'] <= 0),
    (df['num_injured'] >= 0) & (df['num_death'] > 0),
]

# Severity labels, one per condition above.
values_acc = ['ACC', 'INJ', 'DIE']

# Build the level_acc column.
# The default is given explicitly as a string: np.select's implicit default is the integer 0,
# which cannot be combined with string choices on recent NumPy releases (TypeError).
# '0' matches the value older NumPy versions produced for rows matching no condition.
df['level_acc'] = np.select(conditions_acc, values_acc, default='0')

# The raw counts are no longer needed once the label exists.
df = df.drop(columns=["num_injured", "num_death"])

In [None]:
df.dtypes

--------------------------------------------------------------------

# Exploratory Data Analysis (EDA)

In [None]:
# Build the EDA frame: a separate copy of df without the unused route column.
df_EDA = df.drop(columns=['route_acc'])

In [None]:
# Download a Thai font file and register it with matplotlib so Thai labels can be drawn in plots.
!wget -q https://github.com/Phonbopit/sarabun-webfont/raw/master/fonts/thsarabunnew-webfont.ttf
 
import matplotlib as mpl
# Register the downloaded .ttf file, then make its family the default for all text.
mpl.font_manager.fontManager.addfont('thsarabunnew-webfont.ttf')
mpl.rc('font', family='TH Sarabun New')

In [None]:
# Helper that draws a bar chart of value frequencies.
def val_count_BarPlot(data=None, count_cols=None):
    """Plot how often each distinct value of one column occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the column to count.
    count_cols : str
        Name of the column whose values are counted.

    Returns
    -------
    None
        The chart is shown as a side effect.
    """
    data_count = data[count_cols].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    # Index values are cast to str so every distinct value becomes its own categorical bar.
    ax.bar(data_count.index.astype(str), data_count.values)
    ax.set_title(f"{count_cols} values count")
    # plt.show must be *called*; the bare attribute reference was a no-op.
    plt.show()

In [None]:
# Bar chart of the severity level (level_acc).
val_count_BarPlot(df_EDA, count_cols='level_acc')

In [None]:
# Number of records per severity level.
df_EDA['level_acc'].value_counts()

In [None]:
# Bar chart of the weather column (climate_acc).
val_count_BarPlot(df_EDA, count_cols='climate_acc')

# Severity breakdown for each distinct weather value.
for val in df['climate_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['climate_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Bar chart of the day-of-month column (day_acc).
val_count_BarPlot(df_EDA, count_cols='day_acc')

# Severity breakdown for each distinct day value.
for val in df['day_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['day_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Bar chart of the month column (month_acc).
val_count_BarPlot(df_EDA, count_cols='month_acc')

# Severity breakdown for each distinct month value.
for val in df['month_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['month_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Bar chart of the year column (years_acc).
val_count_BarPlot(df_EDA, count_cols='years_acc')

# Severity breakdown for each distinct year value.
for val in df['years_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['years_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Horizontal count plot of the ten most frequent vehicle types (type_acc).
type_acc = df_EDA['type_acc'].value_counts().head(10).index
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=df_EDA, y='type_acc', order=type_acc, ax=ax)
ax.set_xlabel('จำนวนรถ (คัน)')
ax.set_ylabel('ประเภทรถ');

In [None]:
# Severity breakdown for each distinct vehicle type.
for val in df['type_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['type_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Horizontal count plot of the ten most frequent accident scenes (scene_acc).
scene_acc = df_EDA['scene_acc'].value_counts().head(10).index
fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=df_EDA, y='scene_acc', order=scene_acc, ax=ax)
ax.set_xlabel('จำนวนลักษณะทาง (จำนวน)')
ax.set_ylabel('ลักษณะทาง');

In [None]:
# Severity breakdown for each distinct accident scene.
for val in df['scene_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['scene_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Bar chart of the road_surface column.
val_count_BarPlot(df_EDA, count_cols='road_surface')

In [None]:
# Severity breakdown for each distinct road_surface value.
for val in df['road_surface'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['road_surface'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

In [None]:
# Bar chart of the time-of-day bucket (time_acc).
val_count_BarPlot(df_EDA, count_cols='time_acc')

In [None]:
# Severity breakdown for each time-of-day bucket.
for val in df['time_acc'].unique():
    print(f"{val} level_acc")
    severity = df.loc[df['time_acc'] == val, 'level_acc']
    print(severity.value_counts(), "\n")

---------------------------------------------

### Heatmap (correlation)

In [None]:
# Take an independent copy of df for the correlation heatmap.
df_Htm = df.copy(deep=True)

In [None]:
df_Htm

In [None]:
# Drop the column that is not used in the heatmap.
df_Htm.drop(columns=['route_acc'], inplace=True)

In [None]:
# ตรวจสอบชนิดข้อมูลเบื้องต้น
df_Htm.dtypes

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted for every column in turn.
lab = LabelEncoder()

# Replace each categorical column (including the target) with integer codes.
for categorical_col in ['type_acc', 'climate_acc', 'scene_acc', 'road_surface', 'level_acc']:
    df_Htm[categorical_col] = lab.fit_transform(df_Htm[categorical_col])

In [None]:
# Cast the date parts and the time bucket of the heatmap frame to integers.
htm_int_cols = ['day_acc', 'month_acc', 'years_acc', 'time_acc']
df_Htm[htm_int_cols] = df_Htm[htm_int_cols].astype(str).astype(int)

In [None]:
df_Htm.dtypes

In [None]:
# Show the pairwise correlation table of the encoded heatmap frame.
df_Htm.corr()

In [None]:
# Plot the correlation matrix of the encoded frame as an annotated heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(15,10))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(df_Htm.corr(), annot=True, cmap="plasma", ax=ax)
# plt.show must be *called*; the bare attribute reference was a no-op.
plt.show()

In [None]:
# Drop both clock columns because they showed little correlation.
df = df.drop(columns=["hours_acc", "minutes_acc"])

--------------------------------

# Data preprocessing และ เปรียบเทียบ model

In [None]:
# Build the model-comparison frame: a separate copy of df without the unused route column.
df_select = df.drop(columns=["route_acc"])

In [None]:
df_select

In [None]:
df_select.info()

In [None]:
# Cast the date parts and the time bucket of the model-comparison frame to integers.
# This cell previously re-cast df_Htm (a copy-paste slip), which left these df_select
# columns as strings even though df_select is the frame used for modelling below.
select_int_cols = ['day_acc', 'month_acc', 'years_acc', 'time_acc']
df_select[select_int_cols] = df_select[select_int_cols].astype(str).astype(int)

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted for every column in turn.
lab = LabelEncoder()

# Replace each categorical feature column with integer codes (the target stays as text).
for categorical_col in ['type_acc', 'climate_acc', 'scene_acc', 'road_surface']:
    df_select[categorical_col] = lab.fit_transform(df_select[categorical_col])
df_select

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Prepare the modelling data: the target is level_acc, the features are every other column.
target_col = "level_acc"
y = df_select[target_col]
x = df_select.drop(columns=[target_col])

In [None]:
# Hold out 40% of the rows for testing; the seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=42,
)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [None]:
# Logistic Regression with default hyper-parameters; keep its accuracy on the hold-out set.
lg = LogisticRegression().fit(x_train, y_train)
lg_pred_score = lg.score(x_test, y_test)

In [None]:
# Random Forest Classifier; keep its accuracy on the hold-out set.
# The forest is randomised, so the seed is fixed (same value as the train/test split)
# to make the reported score reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
rf_pred_score = rf.score(x_test, y_test)

In [None]:
# Gradient Boosting Classifier; keep its accuracy on the hold-out set.
# The seed is fixed (same value as the train/test split) so the score and the
# feature importances read from this model are reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(x_train, y_train)
gb_pred_score = gb.score(x_test, y_test)

In [None]:
gb.feature_importances_

In [None]:
# Print each feature of x with its importance in the fitted gradient-boosting model, in percent.
importances = gb.feature_importances_
columns = x.columns

for feature_name, importance in zip(columns, importances):
    print(f" The importance of feature '{feature_name}' is {round(importance*100, 2)}%.")

In [None]:
# Support Vector Classifier with default hyper-parameters; keep its accuracy on the hold-out set.
from sklearn.svm import SVC
svc = SVC().fit(x_train, y_train)
svc_pred_score = svc.score(x_test, y_test)

In [None]:
# Collect the hold-out accuracy of every model into one small table.
# NOTE(review): this rebinds df_select, which until here held the encoded feature frame.
model_names = ['Logistic Regression', 'Random Forest', 'Gradient Boosting', 'SVM']
model_scores = [lg_pred_score, rf_pred_score, gb_pred_score, svc_pred_score]
df_select = pd.DataFrame({'model': model_names, 'accuracy': model_scores})

In [None]:
df_select

In [None]:
# Bar chart of model accuracy, one colour per model.
# Four colours are listed for the four models; with only three, the SVM bar reused
# the Logistic Regression colour.
df_select.plot(kind='bar',x='model',y='accuracy',title='Model Accuracy',legend=False,
        color=['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728'])
# Zoom the y-axis to the 0.5-1 range.
plt.ylim(0.5,1);

--------------------------

# Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Build the final modelling frame: a separate copy of df without the unused route column.
df_model = df.drop(columns=["route_acc"])

In [None]:
df_model

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refitted for every column in turn.
lab = LabelEncoder()

# Replace each categorical feature column with integer codes (the target stays as text).
for categorical_col in ['type_acc', 'climate_acc', 'scene_acc', 'road_surface']:
    df_model[categorical_col] = lab.fit_transform(df_model[categorical_col])

In [None]:
# Prepare the modelling data: the target is level_acc, the features are every other column.
Y = df_model['level_acc']
X = df_model.drop(columns=['level_acc'])

In [None]:
# Hold out 40% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)
tuple(part.shape for part in (X_train, Y_train, X_test, Y_test))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [None]:
# Fit one small gradient-boosting model per learning rate and report
# its accuracy on the training and the test set.

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    ).fit(X_train, Y_train)
    train_accuracy = gb.score(X_train, Y_train)
    test_accuracy = gb.score(X_test, Y_test)
    print("Learning rate: ", learning_rate)
    print(f"Accuracy score (training): {train_accuracy:.3f}")
    print(f"Accuracy score (test): {test_accuracy:.3f}")
    print()

In [None]:
# Refit with learning rate 0.25, then print the confusion matrix and the
# classification report for the test set.

gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.25,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, Y_train)
predictions = gb.predict(X_test)

print("Confusion Matrix:", confusion_matrix(Y_test, predictions), sep="\n")
print()
print("Classification Report", classification_report(Y_test, predictions), sep="\n")

-------------------------------

# Accident prediction

In [None]:
def input_data(type_acc,climate_acc,day_acc,month_acc,years_acc,scene_acc,road_surface,time_acc):
    """Wrap one accident description in a single-row DataFrame.

    Each argument becomes one column, named after the argument and kept in
    the order of the signature; the values are stored unchanged.

    Returns
    -------
    pandas.DataFrame
        One row, eight columns.
    """
    fields = {
        'type_acc': type_acc,
        'climate_acc': climate_acc,
        'day_acc': day_acc,
        'month_acc': month_acc,
        'years_acc': years_acc,
        'scene_acc': scene_acc,
        'road_surface': road_surface,
        'time_acc': time_acc,
    }
    # Every value is wrapped in a one-element list so it forms a one-row column.
    return pd.DataFrame({column: [value] for column, value in fields.items()})

In [None]:
def clean_data2(df1, reference_df=None):
    """Encode the categorical columns of a prediction input frame.

    The previous version ignored ``df1`` (it read the global ``input``) and
    fitted a fresh LabelEncoder on the single input row, so every category was
    encoded as 0 whatever its value. The encoders are now fitted on the
    category strings of a reference frame, so a label receives the same code
    as it gets when the same column of that frame is label-encoded.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input row(s) holding the columns 'type_acc', 'climate_acc',
        'scene_acc' and 'road_surface' as raw category labels.
    reference_df : pandas.DataFrame, optional
        Frame whose columns define the known labels. Defaults to the
        notebook-level ``df``, which still holds the un-encoded labels.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df1`` with the four categorical columns replaced by
        integer codes; ``df1`` itself is left untouched.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when ``df1`` contains a label
        that does not occur in the reference column.
    """
    # Local import keeps the function usable even if the earlier import cell was not run.
    from sklearn.preprocessing import LabelEncoder

    if reference_df is None:
        reference_df = df

    clean_df = df1.copy()
    for categorical_col in ('type_acc', 'climate_acc', 'scene_acc', 'road_surface'):
        encoder = LabelEncoder().fit(reference_df[categorical_col])
        clean_df[categorical_col] = encoder.transform(clean_df[categorical_col])

    return clean_df

In [None]:
#@markdown <h3> กรอกข้อมูลการเกิดอุบัติเหตุ </h3>
# Form cell: the assignments below hold the accident description to classify.
# The trailing "#@param" annotations list the selectable values for each field.
type_acc = "\u0E23\u0E16\u0E1B\u0E34\u0E04\u0E2D\u0E31\u0E1E\u0E42\u0E14\u0E22\u0E2A\u0E32\u0E23" #@param ['รถยนต์นั่งส่วนบุคคล/รถยนต์นั่งสาธารณะ', 'รถปิคอัพบรรทุก 4 ล้อ','รถจักรยานยนต์','รถบรรทุกมากกว่า 10 ล้อ (รถพ่วง)','รถบรรทุก 6 ล้อ','รถบรรทุกมากกว่า 6 ล้อ ไม่เกิน 10 ล้อ','รถตู้','รถโดยสารขนาดใหญ่','รถปิคอัพโดยสาร','อื่นๆ']{allow-input: true}
climate_acc =  "\u0E41\u0E08\u0E48\u0E21\u0E43\u0E2A"#@param ['แจ่มใส', 'ฝนตก', 'มืดครึ้ม', 'อื่นๆ'] {allow-input: true}
day_acc    =  "24"#@param['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12','13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23','24', '25', '26', '27', '28', '29', '30', '31'] {allow-input: true}
month_acc =  "7"#@param ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'] {allow-input: true}
years_acc =  "2022"#@param['2019', '2020', '2021', '2022']{allow-input: true}
scene_acc	 =  "\u0E17\u0E32\u0E07\u0E15\u0E23\u0E07"#@param['อื่นๆ', 'ทางตรง', 'ทางโค้งกว้าง', 'ทางแยกต่างระดับ/Ramps','ทางเชื่อมเข้าพื้นที่สาธารณะ/เชิงพาณิชย์', 'ทางโค้งหักศอก','ทางสามแยก (Y)', 'ทางร่วม', 'ทางสามแยก (T)', 'จุดกลับรถต่างระดับ','ทางเชื่อมเข้าพื้นที่ส่วนบุคคล', 'ทางสี่แยก'] {allow-input: true}
road_surface	 = "\u0E44\u0E21\u0E48\u0E21\u0E35\u0E04\u0E27\u0E32\u0E21\u0E25\u0E32\u0E14\u0E0A\u0E31\u0E19" #@param ['ไม่มีความลาดชัน', 'ที่ลาดชัน'] {allow-input: true}
time_acc = "4"#@param['0', '1', '2', '3', '4'] {allow-input: true}
# Rental_Yield	 =  3.75#@param {type:"number"} {allow-input: true}
# Pack the form values into a one-row DataFrame.
# NOTE(review): the name `input` shadows the Python builtin; rename it only together
# with every piece of code that reads this global.
input = input_data(type_acc, climate_acc, day_acc ,month_acc, years_acc , scene_acc,road_surface, time_acc )
# Encode the categorical fields before handing the row to the model.
clean_input = clean_data2(input)
# NOTE(review): the commented-out lines below reference names (clean_df, Sale_Price_Sqm,
# scaler, rdfr2) that are not defined in the visible part of this notebook - presumably
# leftovers from another project; confirm and delete.
# clean_df2=clean_df.drop('Sale_Price_Sqm', axis=1).copy()
# clean_input1= clean_input1.reindex(labels=clean_df2.columns,axis=1)
# clean_input2 = clean_input1.fillna(0)
# clean_input3 = scaler.transform(clean_input2)
# Predict with the most recently fitted `gb` model.
predictions = gb.predict(clean_input)
# predict = rdfr2.predict(clean_input3)
# Print the predicted severity level.
print('ระดับความรุนแรงของอุบัติเหตุ =', predictions)

----------------

# เตรียมชุดข้อมูลสำหรับการทำ Association Rule Mining

In [None]:
# Take an independent copy of df for the association-rule-mining export.
df_ARM = df.copy(deep=True)

In [None]:
# Split the records into one frame per severity level.
severity = df_ARM['level_acc']
df_ARM1 = df_ARM[severity == 'DIE']
df_ARM2 = df_ARM[severity == 'INJ']
df_ARM3 = df_ARM[severity == 'ACC']

In [None]:
df_ARM1

In [None]:
# Drop the severity column: it is constant within each of the three frames.
# The frames are filtered slices of df_ARM, so the result is rebound rather than
# dropped in place, which avoids pandas' SettingWithCopyWarning.
df_ARM1 = df_ARM1.drop(columns=['level_acc'])
df_ARM2 = df_ARM2.drop(columns=['level_acc'])
df_ARM3 = df_ARM3.drop(columns=['level_acc'])

In [None]:
# Write the 'DIE' records to a CSV file and pass it to google.colab's files.download.
# NOTE(review): google.colab is presumably only importable inside Colab - confirm before running elsewhere.
from google.colab import files
df_ARM1.to_csv('level_acc_DIE.csv') 
files.download('level_acc_DIE.csv')

In [None]:
# Write the 'INJ' records to a CSV file and pass it to files.download.
df_ARM2.to_csv('level_acc_INJ.csv') 
files.download('level_acc_INJ.csv')

In [None]:
# Write the 'ACC' records to a CSV file and pass it to files.download.
df_ARM3.to_csv('level_acc_ACC.csv') 
files.download('level_acc_ACC.csv')