In [None]:
#Project1 PM2.5 Forecasting (72 hours in advance)

https://docs.google.com/presentation/d/18KoEohzXjadHBcKxbkhgMuEXkSOlvcwJtEs5K1oPQa8/edit#slide=id.p1

*   6 stations in each region
*   Training data: 2016/03 – 2019/03
*   Testing data: 2019/04 – 2020/03
*   It is hourly data.
*   The forecast must be every 6 hours (12PM, 6PM, 12AM, 6AM). 
*   At each prediction, you must forecast 3 days in advance (72 data points).

You must incorporate at least 2 exogenous variables into your model.

There are 4 types of data

1.   PM2.5 (hourly)
2.   Wind (every 3 hours)
3.   Temperature (every 3 hours)
4.   Fire hot spot (event based)
  *   There are 6 important features:
  *   Latitude, longitude
  *   ACQ_Date, ACQ_Time (You need to combine these two columns and convert to be UTC. Finally, you must use UTC+7 for Thailand.)
  *   FRP (fire radiative power), Brightness

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, time
import warnings
import math
import itertools
warnings.filterwarnings("ignore")
from datetime import datetime,timedelta
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from numpy import array
from keras.models import Sequential
# from keras.layers import LSTM
from keras.layers.convolutional import Conv1D    
from keras.layers import LSTM,Dense, Dropout, Activation, Bidirectional, Masking
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error
from numpy.random import seed

Data description

1. PM2.5

   - ช่วงเวลา: 1 ชม.

   คำอธิบายคอลัมน์

   - PM2.5: ค่า pm2.5 (µg / m3)

   - วันที่ชั่วโมงและคุณลักษณะวันที่และเวลาอื่น ๆ ใน UTC (ดังนั้นต้องจัดให้เป็นเขตเวลาเอเชีย / กรุงเทพก่อนนำไปใช้ (+7))

2. ลม

   คำอธิบายคอลัมน์

   - Wind dir: ทิศทางลม (องศา)

   - ความเร็วลม: ความเร็วลมที่ความสูง 850 hpa หรือ 1.5 กม.

   - lat, long: ละติจูด, ลองติจูด

   - วันที่และเวลาในโซนเวลาเอเชีย / กรุงเทพฯ

3. อุณหภูมิ

   คำอธิบายคอลัมน์

    - อุณหภูมิ: แบบที่ระดับพื้นผิว (° c)

    - lat, long: ละติจูด, ลองติจูด

    - วันที่และเวลาในโซนเวลาเอเชีย / กรุงเทพฯ

4. จุดไฟ

    คำอธิบายคอลัมน์

    - frp (พลังการแผ่รังสีไฟ): กำลังของ firehotspot (MW)

   - ความสว่าง: ความสว่างของ fire hotspot โดยจะมี bright_ti4, bright_ti5 ซึ่งหมายถึง channel ที่แตกต่างกันในการวัด

  - lat, long: ละติจูด, ลองติจูด ของจุดความร้อน (fire hotspot)

  - acq_date, acq_time ใน UTC (ดังนั้นต้องเรียงให้เป็นเขตเวลา Asia / BKK ก่อนนำไปใช้)

In [None]:
def toDF(city,pm25_filename,temp_filename,wind_filename,plus):
    """Build the hourly training frame for one city.

    Reads the tab-separated PM2.5 file and the 3-hourly temperature and wind
    CSVs from the city's ``Train`` folder and joins them on the timestamp.

    Parameters
    ----------
    city : str
        Folder name of the city under the dataset root.
    pm25_filename, temp_filename, wind_filename : str
        File names inside ``<city>/Train/``.
    plus : number
        Offset added to every PM2.5 reading; results below zero are set to 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by naive Asia/Bangkok timestamps, with columns
        ``Temp, WindSpeed, WindDir, PM25`` (in that order). Only timestamps
        present in all three sources are kept (inner merge).
    """
    train_dir = '../input/pm2520212/DS_kaggle_edited_v2/' + city + '/Train/'

    # PM2.5: ten preamble lines, then year / month / day / hour / value / two unused columns.
    pm25 = pd.read_csv(train_dir + pm25_filename, sep='\t', header=None, skiprows=10)
    pm25['Time'] = pd.to_datetime({'year': pm25[0], 'month': pm25[1], 'day': pm25[2], 'hour': pm25[3]})
    pm25 = pm25.drop([0, 1, 2, 3, 5, 6], axis=1)
    # Timestamps are UTC in the file; convert to Bangkok wall-clock time and drop the tz.
    pm25['Time'] = pm25['Time'].dt.tz_localize('UTC').dt.tz_convert('Asia/Bangkok').dt.tz_localize(None)
    pm25.columns = ['PM25', 'Time']
    pm25 = pm25.set_index('Time')
    pm25 = pm25[~pm25.index.duplicated(keep='first')].copy()
    pm25.index = pd.DatetimeIndex(pm25.index)
    # Shift, then clamp at zero. clip() replaces the chained-indexing write
    # `pm25['PM25'][mask] = 0`, which may silently do nothing on a copy.
    pm25['PM25'] = (pm25['PM25'] + plus).clip(lower=0)

    # Temperature: 3-hourly readings forward-filled to an hourly grid.
    temp_df = pd.read_csv(train_dir + temp_filename)
    temp_df['datetime'] = pd.to_datetime(temp_df['datetime'])
    temp_df = temp_df.set_index('datetime').drop(columns=['lat', 'long'])
    temp_df.columns = ['Temp']
    temp_df = temp_df.resample('h').ffill()

    # Wind: same treatment as temperature.
    wind_df = pd.read_csv(train_dir + wind_filename)
    wind_df['datetime'] = pd.to_datetime(wind_df['datetime'])
    wind_df = wind_df.set_index('datetime').drop(columns=['lat', 'long'])
    wind_df.columns = ['WindDir', 'WindSpeed']
    wind_df = wind_df.resample('h').ffill()

    df = pm25.merge(temp_df, left_index=True, right_index=True)
    df = df.merge(wind_df, left_index=True, right_index=True)

    return df[['Temp', 'WindSpeed', 'WindDir', 'PM25']]

In [None]:
def toDFtest(city,pm25_filename,temp_filename,wind_filename):
    """Build the hourly test frame for one city.

    Reads the two-column PM2.5 CSV and the 3-hourly temperature and wind
    CSVs from ``<city>/Test/`` and inner-joins them on the timestamp.
    Returns a frame with columns ``Temp, WindSpeed, WindDir, PM25``.
    """
    test_dir = '../input/pm2520212/DS_kaggle_edited_v2/' + city + '/Test/'

    def _hourly(csv_name, value_columns):
        # Index by timestamp, keep only the measurements, forward-fill to hourly.
        frame = pd.read_csv(test_dir + csv_name)
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        frame = frame.set_index('datetime').drop(columns=['lat', 'long'])
        frame.columns = value_columns
        return frame.resample('h').ffill()

    pm25 = pd.read_csv(test_dir + pm25_filename)
    pm25.columns = ['Time', 'PM25']
    pm25 = pm25.set_index('Time')
    pm25.index = pd.DatetimeIndex(pm25.index)

    merged = pm25.merge(_hourly(temp_filename, ['Temp']), left_index=True, right_index=True)
    merged = merged.merge(_hourly(wind_filename, ['WindDir', 'WindSpeed']), left_index=True, right_index=True)
    return merged[['Temp', 'WindSpeed', 'WindDir', 'PM25']]

###Chanthaburi

In [None]:
# Chanthaburi training frame (no PM2.5 offset).
df_Chanthaburi = toDF(
    city='Chanthaburi',
    pm25_filename='Chanthaburi.txt',
    temp_filename='3H_temperature_Chanthaburi.csv',
    wind_filename='3H_wind_Chanthaburi.csv',
    plus=0,
)
df_Chanthaburi

In [None]:
# Chanthaburi test frame.
df_Chanthaburi_test = toDFtest(
    city='Chanthaburi',
    pm25_filename='Chanthaburi (Thailand timezone).csv',
    temp_filename='3H_temperature_Chanthaburi.csv',
    wind_filename='3H_wind_Chanthaburi.csv',
)
df_Chanthaburi_test

###Chiangmai

In [None]:
# Chiang Mai training frame (no PM2.5 offset).
df_Chiangmai = toDF(
    city='Chiang Mai',
    pm25_filename='Chiang_Mai.txt',
    temp_filename='3H_temperature_Chiang Mai.csv',
    wind_filename='3H_wind_Chiang Mai.csv',
    plus=0,
)
df_Chiangmai

In [None]:
# Chiang Mai test frame.
df_Chiangmai_test = toDFtest(
    city='Chiang Mai',
    pm25_filename='Chiang Mai (Thailand timezone).csv',
    temp_filename='3H_temperature_Chiang Mai.csv',
    wind_filename='3H_wind_Chiang Mai.csv',
)
df_Chiangmai_test

###Kanchanaburi

In [None]:
# Kanchanaburi training frame (no PM2.5 offset).
df_Kanchanaburi = toDF(
    city='Kanchanaburi',
    pm25_filename='Kanchanaburi.txt',
    temp_filename='3H_temperature_Kanchanaburi.csv',
    wind_filename='3H_wind_Kanchanaburi.csv',
    plus=0,
)
df_Kanchanaburi

In [None]:
# Kanchanaburi test frame.
df_Kanchanaburi_test = toDFtest(
    city='Kanchanaburi',
    pm25_filename='Kanchanaburi (Thailand timezone).csv',
    temp_filename='3H_temperature_Kanchanaburi.csv',
    wind_filename='3H_wind_Kanchanaburi.csv',
)
df_Kanchanaburi_test

###Bangkok


In [None]:
# Bangkok training frame (no PM2.5 offset).
df_Bangkok = toDF(
    city='Bangkok',
    pm25_filename='Bangkok.txt',
    temp_filename='3H_temperature_Bangkok.csv',
    wind_filename='3H_wind_Bangkok.csv',
    plus=0,
)
df_Bangkok

In [None]:
# Bangkok test frame.
df_Bangkok_test = toDFtest(
    city='Bangkok',
    pm25_filename='Bangkok (Thailand timezone).csv',
    temp_filename='3H_temperature_Bangkok.csv',
    wind_filename='3H_wind_Bangkok.csv',
)
df_Bangkok_test

###Khonkaen

In [None]:
# Khon Kaen training frame (no PM2.5 offset).
df_Khonkaen = toDF(
    city='Khon Kaen',
    pm25_filename='Khon_Kaen.txt',
    temp_filename='3H_temperature_Khon Kaen.csv',
    wind_filename='3H_wind_Khon Kaen.csv',
    plus=0,
)
df_Khonkaen

In [None]:
# Khon Kaen test frame.
df_Khonkaen_test = toDFtest(
    city='Khon Kaen',
    pm25_filename='Khon Kaen (Thailand timezone).csv',
    temp_filename='3H_temperature_Khon Kaen.csv',
    wind_filename='3H_wind_Khon Kaen.csv',
)
df_Khonkaen_test

###Songkhla

In [None]:
# Songkhla training frame (no PM2.5 offset).
df_Songkhla = toDF(
    city='Songkhla',
    pm25_filename='Songkhla.txt',
    temp_filename='3H_temperature_Songkhla.csv',
    wind_filename='3H_wind_Songkhla.csv',
    plus=0,
)
df_Songkhla

In [None]:
# Songkhla test frame.
df_Songkhla_test = toDFtest(
    city='Songkhla',
    pm25_filename='Songkhla (Thailand timezone).csv',
    temp_filename='3H_temperature_Songkhla.csv',
    wind_filename='3H_wind_Songkhla.csv',
)
df_Songkhla_test

**LSTM**

In [None]:
# Min-max normalisation of the model inputs
def normalize_data(df):
    """Return a copy of ``df`` with the four model columns min-max scaled.

    ``PM25``, ``Temp``, ``WindDir`` and ``WindSpeed`` are each scaled
    independently with a ``MinMaxScaler`` fitted on that column of ``df``.
    The input frame is left untouched: the previous version overwrote the
    caller's columns, so a frame displayed in an earlier cell went stale.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PM25``, ``Temp``, ``WindDir``, ``WindSpeed``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    scaled = df.copy()
    for column in ('PM25', 'Temp', 'WindDir', 'WindSpeed'):
        # One scaler per column; the fitted scaler is not kept, so callers
        # re-fit on the raw values to invert (see denormalize).
        column_scaler = preprocessing.MinMaxScaler()
        scaled[column] = column_scaler.fit_transform(scaled[column].values.reshape(-1, 1))
    return scaled

In [None]:
def load_data(df,df_test):
    """Turn the train and test frames into one-step-ahead samples.

    Each sample pairs row ``i`` (all columns, as a single time step) with the
    last column of row ``i + 1`` as the target. Windows start at rows
    ``0 .. len - 3``, so the pair made of the final two rows is not produced.

    Returns ``[x_train, y_train, x_test, y_test]`` where ``x_*`` has shape
    ``(samples, 1, n_columns)`` and ``y_*`` has shape ``(samples,)``.
    """
    def _row_pairs(frame):
        values = frame.to_numpy()
        return np.array([values[start:start + 2] for start in range(len(values) - 2)])

    train_pairs = _row_pairs(df)
    test_pairs = _row_pairs(df_test)

    # Features: everything but the last row of each window.
    # Target: last column of the window's last row.
    x_train, y_train = train_pairs[:, :-1], train_pairs[:, -1][:, -1]
    x_test, y_test = test_pairs[:, :-1], test_pairs[:, -1][:, -1]
    return [x_train, y_train, x_test, y_test]

In [None]:
from numpy.random import seed
import tensorflow
# Seed NumPy and TensorFlow; the two model builders in this cell seed again before creating layers.
seed(5)
tensorflow.random.set_seed(5)
n_features = 4  # input width; toDF/toDFtest return four columns (Temp, WindSpeed, WindDir, PM25)
prev_days = 1  # time steps per sample. NOTE(review): the data is hourly, so this is one hour, not one day
def build_model(layers,opt,p,hidden_unit):
    """Build and compile the Conv1D -> LSTM -> Dense regressor.

    ``layers`` is accepted but not used. ``opt`` is passed to ``compile`` as
    the optimizer, ``p`` is the dropout rate used after the convolution and
    after the LSTM, and ``hidden_unit`` is the LSTM width. The model is
    compiled with MSE as both loss and metric.
    """
    # Re-seed so every call starts from the same random state.
    seed(5)
    tensorflow.random.set_seed(5)

    stack = (
        Conv1D(filters=32, kernel_size=3, strides=1, padding="causal",
               activation="linear", input_shape=[None, n_features]),
        Dropout(p),
        LSTM(hidden_unit, activation='linear', input_shape=(prev_days, n_features)),
        Dropout(p),
        Dense(1, activation='linear', input_shape=(prev_days, n_features)),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='mse', optimizer=opt, metrics=['mse'])
    return model

def build_model_khonkaen(layers,opt,p,hidden_unit):
    """Build and compile the Dropout -> LSTM -> Dropout -> Dense regressor.

    Same contract as ``build_model`` but without the convolution; ``layers``
    is accepted but not used. train_and_test_model selects this builder when
    the province is "Khon Kaen".
    """
    # Re-seed so every call starts from the same random state.
    seed(5)
    tensorflow.random.set_seed(5)

    stack = (
        Dropout(p),
        LSTM(hidden_unit, activation='linear', input_shape=(prev_days, n_features)),
        Dropout(p),
        Dense(1, activation='linear', input_shape=(prev_days, n_features)),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(loss='mse', optimizer=opt, metrics=['mse'])
    return model



In [None]:
def denormalize(df, normalized_value):
    """Map min-max scaled PM2.5 values back to the scale of ``df['PM25']``.

    A fresh ``MinMaxScaler`` is fitted on the raw ``PM25`` column of ``df``
    and used to invert ``normalized_value``. Returns a column vector of
    shape ``(n, 1)``.
    """
    reference = df['PM25'].values.reshape(-1, 1)
    scaler = preprocessing.MinMaxScaler().fit(reference)
    return scaler.inverse_transform(normalized_value.reshape(-1, 1))


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from matplotlib.pyplot import figure
def evaluate(newy_test, new_pred,y_test,yhat,df_normalize_test,province):
    """Plot predictions against ground truth and print RMSE and R^2.

    Parameters
    ----------
    newy_test, new_pred : array-like
        De-normalised ground truth and predictions, one value per sample.
    y_test, yhat :
        Accepted for call compatibility; not used.
    df_normalize_test : pandas.DataFrame
        Only its index is used, as the time axis of the plot.
    province : str
        Used in the plot title.
    """
    # load_data pairs the features of row i with the PM2.5 of row i + 1, so
    # both the truth and the prediction belong to rows 1 .. n-2. Previously
    # the prediction was drawn on index[:-2], one hour too early.
    target_times = df_normalize_test.index[1:-1]

    figure(num=None, figsize=(18, 6), dpi=80, facecolor='w', edgecolor='k')
    plt.title(province+' model predict vs ground truth')
    plt.plot(target_times, newy_test, color='g',label='ground truth')
    plt.plot(target_times, new_pred, alpha=.7, color='r',label='predict')
    plt.legend(loc="upper right")

    plt.show()

    print("rmse :",math.sqrt(mean_squared_error(newy_test, new_pred))) #RMSE
    print("r^2 :",r2_score(newy_test, new_pred))

In [None]:
def train_and_test_model(df,df_test,province,opt,p,hidden_unit):
    """Train one province's model, plot its losses and evaluate it on the test set.

    Parameters
    ----------
    df, df_test : pandas.DataFrame
        Raw train and test frames as returned by toDF / toDFtest. They are
        no longer modified: normalisation runs on copies.
    province : str
        City folder name; also used to derive the file names that are
        re-read for de-normalisation and as the checkpoint file prefix.
    opt : str or optimizer
        Passed to the model builder as the optimizer.
    p : float
        Dropout rate.
    hidden_unit : int
        LSTM width.

    Returns
    -------
    list
        ``[model, new_pred, newy_test]`` with de-normalised predictions and
        de-normalised ground truth.
    """
    if (province == "Khon Kaen"):
        model = build_model_khonkaen([n_features, prev_days, 1],opt,p,hidden_unit)
    else:
        model = build_model([n_features, prev_days, 1],opt,p,hidden_unit)

    # Normalise copies so the caller's frames (e.g. the global test frame)
    # keep their raw values. Train and test are each scaled with their own
    # min/max. NOTE(review): scaling the test set with the train scaler would
    # be more consistent.
    df_normalize = normalize_data(df.copy())
    df_normalize_test = normalize_data(df_test.copy())
    X_train, y_train, X_test, y_test = load_data(df_normalize, df_normalize_test)
    tf.keras.backend.clear_session()
    my_callbacks = [
        tf.keras.callbacks.EarlyStopping(patience=2),
        tf.keras.callbacks.ModelCheckpoint(filepath=province+'_model.{epoch:02d}.h5',save_weights_only=True),
    ]
    hist = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100 ,verbose = 1, callbacks = my_callbacks)
    plt.plot(hist.history['loss'])
    plt.plot(hist.history['val_loss'])
    plt.title('model train vs validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()
    yhat = model.predict(X_test)

    # Raw frames are re-read from disk for de-normalisation. The train frame
    # is read with plus=0, so predictions are mapped back with the unshifted
    # PM2.5 range even when the caller trained on a shifted frame.
    raw_train = toDF(province,province.replace(" ","_")+'.txt','3H_temperature_'+province+'.csv','3H_wind_'+province+'.csv',0)
    raw_test = toDFtest(province,province+" (Thailand timezone)"+'.csv','3H_temperature_'+province+'.csv','3H_wind_'+province+'.csv')
    new_pred = denormalize(raw_train, yhat)
    newy_test = denormalize(raw_test, y_test)
    evaluate(newy_test, new_pred,y_test,yhat,df_normalize_test,province)

    return [model,new_pred,newy_test]

In [None]:
#rmse 2.781862703243144
Bangkok_model, Bangkok_new_pred, Bangkok_newy_test = train_and_test_model(
    df=toDF('Bangkok', 'Bangkok.txt', '3H_temperature_Bangkok.csv', '3H_wind_Bangkok.csv', 0),
    df_test=df_Bangkok_test,
    province="Bangkok",
    opt="RMSprop",
    p=0.081,
    hidden_unit=32,
)

In [None]:
#rmse 3.6636290437799506 bfil adam 0.265 32 
#3.9574509270490803
Chanthaburi_model, Chanthaburi_new_pred, Chanthaburi_newy_test = train_and_test_model(
    df=toDF('Chanthaburi', 'Chanthaburi.txt', '3H_temperature_Chanthaburi.csv', '3H_wind_Chanthaburi.csv', 0),
    df_test=df_Chanthaburi_test,
    province="Chanthaburi",
    opt="adam",
    p=0.2655,
    hidden_unit=32,
)

In [None]:
#rmse  6.590754145964834 0.005 32
Chiangmai_model, Chiangmai_new_pred, Chiangmai_newy_test = train_and_test_model(
    df=toDF('Chiang Mai', 'Chiang_Mai.txt', '3H_temperature_Chiang Mai.csv', '3H_wind_Chiang Mai.csv', 0),
    df_test=df_Chiangmai_test,
    province="Chiang Mai",
    opt="Rmsprop",
    p=0.005,
    hidden_unit=32,
)

In [None]:
#rmse 4.594019316549222
Kanchanaburi_model, Kanchanaburi_new_pred, Kanchanaburi_newy_test = train_and_test_model(
    df=toDF('Kanchanaburi', 'Kanchanaburi.txt', '3H_temperature_Kanchanaburi.csv', '3H_wind_Kanchanaburi.csv', -35),
    df_test=df_Kanchanaburi_test,
    province="Kanchanaburi",
    opt="adam",
    p=0.9051,
    hidden_unit=149,
)

In [None]:
# 5.55624075356698
Khonkaen_model, Khonkaen_new_pred, Khonkaen_newy_test = train_and_test_model(
    df=toDF('Khon Kaen', 'Khon_Kaen.txt', '3H_temperature_Khon Kaen.csv', '3H_wind_Khon Kaen.csv', -50),
    df_test=df_Khonkaen_test,
    province="Khon Kaen",
    opt="adam",
    p=0.741,
    hidden_unit=1024,
)

In [None]:
# 3.878369039138354
Songkhla_model, Songkhla_new_pred, Songkhla_newy_test = train_and_test_model(
    df=toDF('Songkhla', 'Songkhla.txt', '3H_temperature_Songkhla.csv', '3H_wind_Songkhla.csv', 0),
    df_test=df_Songkhla_test,
    province="Songkhla",
    opt="Rmsprop",
    p=0.24,
    hidden_unit=90,
)

**Evaluate model**

In [None]:
# Start the submission with Chanthaburi: one 72-hour slice per forecast,
# forecasts issued every 6 rows, the first one starting at row 4.
df_Chanthaburi_new_pred = pd.DataFrame(Chanthaburi_new_pred)
df_Chanthaburi_newy_test = pd.DataFrame(Chanthaburi_newy_test)
chanthaburi_starts = range(0, 1112 * 6, 6)  # 1112 forecasts
# Collect the slices and concatenate once instead of growing the frame in a loop.
output = pd.concat([df_Chanthaburi_new_pred[start + 4:start + 76] for start in chanthaburi_starts])
output_test = pd.concat([df_Chanthaburi_newy_test[start + 4:start + 76] for start in chanthaburi_starts])
output

In [None]:
# Append Chiang Mai's 72-hour forecast slices (1112 forecasts, every 6 rows, offset 4).
df_Chiangmai_new_pred = pd.DataFrame(Chiangmai_new_pred)
df_Chiangmai_newy_test = pd.DataFrame(Chiangmai_newy_test)
chiangmai_starts = range(0, 1112 * 6, 6)
# One concat per frame instead of one per slice.
output = pd.concat([output] + [df_Chiangmai_new_pred[start + 4:start + 76] for start in chiangmai_starts])
output_test = pd.concat([output_test] + [df_Chiangmai_newy_test[start + 4:start + 76] for start in chiangmai_starts])

In [None]:
# Append Kanchanaburi's 72-hour forecast slices (1112 forecasts, every 6 rows, offset 4).
df_Kanchanaburi_new_pred = pd.DataFrame(Kanchanaburi_new_pred)
df_Kanchanaburi_newy_test = pd.DataFrame(Kanchanaburi_newy_test)
kanchanaburi_starts = range(0, 1112 * 6, 6)
# One concat per frame instead of one per slice.
output = pd.concat([output] + [df_Kanchanaburi_new_pred[start + 4:start + 76] for start in kanchanaburi_starts])
output_test = pd.concat([output_test] + [df_Kanchanaburi_newy_test[start + 4:start + 76] for start in kanchanaburi_starts])

In [None]:
# Append Bangkok's 72-hour forecast slices (1112 forecasts, every 6 rows, offset 4).
df_Bangkok_new_pred = pd.DataFrame(Bangkok_new_pred)
df_Bangkok_newy_test = pd.DataFrame(Bangkok_newy_test)
bangkok_starts = range(0, 1112 * 6, 6)
# One concat per frame instead of one per slice.
output = pd.concat([output] + [df_Bangkok_new_pred[start + 4:start + 76] for start in bangkok_starts])
output_test = pd.concat([output_test] + [df_Bangkok_newy_test[start + 4:start + 76] for start in bangkok_starts])

In [None]:
# Append Khon Kaen's 72-hour forecast slices (1110 forecasts, every 6 rows, offset 4).
df_Khonkaen_new_pred = pd.DataFrame(Khonkaen_new_pred)
df_Khonkaen_newy_test = pd.DataFrame(Khonkaen_newy_test)
khonkaen_starts = range(0, 1110 * 6, 6)
# One concat per frame instead of one per slice.
output = pd.concat([output] + [df_Khonkaen_new_pred[start + 4:start + 76] for start in khonkaen_starts])
output_test = pd.concat([output_test] + [df_Khonkaen_newy_test[start + 4:start + 76] for start in khonkaen_starts])

In [None]:
# Append Songkhla's 72-hour forecast slices (1127 forecasts, every 6 rows, offset 4).
df_Songkhla_new_pred = pd.DataFrame(Songkhla_new_pred)
df_Songkhla_newy_test = pd.DataFrame(Songkhla_newy_test)
songkhla_starts = range(0, 1127 * 6, 6)
# One concat per frame instead of one per slice.
output = pd.concat([output] + [df_Songkhla_new_pred[start + 4:start + 76] for start in songkhla_starts])
output_test = pd.concat([output_test] + [df_Songkhla_newy_test[start + 4:start + 76] for start in songkhla_starts])

In [None]:
#r2      0.930160 , rmse    4.702418  (recorded before the r2 argument order was fixed; re-run to refresh r2)
from sklearn.metrics import r2_score, mean_squared_error

def r2_rmse( p,r ):
    """Score predictions against ground truth.

    Parameters
    ----------
    p : pandas.DataFrame
        Predictions; column ``0`` is used.
    r : pandas.DataFrame
        Ground truth; column ``0`` is used.

    Returns
    -------
    pandas.Series
        With entries ``r2`` and ``rmse``.
    """
    # scikit-learn metrics take (y_true, y_pred). RMSE is symmetric, but R^2
    # is not, so the ground truth must come first.
    r2 = r2_score( r[0], p[0] )
    rmse = np.sqrt( mean_squared_error( r[0], p[0] ) )
    return pd.Series( dict(  r2 = r2, rmse = rmse ) )

r2_rmse(output, output_test)

In [None]:
# Write the stacked predictions and the matching ground truth with a shared running Id.
row_ids = list(range(0, len(output)))
df = pd.DataFrame({'Id': row_ids, 'Predicted': output.to_numpy().flatten()})
df.to_csv('submission.csv', index=False)
df = pd.DataFrame({'Id': row_ids, 'Predicted': output_test.to_numpy().flatten()})
df.to_csv('real.csv', index=False)

#Project 2

####Fire hotspot

In [None]:
def hotspot_to_DF(filename):
    """Load one fire-hotspot CSV and attach a Bangkok-local timestamp.

    The file is read from the dataset's ``Fire hotspot`` folder. The columns
    ``scan, track, satellite, instrument, confidence, version, daynight,
    type`` are dropped, and ``acq_date`` / ``acq_time`` (UTC) are replaced
    by one naive ``datetime`` column in Asia/Bangkok time.

    ``acq_time`` is an HHMM value that loses its leading zeros when read as
    a number (``105`` means 01:05). It is zero-padded to four digits before
    being split, because parsing the unpadded text with ``%H%M`` would read
    ``105`` as 10:05.
    """
    df = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Fire hotspot/' + filename)
    df = df.drop(columns=['scan','track','satellite','instrument','confidence','version','daynight','type'])

    acq_day = pd.to_datetime(df['acq_date'])
    hhmm = df['acq_time'].astype(str).str.zfill(4)
    since_midnight = (
        pd.to_timedelta(hhmm.str[:2].astype(int), unit='h')
        + pd.to_timedelta(hhmm.str[2:].astype(int), unit='m')
    )
    # Acquisition times are UTC; convert to Bangkok wall-clock time and drop the tz.
    acquired_utc = (acq_day + since_midnight).dt.tz_localize('UTC')
    df['datetime'] = acquired_utc.dt.tz_convert('Asia/Bangkok').dt.tz_localize(None)

    return df.drop(columns=['acq_date', 'acq_time'])

In [None]:
df_hotspot_thailand_2016 = hotspot_to_DF('viirs-snpp_2016_Thailand.csv')
df_hotspot_thailand_2017 = hotspot_to_DF('viirs-snpp_2017_Thailand.csv')
df_hotspot_thailand_2018 = hotspot_to_DF('viirs-snpp_2018_Thailand.csv')
df_hotspot_thailand_2019 = hotspot_to_DF('viirs-snpp_2019_Thailand.csv')

# All four years, like the other countries: 2019 was loaded but left out before.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
df_hotspot_thailand = pd.concat([
    df_hotspot_thailand_2016,
    df_hotspot_thailand_2017,
    df_hotspot_thailand_2018,
    df_hotspot_thailand_2019,
])
df_hotspot_thailand

In [None]:
df_hotspot_Cambodia_2016 = hotspot_to_DF('viirs-snpp_2016_Cambodia.csv')
df_hotspot_Cambodia_2017 = hotspot_to_DF('viirs-snpp_2017_Cambodia.csv')
df_hotspot_Cambodia_2018 = hotspot_to_DF('viirs-snpp_2018_Cambodia.csv')
df_hotspot_Cambodia_2019 = hotspot_to_DF('viirs-snpp_2019_Cambodia.csv')

# pd.concat replaces DataFrame.append, which pandas 2.0 removed; keeping the
# 2016 frame under its own name makes the cell safe to re-run.
df_hotspot_Cambodia = pd.concat([
    df_hotspot_Cambodia_2016,
    df_hotspot_Cambodia_2017,
    df_hotspot_Cambodia_2018,
    df_hotspot_Cambodia_2019,
])
df_hotspot_Cambodia

In [None]:
# Scratch check: group the Cambodia hotspots by timestamp and keep every group.
test = (
    df_hotspot_Cambodia
    .groupby("datetime")
    .filter(lambda _group: True)
)
test

In [None]:
df_hotspot_Lao_2016 = hotspot_to_DF('viirs-snpp_2016_Lao_PDR.csv')
df_hotspot_Lao_2017 = hotspot_to_DF('viirs-snpp_2017_Lao_PDR.csv')
df_hotspot_Lao_2018 = hotspot_to_DF('viirs-snpp_2018_Lao_PDR.csv')
df_hotspot_Lao_2019 = hotspot_to_DF('viirs-snpp_2019_Lao_PDR.csv')

# pd.concat replaces DataFrame.append, which pandas 2.0 removed; keeping the
# 2016 frame under its own name makes the cell safe to re-run.
df_hotspot_Lao = pd.concat([
    df_hotspot_Lao_2016,
    df_hotspot_Lao_2017,
    df_hotspot_Lao_2018,
    df_hotspot_Lao_2019,
])
df_hotspot_Lao

In [None]:
df_hotspot_Myanmar_2016 = hotspot_to_DF('viirs-snpp_2016_Myanmar.csv')
df_hotspot_Myanmar_2017 = hotspot_to_DF('viirs-snpp_2017_Myanmar.csv')
df_hotspot_Myanmar_2018 = hotspot_to_DF('viirs-snpp_2018_Myanmar.csv')
df_hotspot_Myanmar_2019 = hotspot_to_DF('viirs-snpp_2019_Myanmar.csv')

# pd.concat replaces DataFrame.append, which pandas 2.0 removed; keeping the
# 2016 frame under its own name makes the cell safe to re-run.
df_hotspot_Myanmar = pd.concat([
    df_hotspot_Myanmar_2016,
    df_hotspot_Myanmar_2017,
    df_hotspot_Myanmar_2018,
    df_hotspot_Myanmar_2019,
])
df_hotspot_Myanmar

In [None]:
# Stack the four countries (pd.concat replaces the removed DataFrame.append)
# and derive calendar columns straight from the local timestamp.
df_hotspot = pd.concat([df_hotspot_thailand, df_hotspot_Cambodia, df_hotspot_Lao, df_hotspot_Myanmar])
df_hotspot['month'] = df_hotspot['datetime'].dt.month
df_hotspot['year'] = df_hotspot['datetime'].dt.year
df_hotspot['month_year'] = df_hotspot['datetime'].dt.strftime("%Y-%m")
df_hotspot.to_csv('hotspot.csv')

We've got the hotspot dataframe

variable: **df_hotspot**

####Temp

In [None]:
def windandtemp_to_DF(city,filename):
    """Load a city's wind or temperature CSV from both Train and Test.

    The same ``filename`` is read from ``<city>/Train/`` and ``<city>/Test/``,
    ``datetime`` is parsed in each, the two parts are stacked (train first)
    and a ``City`` column holding ``city`` is added.
    """
    city_dir = '../input/pm2520212/DS_kaggle_edited_v2/' + city
    parts = []
    for split in ('/Train/', '/Test/'):
        part = pd.read_csv(city_dir + split + filename)
        part['datetime'] = pd.to_datetime(part['datetime'])
        parts.append(part)

    df = pd.concat(parts)
    df['City'] = city
    return df

In [None]:
# Temperature for all six cities (train + test), stacked in the original
# order. One loop and one concat replace six copy-pasted load/concat pairs.
temp_cities = ['Bangkok', 'Chanthaburi', 'Chiang Mai', 'Kanchanaburi', 'Khon Kaen', 'Songkhla']
df_temp_output = pd.concat([
    windandtemp_to_DF(city, '3H_temperature_' + city + '.csv')
    for city in temp_cities
])

df_temp_output

In [None]:
# Add calendar columns on a new frame. pd.DataFrame(frame) is not a copy, so
# adding columns through it could also alter df_temp_output, depending on the
# pandas version.
df = df_temp_output.assign(
    month=df_temp_output['datetime'].dt.month,
    year=df_temp_output['datetime'].dt.year,
    month_year=df_temp_output['datetime'].dt.strftime("%Y-%m"),
)
df.to_csv('temp.csv', index=False)

####Wind

In [None]:
# Wind for all six cities (train + test), stacked in the original order.
# One loop and one concat replace six copy-pasted load/concat pairs.
wind_cities = ['Bangkok', 'Chanthaburi', 'Chiang Mai', 'Kanchanaburi', 'Khon Kaen', 'Songkhla']
df_wind_output = pd.concat([
    windandtemp_to_DF(city, '3H_wind_' + city + '.csv')
    for city in wind_cities
])
df_wind_output

In [None]:
# Add calendar columns on a new frame. pd.DataFrame(frame) is not a copy, so
# adding columns through it could also alter df_wind_output, depending on the
# pandas version.
df = df_wind_output.assign(
    month=df_wind_output['datetime'].dt.month,
    year=df_wind_output['datetime'].dt.year,
    month_year=df_wind_output['datetime'].dt.strftime("%Y-%m"),
)
df.to_csv('wind.csv', index=False)

####PM2.5

In [None]:
def pm_to_DF_train(city,filename):
    """Load a city's training PM2.5 file as a ``PM25`` / ``datetime`` frame.

    The tab-separated file (ten preamble lines skipped) holds year, month,
    day, hour, the PM2.5 value and two unused columns. Timestamps are read
    as UTC and converted to naive Asia/Bangkok time; rows that repeat an
    earlier timestamp are dropped.
    """
    raw = pd.read_csv(
        '../input/pm2520212/DS_kaggle_edited_v2/' + city + '/Train/' + filename,
        sep='\t', header=None, skiprows=10,
    )
    utc_time = pd.to_datetime({'year': raw[0], 'month': raw[1], 'day': raw[2], 'hour': raw[3]})
    local_time = utc_time.dt.tz_localize('UTC').dt.tz_convert('Asia/Bangkok').dt.tz_localize(None)

    pm25 = raw.drop(columns=[0, 1, 2, 3, 5, 6]).assign(Time=local_time)
    pm25.columns = ['PM25', 'datetime']

    is_repeat = pm25.datetime.duplicated(keep='first')
    pm25 = pm25[~is_repeat]
    pm25.datetime = pd.DatetimeIndex(pm25.datetime)

    return pm25

In [None]:
def pm_to_DF_test(city,filename):
    """Load a city's test PM2.5 CSV as a ``datetime`` / ``PM25`` frame.

    The file is expected to have the columns ``Time`` and ``PM2.5(µg/m3)``;
    they are replaced by a parsed ``datetime`` column and a ``PM25`` column.

    The previous version also ran ``df['City'] = city``. ``df`` is not a
    local, so that wrote into whichever global ``df`` existed, or raised
    ``NameError`` on a fresh kernel. The statement is removed; in this
    notebook the ``City`` column is added afterwards by latlong_to_DF.
    """
    pm25 = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/' + city + '/Test/'+ filename)
    pm25['datetime'] = pd.to_datetime(pm25['Time'])
    pm25['PM25'] = pm25['PM2.5(µg/m3)']
    return pm25.drop(columns=['Time','PM2.5(µg/m3)'])

In [None]:
def latlong_to_DF(df,city,filename):
    """Tag ``df`` in place with a city's coordinates and name.

    ``lat`` and ``long`` are taken from the first row of ``filename`` in
    ``<city>/Train/`` and written to every row of ``df`` together with
    ``City``. The same (mutated) frame is returned.
    """
    coords = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/' + city + '/Train/' + filename)
    first_lat = coords['lat'][0]
    first_long = coords['long'][0]
    for column, value in (('lat', first_lat), ('long', first_long), ('City', city)):
        df[column] = value
    return df

In [None]:
# Bangkok PM2.5: training history followed by the test year, tagged with the station coordinates.
df_pm_Bangkok_train = pm_to_DF_train('Bangkok', 'Bangkok.txt')
df_pm_Bangkok = pm_to_DF_test('Bangkok', 'Bangkok (Thailand timezone).csv')
df_pm_output_Bangkok = latlong_to_DF(
    pd.concat([df_pm_Bangkok_train, df_pm_Bangkok]),
    'Bangkok',
    '3H_temperature_Bangkok.csv',
)

df_pm_output_Bangkok

In [None]:
# Chanthaburi PM2.5: training history followed by the test year, tagged with the station coordinates.
df_pm_Chanthaburi_train = pm_to_DF_train('Chanthaburi', 'Chanthaburi.txt')
df_pm_Chanthaburi = pm_to_DF_test('Chanthaburi', 'Chanthaburi (Thailand timezone).csv')
df_pm_output_Chanthaburi = latlong_to_DF(
    pd.concat([df_pm_Chanthaburi_train, df_pm_Chanthaburi]),
    'Chanthaburi',
    '3H_temperature_Chanthaburi.csv',
)

df_pm_output_Chanthaburi


In [None]:
# Chiang Mai PM2.5: training history followed by the test year, tagged with the station coordinates.
df_pm_Chiangmai_train = pm_to_DF_train('Chiang Mai', 'Chiang_Mai.txt')
df_pm_Chiangmai = pm_to_DF_test('Chiang Mai', 'Chiang Mai (Thailand timezone).csv')
df_pm_output_Chiangmai = latlong_to_DF(
    pd.concat([df_pm_Chiangmai_train, df_pm_Chiangmai]),
    'Chiang Mai',
    '3H_temperature_Chiang Mai.csv',
)

df_pm_output_Chiangmai


In [None]:
# Kanchanaburi PM2.5: training history followed by the test year, tagged with the station coordinates.
df_pm_Kanchanaburi_train = pm_to_DF_train('Kanchanaburi', 'Kanchanaburi.txt')
df_pm_Kanchanaburi = pm_to_DF_test('Kanchanaburi', 'Kanchanaburi (Thailand timezone).csv')
df_pm_output_Kanchanaburi = latlong_to_DF(
    pd.concat([df_pm_Kanchanaburi_train, df_pm_Kanchanaburi]),
    'Kanchanaburi',
    '3H_temperature_Kanchanaburi.csv',
)

df_pm_output_Kanchanaburi


In [None]:
# Khon Kaen PM2.5: training history followed by the test year, tagged with the station coordinates.
df_pm_Khonkaen_train = pm_to_DF_train('Khon Kaen', 'Khon_Kaen.txt')
df_pm_Khonkaen = pm_to_DF_test('Khon Kaen', 'Khon Kaen (Thailand timezone).csv')
df_pm_output_Khonkaen = latlong_to_DF(
    pd.concat([df_pm_Khonkaen_train, df_pm_Khonkaen]),
    'Khon Kaen',
    '3H_temperature_Khon Kaen.csv',
)

df_pm_output_Khonkaen


In [None]:
# Songkhla PM2.5: training history followed by the test year, tagged with the station coordinates.
df_pm_Songkhla_train = pm_to_DF_train('Songkhla', 'Songkhla.txt')
df_pm_Songkhla = pm_to_DF_test('Songkhla', 'Songkhla (Thailand timezone).csv')
df_pm_output_Songkhla = latlong_to_DF(
    pd.concat([df_pm_Songkhla_train, df_pm_Songkhla]),
    'Songkhla',
    '3H_temperature_Songkhla.csv',
)

df_pm_output_Songkhla


In [None]:
# Stack the six per-city PM2.5 frames in the original order.
df_pm25 = pd.concat([
    df_pm_output_Bangkok,
    df_pm_output_Chanthaburi,
    df_pm_output_Chiangmai,
    df_pm_output_Kanchanaburi,
    df_pm_output_Khonkaen,
    df_pm_output_Songkhla,
])
df_pm25

In [None]:
# Export the combined PM2.5 table; the (repeating) row index is written as the first column.
df = pd.DataFrame(data=df_pm25)
df.to_csv(path_or_buf='PM25.csv', index=True)

**DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import matplotlib

In [None]:
# One station-level PM2.5 table per year, plus the station coordinates.
pm2016, pm2017, pm2018, pm2019, pm2020 = (
    pd.read_csv('../input/pm2520212/PM2.5(' + str(data_year) + ').csv')
    for data_year in (2016, 2017, 2018, 2019, 2020)
)
latlong = pd.read_csv('../input/pm2520212/latlng.csv')

In [None]:
# Drop the two extra '70T' columns from the 2019 table. errors='ignore'
# keeps the cell re-runnable once they are gone.
pm2019 = pm2019.drop(columns=['70T*','70T**'], errors='ignore')

In [None]:
# Remove the two rows that follow the daily records in each yearly table
# (labels 366-367 for the leap years 2016 and 2020, 365-366 otherwise).
# errors='ignore' keeps the cell re-runnable after the rows are gone.
pm2016 = pm2016.drop([366,367], errors='ignore')
pm2017 = pm2017.drop([365,366], errors='ignore')
pm2018 = pm2018.drop([365,366], errors='ignore')
pm2019 = pm2019.drop([365,366], errors='ignore')
pm2020 = pm2020.drop([366,367], errors='ignore')

In [None]:
# Stack the yearly tables; the outer index level carries the year as a string.
df_pm25 = pd.concat(
    [pm2016, pm2017, pm2018, pm2019, pm2020],
    keys=[str(data_year) for data_year in range(2016, 2021)],
)
df_pm25

In [None]:
# Derive the calendar columns from the Date column.
parsed_dates = pd.DatetimeIndex(df_pm25['Date'])
df_pm25['month_year'] = pd.to_datetime(df_pm25['Date']).dt.to_period('M')
df_pm25['year'] = parsed_dates.year
df_pm25['month'] = parsed_dates.month
df_pm25

In [None]:
# Wide -> long: one row per (date, station) with the reading in PM25.
calendar_columns = ['Date','month_year','year','month']
df_pm25 = pd.melt(
    df_pm25,
    id_vars=calendar_columns,
    var_name="Station",
    value_name="PM25",
)

In [None]:
# Inspect the long-format frame (columns Date, month_year, year, month, Station, PM25).
df_pm25

In [None]:
# Attach station coordinates by title, drop the join/label columns, then drop rows with missing values.
df_pm25_latlong = (
    df_pm25
    .merge(latlong, left_on='Station', right_on='title')
    .drop(columns=['Date', 'Station', 'title'])
)
df_clean = df_pm25_latlong.dropna()

In [None]:
# Rebuild from df_pm25_latlong instead of resetting in place, so re-running
# the cell does not fail on an already existing 'index' column.
df_clean = df_pm25_latlong.dropna().reset_index()

In [None]:
# Clustering input: the cleaned rows without the carried-over 'index' column.
k_pm25 = df_clean.drop(columns=['index'])
years = k_pm25['year'].unique()
months = list(range(1, 13))
k_pm25

In [None]:
def cluster(df_pm25):
    """Cluster the rows of ``df_pm25`` by location with default-parameter DBSCAN.

    Parameters
    ----------
    df_pm25 : pandas.DataFrame
        Must contain the columns ``Lat`` and ``Lng``.

    Returns
    -------
    pandas.Series
        Named ``DBSCAN_labels``, indexed like ``df_pm25``, holding the
        cluster label of each row. Empty when ``df_pm25`` has no rows.

    The input frame is not modified; the previous version wrote the labels
    into the (usually filtered) frame it was given.
    """
    if df_pm25.empty:
        # DBSCAN.fit rejects an empty sample set; a month with no data yields no labels.
        return pd.Series([], index=df_pm25.index, dtype='int64', name='DBSCAN_labels')

    dbscan = DBSCAN()
    dbscan.fit(df_pm25[["Lat", "Lng"]])
    return pd.Series(dbscan.labels_, index=df_pm25.index, name='DBSCAN_labels')

In [None]:
k_pm25_mean = k_pm25
# Cluster every (year, month) slice, then stack the label Series once
# instead of concatenating inside the loop.
monthly_labels = [
    cluster(k_pm25_mean[(k_pm25_mean['year'] == i) & (k_pm25_mean['month'] == j)])
    for i in years
    for j in months
]
# The leading empty frame keeps the result a DataFrame whose single column
# is labelled 0, which the next cell renames to 'color'.
data_all = pd.concat([pd.DataFrame()] + monthly_labels)
k_pm25_mean = k_pm25_mean.join(data_all)

In [None]:
# The joined label column is named 0; give it its final name.
k_pm25_mean = k_pm25_mean.rename(columns={0: 'color'})
k_pm25_mean

In [None]:
# Write the rows with their per-month DBSCAN label ('color') to CSV, without the index.
k_pm25_mean.to_csv('k_pm25_color.csv', index=False)

**Clustering**