<a href="https://colab.research.google.com/github/sohnhs21/pm2.5/blob/main/untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# The `sklearn` module is distributed on PyPI as `scikit-learn`; the PyPI
# project literally named `sklearn` is a deprecated placeholder.
pip install numpy pandas tensorflow keras scikit-learn

one model

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
import random
import os

# Seed every random number generator imported by this cell (Python's `random`,
# NumPy and TensorFlow) so repeated runs start from the same state.
random.seed(46)
np.random.seed(46)
tf.random.set_seed(46)

def has_missing_values(array):
    """Return whether *array* contains at least one NaN entry."""
    nan_mask = np.isnan(array)
    return nan_mask.any()

def prepare_data(data, lookback, future):
    """Slice a series into (input window, target window) pairs.

    Args:
        data: 2-D array of shape (timesteps, features). Targets are read from
            column 0.
        lookback: Number of consecutive rows in each input window.
        future: Number of target values that directly follow each input window.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape (windows, lookback, features) and
        ``Y`` has shape (windows, future). When ``data`` is too short for a
        single window both arrays have zero rows but keep those trailing
        dimensions, so they can still be stacked with other results.
    """
    # The last usable window starts at len(data) - lookback - future, so that
    # index has to be part of the range (hence the + 1).
    n_windows = len(data) - lookback - future + 1

    X, Y = [], []
    for start in range(n_windows):
        split = start + lookback
        X.append(data[start:split, :])
        Y.append(data[split:split + future, 0])

    if not X:
        return np.empty((0, lookback, data.shape[1])), np.empty((0, future))
    return np.array(X), np.array(Y)

def prepare_data_with_missing(data, lookback, future):
    """Build inference windows and report which ones have missing targets.

    Windows whose input rows contain NaN are skipped entirely. Every remaining
    window contributes its input to ``X``; its target (column 0 of the rows
    that follow) goes to ``Y`` only when it is complete, otherwise the
    window's start row is recorded in ``missing_indices``.

    Note that ``X`` and ``Y`` are therefore NOT row-aligned as soon as any
    window has a missing target: ``X`` holds all windows with complete input,
    ``Y`` only those whose target is complete as well.

    Args:
        data: 2-D array of shape (timesteps, features), may contain NaN.
        lookback: Number of rows in each input window.
        future: Number of target values following each input window.

    Returns:
        Tuple ``(X, Y, missing_indices)`` where ``missing_indices`` is a list
        of window start rows (indices into ``data``).
    """
    X, Y, missing_indices = [], [], []

    # + 1 so the window that ends exactly at the last row is considered too.
    for start in range(len(data) - lookback - future + 1):
        split = start + lookback
        input_sequence = data[start:split, :]

        if has_missing_values(input_sequence):
            continue

        X.append(input_sequence)

        output_sequence = data[split:split + future, 0]
        if has_missing_values(output_sequence):
            missing_indices.append(start)
        else:
            Y.append(output_sequence)

    return np.array(X), np.array(Y), missing_indices

# Window sizes and feature count shared by training and inference.
lookback = 2 * 24  # 2 days of hourly data
future = 3 * 24    # 3 days of hourly data
n_features = 6     # Number of features in the data

locations = ['공주', '노은동', '논산', '대천2동', '독곶리', '동문동', '모종동', '문창동', '성성동', '신방동', '신흥동', '아름동', '예산군', '읍내동', '이원면', '정림동', '홍성읍']  # List all 17 locations

# Fitted scaler per location, kept so the test data can later be transformed
# and inverse-transformed with the statistics seen during training.
scalers = {}

# Collect the per-location windows and concatenate once at the end; calling
# np.vstack inside the loop would copy the whole growing array on every pass.
# The leading zero-row arrays pin the expected trailing shapes, so a location
# that produces differently shaped windows still fails loudly.
X_train_parts = [np.empty((0, lookback, n_features))]
Y_train_parts = [np.empty((0, future))]

# NOTE: `sorted_data` is not defined in this cell; it must already exist.
for location in locations:
    train_data_loc = sorted_data[sorted_data['지점'] == location]

    # Drop the station ('지점') and timestamp ('관측시점') columns
    train_data_loc = train_data_loc.drop(['지점', '관측시점'], axis=1).values

    # Scale each location independently
    scaler = MinMaxScaler()
    train_data_loc = scaler.fit_transform(train_data_loc)
    scalers[location] = scaler

    # Prepare the data for LSTM
    X_train, Y_train = prepare_data(train_data_loc, lookback, future)

    X_train_parts.append(X_train)
    Y_train_parts.append(Y_train)

X_train_all = np.concatenate(X_train_parts, axis=0)
Y_train_all = np.concatenate(Y_train_parts, axis=0)

# Two stacked LSTM layers, each followed by dropout, and a dense head that
# emits all `future` steps in a single shot.
model = Sequential([
    LSTM(256, activation='tanh', input_shape=(lookback, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(256, activation='tanh'),
    Dropout(0.2),
    Dense(future),
])
model.compile(optimizer='adam', loss='mae')

# Stop once the validation loss has not improved for 5 epochs and roll the
# weights back to the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fit with the last 20% of the samples held out for validation
history = model.fit(
    X_train_all,
    Y_train_all,
    epochs=30,
    verbose=0,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Per-epoch loss curves recorded by fit()
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Keras takes the validation samples from the END of the arrays, splitting at
# floor(n * (1 - validation_split)). Reuse that boundary (0.2 is the
# validation_split passed to fit) so the two MAE figures cover exactly the
# rows the model did / did not train on.
split_at = int(len(X_train_all) * (1 - 0.2))

# Predict once for every window; the validation predictions are its tail.
Y_train_pred = model.predict(X_train_all)
Y_val_pred = Y_train_pred[split_at:]

# Calculate the mean absolute error on each side of the split
train_mae = mean_absolute_error(Y_train_all[:split_at], Y_train_pred[:split_at])
val_mae = mean_absolute_error(Y_train_all[split_at:], Y_val_pred)

print(f"Train MAE: {train_mae}, Validation MAE: {val_mae}")

# Fill the missing target values of every location in the test set with model
# predictions and collect the results, in original units, into one DataFrame.
# NOTE: `sort_data` is not defined in this cell; it must already exist.
location_frames = []

for location in locations:
    location_rows = sort_data['지점'] == location
    test_data_loc = sort_data[location_rows]

    # Drop the station ('지점') and timestamp ('관측시점') columns
    test_data_loc = test_data_loc.drop(['지점', '관측시점'], axis=1).values

    # Scale with the scaler fitted on this location's training data
    scaler = scalers[location]
    test_data_loc = scaler.transform(test_data_loc)

    # Prepare the data for LSTM
    X_test, Y_test, missing_indices = prepare_data_with_missing(test_data_loc, lookback, future)

    if missing_indices:
        # Remember which target values were missing BEFORE anything is
        # filled in, so observed values are never replaced by predictions.
        target_is_missing = np.isnan(test_data_loc[:, 0])

        # Predict only the windows that need it, rebuilt from their start
        # rows. X_test cannot be indexed by position in missing_indices: it
        # holds every window with complete input, not just the missing ones.
        X_missing = np.stack([test_data_loc[start:start + lookback] for start in missing_indices])
        Y_test_missing_pred = model.predict(X_missing)

        for missing_index, prediction in zip(missing_indices, Y_test_missing_pred):
            start = missing_index + lookback
            end = start + future
            gap = target_is_missing[start:end]
            test_data_loc[start:end, 0] = np.where(gap, prediction, test_data_loc[start:end, 0])

    # Inverse transform the test data to get real values
    test_data_loc_real = scaler.inverse_transform(test_data_loc)

    # Create a new DataFrame with real values
    test_data_loc_real_df = pd.DataFrame(test_data_loc_real, columns=['PM2.5', 'tem', 'wind degree', 'wind velocity', 'precipitation', 'humidity'])

    # Get the station and timestamp columns from the original test data
    test_data_loc_observation = sort_data.loc[location_rows, ['지점', '관측시점']].reset_index(drop=True)

    # Put the identifying columns back in front of the measurements
    test_data_loc_real_df = pd.concat([test_data_loc_observation, test_data_loc_real_df], axis=1)

    location_frames.append(test_data_loc_real_df)

# DataFrame.append was removed in pandas 2.0; concatenate all frames once.
all_test_data_real = pd.concat(location_frames, ignore_index=True)

# Sort the DataFrame by station and timestamp
all_test_data_real = all_test_data_real.sort_values(by=['지점', '관측시점']).reset_index(drop=True)

# Save the resulting DataFrame
all_test_data_real.to_csv('/content/new/result_46.csv', index=False)

One model with one-hot encoded locations

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
import random
import os

# Seed every random number generator imported by this cell (Python's `random`,
# NumPy and TensorFlow) so repeated runs start from the same state.
random.seed(47)
np.random.seed(47)
tf.random.set_seed(47)

def has_missing_values(array):
    """Tell whether any element of *array* is NaN."""
    return np.any(np.isnan(array))

def prepare_data(data, lookback, future):
    """Slice a series into (input window, target window) pairs.

    Args:
        data: 2-D array of shape (timesteps, features). Targets are read from
            column 0.
        lookback: Number of consecutive rows in each input window.
        future: Number of target values that directly follow each input window.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape (windows, lookback, features) and
        ``Y`` has shape (windows, future). When ``data`` is too short for a
        single window both arrays have zero rows but keep those trailing
        dimensions, so they can still be stacked with other results.
    """
    # The last usable window starts at len(data) - lookback - future, so that
    # index has to be part of the range (hence the + 1).
    n_windows = len(data) - lookback - future + 1

    X, Y = [], []
    for start in range(n_windows):
        split = start + lookback
        X.append(data[start:split, :])
        Y.append(data[split:split + future, 0])

    if not X:
        return np.empty((0, lookback, data.shape[1])), np.empty((0, future))
    return np.array(X), np.array(Y)

def prepare_data_with_missing(data, lookback, future):
    """Build inference windows and report which ones have missing targets.

    Windows whose input rows contain NaN are skipped entirely. Every remaining
    window contributes its input to ``X``; its target (column 0 of the rows
    that follow) goes to ``Y`` only when it is complete, otherwise the
    window's start row is recorded in ``missing_indices``.

    A window with a missing target must not be skipped: those are exactly the
    windows the caller wants predictions for, and skipping them would leave
    ``missing_indices`` permanently empty.

    Note that ``X`` and ``Y`` are NOT row-aligned as soon as any window has a
    missing target: ``X`` holds all windows with complete input, ``Y`` only
    those whose target is complete as well.

    Args:
        data: 2-D array of shape (timesteps, features), may contain NaN.
        lookback: Number of rows in each input window.
        future: Number of target values following each input window.

    Returns:
        Tuple ``(X, Y, missing_indices)`` where ``missing_indices`` is a list
        of window start rows (indices into ``data``).
    """
    X, Y, missing_indices = [], [], []

    # + 1 so the window that ends exactly at the last row is considered too.
    for start in range(len(data) - lookback - future + 1):
        split = start + lookback
        input_sequence = data[start:split, :]

        if has_missing_values(input_sequence):
            continue

        X.append(input_sequence)

        output_sequence = data[split:split + future, 0]
        if has_missing_values(output_sequence):
            missing_indices.append(start)
        else:
            Y.append(output_sequence)

    return np.array(X), np.array(Y), missing_indices

# Experiment: one LSTM for all locations, with a one-hot location flag
# appended to every timestep.
# NOTE: `sorted_data` (train) and `sort_data` (test) are not defined in this
# cell; they must already exist.
lookback = 2 * 24  # 2 days of hourly data
future = 3 * 24    # 3 days of hourly data

locations = ['공주', '노은동', '논산', '대천2동', '독곶리', '동문동', '모종동', '문창동', '성성동', '신방동', '신흥동', '아름동', '예산군', '읍내동', '이원면', '정림동', '홍성읍']  # List all 17 locations

n_measurements = 6  # the six measurement columns named when building the output frame below
n_features = n_measurements + len(locations)  # measurements + one-hot location flags

validation_split = 0.2

# Fitted scaler per location, reused on the test data
scalers = {}

# Use the encoder's default sparse output and densify it explicitly: the
# keyword that requested dense output was renamed (`sparse` ->
# `sparse_output`) between scikit-learn releases, `.toarray()` works on both.
encoder = OneHotEncoder()
encoded_locations = encoder.fit_transform(sorted_data['지점'].values.reshape(-1, 1)).toarray()
encoded_test_locations = encoder.transform(sort_data['지점'].values.reshape(-1, 1)).toarray()

# Collect per-location windows and concatenate once; np.vstack inside the
# loop would copy the whole growing array on every pass. The zero-row arrays
# pin the expected trailing shapes.
X_train_parts = [np.empty((0, lookback, n_features))]
Y_train_parts = [np.empty((0, future))]

for location in locations:
    train_rows = sorted_data['지점'] == location
    train_data_loc = sorted_data[train_rows]

    # Drop the station ('지점') and timestamp ('관측시점') columns
    train_data_loc = train_data_loc.drop(['지점', '관측시점'], axis=1).values

    # Scale each location independently
    scaler = MinMaxScaler()
    train_data_loc = scaler.fit_transform(train_data_loc)
    scalers[location] = scaler

    # Measurements go FIRST and the one-hot flags last, so that column 0 (the
    # column prepare_data uses as the target) is the scaled PM2.5 series and
    # not a location flag.
    location_encoded = encoded_locations[train_rows.to_numpy()]
    train_data_loc = np.hstack((train_data_loc, location_encoded))

    # Prepare the data for LSTM
    X_train, Y_train = prepare_data(train_data_loc, lookback, future)

    X_train_parts.append(X_train)
    Y_train_parts.append(Y_train)

X_train_all = np.concatenate(X_train_parts, axis=0)
Y_train_all = np.concatenate(Y_train_parts, axis=0)

# Create the LSTM model
model = Sequential()
model.add(LSTM(256, activation='tanh', input_shape=(lookback, n_features), return_sequences=True))
model.add(Dropout(0.2))  # Add dropout with a dropout rate of 0.2
model.add(LSTM(256, activation='tanh'))
model.add(Dropout(0.2))  # Add dropout
model.add(Dense(future))
model.compile(optimizer='adam', loss='mae')

# Add EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model with validation split
history = model.fit(X_train_all, Y_train_all, epochs=50, verbose=0, validation_split=validation_split, callbacks=[early_stopping])

# Get the train and validation losses
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Keras takes the validation samples from the END of the arrays, splitting at
# floor(n * (1 - validation_split)). Reuse that boundary so the two MAE
# figures cover exactly the rows the model did / did not train on.
split_at = int(len(X_train_all) * (1 - validation_split))

# Predict once for every window; the validation predictions are its tail.
Y_train_pred = model.predict(X_train_all)
Y_val_pred = Y_train_pred[split_at:]

# Calculate the mean absolute error on each side of the split
train_mae = mean_absolute_error(Y_train_all[:split_at], Y_train_pred[:split_at])
val_mae = mean_absolute_error(Y_train_all[split_at:], Y_val_pred)

print(f"Train MAE: {train_mae}, Validation MAE: {val_mae}")

location_frames = []

for location in locations:
    test_rows = sort_data['지점'] == location
    test_data_loc = sort_data[test_rows]

    # Drop the station ('지점') and timestamp ('관측시점') columns
    test_data_loc = test_data_loc.drop(['지점', '관측시점'], axis=1).values

    # Scale with the scaler fitted on this location's training data
    scaler = scalers[location]
    test_data_loc = scaler.transform(test_data_loc)

    # Same column layout as in training: measurements first, flags last
    test_location_encoded = encoded_test_locations[test_rows.to_numpy()]
    test_data_loc = np.hstack((test_data_loc, test_location_encoded))

    # Prepare the data for LSTM
    X_test, Y_test, missing_indices = prepare_data_with_missing(test_data_loc, lookback, future)

    if missing_indices:
        # Remember which target values were missing BEFORE anything is
        # filled in, so observed values are never replaced by predictions.
        target_is_missing = np.isnan(test_data_loc[:, 0])

        # Predict only the windows that need it, rebuilt from their start
        # rows; a position in missing_indices is not a position in X_test.
        X_missing = np.stack([test_data_loc[start:start + lookback] for start in missing_indices])
        Y_test_missing_pred = model.predict(X_missing)

        for missing_index, prediction in zip(missing_indices, Y_test_missing_pred):
            start = missing_index + lookback
            end = start + future
            gap = target_is_missing[start:end]
            test_data_loc[start:end, 0] = np.where(gap, prediction, test_data_loc[start:end, 0])

    # Inverse transform only the measurement columns; the scaler never saw
    # the one-hot flags.
    non_location_columns = test_data_loc[:, :n_measurements]
    test_data_loc_real = scaler.inverse_transform(non_location_columns)

    # Create a new DataFrame with real values
    test_data_loc_real_df = pd.DataFrame(test_data_loc_real, columns=['PM2.5', 'tem', 'wind degree', 'wind velocity', 'precipitation', 'humidity'])

    # Get the station and timestamp columns from the original test data
    test_data_loc_observation = sort_data.loc[test_rows, ['지점', '관측시점']].reset_index(drop=True)

    # Put the identifying columns back in front of the measurements
    test_data_loc_real_df = pd.concat([test_data_loc_observation, test_data_loc_real_df], axis=1)

    location_frames.append(test_data_loc_real_df)

# DataFrame.append was removed in pandas 2.0; concatenate all frames once.
all_test_data_real = pd.concat(location_frames, ignore_index=True)

# Sort the DataFrame by station and timestamp
all_test_data_real = all_test_data_real.sort_values(by=['지점', '관측시점']).reset_index(drop=True)

# Save the resulting DataFrame
all_test_data_real.to_csv('/content/new/result_47.csv', index=False)

One model with one-hot encoded locations and hyperparameter tuning

In [None]:
pip install keras-tuner

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from kerastuner import HyperModel, RandomSearch

# Fix the NumPy and TensorFlow seeds used by this experiment.
SEED = 51
np.random.seed(SEED)
tf.random.set_seed(SEED)

def has_missing_values(array):
    """Report whether *array* holds one or more NaN values."""
    is_nan = np.isnan(array)
    return np.any(is_nan)

def prepare_data(data, lookback, future, target_col=17):
    """Slice a series into (input window, target window) pairs.

    Args:
        data: 2-D array of shape (timesteps, features).
        lookback: Number of consecutive rows in each input window.
        future: Number of target values that directly follow each input window.
        target_col: Column of ``data`` the targets are read from. Defaults to
            17, the column this function has always used.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape (windows, lookback, features) and
        ``Y`` has shape (windows, future). When ``data`` is too short for a
        single window both arrays have zero rows but keep those trailing
        dimensions, so they can still be stacked with other results.
    """
    # The last usable window starts at len(data) - lookback - future, so that
    # index has to be part of the range (hence the + 1).
    n_windows = len(data) - lookback - future + 1

    X, Y = [], []
    for start in range(n_windows):
        split = start + lookback
        X.append(data[start:split, :])
        Y.append(data[split:split + future, target_col])

    if not X:
        return np.empty((0, lookback, data.shape[1])), np.empty((0, future))
    return np.array(X), np.array(Y)

def prepare_data_with_missing(data, lookback, future, target_col=17):
    """Build inference windows and report which ones have missing targets.

    Windows whose input rows contain NaN are skipped entirely. Every remaining
    window contributes its input to ``X``; its target (``target_col`` of the
    rows that follow) goes to ``Y`` only when it is complete, otherwise the
    window's start row is recorded in ``missing_indices``.

    Note that ``X`` and ``Y`` are therefore NOT row-aligned as soon as any
    window has a missing target: ``X`` holds all windows with complete input,
    ``Y`` only those whose target is complete as well.

    Args:
        data: 2-D array of shape (timesteps, features), may contain NaN.
        lookback: Number of rows in each input window.
        future: Number of target values following each input window.
        target_col: Column of ``data`` the targets are read from. Defaults to
            17, the column this function has always used.

    Returns:
        Tuple ``(X, Y, missing_indices)`` where ``missing_indices`` is a list
        of window start rows (indices into ``data``).
    """
    X, Y, missing_indices = [], [], []

    # + 1 so the window that ends exactly at the last row is considered too.
    for start in range(len(data) - lookback - future + 1):
        split = start + lookback
        input_sequence = data[start:split, :]

        if has_missing_values(input_sequence):
            continue

        X.append(input_sequence)

        output_sequence = data[split:split + future, target_col]
        if has_missing_values(output_sequence):
            missing_indices.append(start)
        else:
            Y.append(output_sequence)

    return np.array(X), np.array(Y), missing_indices

class AirQualityLSTMHyperModel(HyperModel):
    """Keras Tuner hypermodel for a two-layer LSTM multi-step regressor.

    Tunable hyperparameters: ``lstm_units`` (shared by both LSTM layers),
    ``dropout_1``, ``dropout_2`` and ``learning_rate``.
    """

    def __init__(self, input_shape, output_shape):
        """Store the fixed model dimensions.

        Args:
            input_shape: Shape of one input sample, passed to the first LSTM
                layer as ``input_shape``.
            output_shape: Number of units of the final Dense layer.
        """
        # Let the HyperModel base class initialise its own state first.
        super().__init__()
        self.input_shape = input_shape
        self.output_shape = output_shape

    def build(self, hp):
        """Build and compile a model for the hyperparameters in *hp*.

        Args:
            hp: Hyperparameter container supplied by the tuner.

        Returns:
            A compiled ``Sequential`` model trained with MAE loss.
        """
        # Requested once and reused for both LSTM layers. The original code
        # asked for "lstm_units" twice under the same name, which also yields
        # one shared width; a single variable makes that explicit.
        lstm_units = hp.Int("lstm_units", min_value=32, max_value=512, step=32)
        dropout_1 = hp.Float("dropout_1", min_value=0.1, max_value=0.5, step=0.1)
        dropout_2 = hp.Float("dropout_2", min_value=0.1, max_value=0.5, step=0.1)
        learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-2, sampling="LOG")

        model = Sequential()

        model.add(
            LSTM(
                units=lstm_units,
                activation="tanh",
                input_shape=self.input_shape,
                return_sequences=True,
            )
        )
        model.add(Dropout(dropout_1))

        model.add(LSTM(units=lstm_units, activation="tanh"))
        model.add(Dropout(dropout_2))

        model.add(Dense(self.output_shape))

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate),
            loss="mae",
        )

        return model

# NOTE: `sorted_data` (train) and `sort_data` (test) are not defined in this
# cell; they must already exist.
lookback = 2 * 24  # 2 days of hourly data
future = 3 * 24    # 3 days of hourly data
n_features = 17 + 6  # 17 one-hot encoded location columns and 6 other features in the data

locations = ['공주', '노은동', '논산', '대천2동', '독곶리', '동문동', '모종동', '문창동', '성성동', '신방동', '신흥동', '아름동', '예산군', '읍내동', '이원면', '정림동', '홍성읍']  # List all 17 locations

# Fitted scaler per location, reused on the test data
scalers = {}

# Add EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Use the encoder's default sparse output and densify it explicitly: the
# keyword that requested dense output was renamed (`sparse` ->
# `sparse_output`) between scikit-learn releases, `.toarray()` works on both.
encoder = OneHotEncoder()
encoded_locations = encoder.fit_transform(sorted_data['지점'].values.reshape(-1, 1)).toarray()
encoded_test_locations = encoder.transform(sort_data['지점'].values.reshape(-1, 1)).toarray()

# Collect per-location windows and concatenate once; np.vstack inside the
# loop would copy the whole growing array on every pass. The zero-row arrays
# pin the expected trailing shapes.
X_train_parts = [np.empty((0, lookback, n_features))]
Y_train_parts = [np.empty((0, future))]

for location in locations:
    train_rows = sorted_data['지점'] == location
    train_data_loc = sorted_data[train_rows]

    # Drop the station ('지점') and timestamp ('관측시점') columns
    train_data_loc = train_data_loc.drop(['지점', '관측시점'], axis=1).values

    # Scale the data. Rebind instead of assigning into the existing array so
    # the scaled values keep the scaler's float output dtype whatever dtype
    # the raw values had.
    scaler = MinMaxScaler()
    train_data_loc = scaler.fit_transform(train_data_loc)
    scalers[location] = scaler

    # One-hot location columns first, measurements after them
    location_encoded = encoded_locations[train_rows.to_numpy()]
    train_data_loc = np.hstack((location_encoded, train_data_loc))

    # Prepare the data for LSTM
    X_train, Y_train = prepare_data(train_data_loc, lookback, future)

    X_train_parts.append(X_train)
    Y_train_parts.append(Y_train)

X_train_all = np.concatenate(X_train_parts, axis=0)
Y_train_all = np.concatenate(Y_train_parts, axis=0)

# Hypermodel describing the search space
hypermodel = AirQualityLSTMHyperModel(input_shape=(lookback, n_features), output_shape=future)

# Random search over that space, scored on validation loss
tuner = RandomSearch(
    hypermodel,
    objective="val_loss",
    max_trials=20,
    executions_per_trial=1,
    directory="tuning",
    project_name="air_quality_lstm",
)

# Stop a run once validation loss has not improved for 10 epochs
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# The search and the final fit use the same training settings
fit_options = dict(epochs=100, verbose=0, validation_split=0.2, callbacks=[early_stopping])

# Search for the best hyperparameters
tuner.search(X_train_all, Y_train_all, **fit_options)

# Keep the single best configuration
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best Hyperparameters:")
for hp_name in ("lstm_units", "dropout_1", "dropout_2", "learning_rate"):
    print(f"{hp_name}: {best_hyperparameters.get(hp_name)}")

# Build a fresh model with the winning configuration and train it
best_model = tuner.hypermodel.build(best_hyperparameters)
history = best_model.fit(X_train_all, Y_train_all, **fit_options)

# Per-epoch loss curves recorded by fit()
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Keras takes the validation samples from the END of the arrays, splitting at
# floor(n * (1 - validation_split)). Reuse that boundary (0.2 is the
# validation_split passed to fit) so the two MAE figures cover exactly the
# rows the model did / did not train on.
split_at = int(len(X_train_all) * (1 - 0.2))

# Evaluate the tuned model (`best_model`), not a `model` left over from an
# earlier cell. Predict once; the validation predictions are the tail.
Y_train_pred = best_model.predict(X_train_all)
Y_val_pred = Y_train_pred[split_at:]

# Calculate the mean absolute error on each side of the split
train_mae = mean_absolute_error(Y_train_all[:split_at], Y_train_pred[:split_at])
val_mae = mean_absolute_error(Y_train_all[split_at:], Y_val_pred)

print(f"Train MAE: {train_mae}, Validation MAE: {val_mae}")

# Fill the missing target values of every location in the test set with
# predictions from the tuned model and collect the results, in original
# units, into one DataFrame.
# NOTE: `sort_data` is not defined in this cell; it must already exist.
location_frames = []

# The 17 one-hot location columns are stacked in front of the measurements,
# so the first measurement column (the target) sits at index 17.
target_col = 17

for location in locations:
    test_rows = sort_data['지점'] == location
    test_data_loc = sort_data[test_rows]

    # Drop the station ('지점') and timestamp ('관측시점') columns
    test_data_loc = test_data_loc.drop(['지점', '관측시점'], axis=1).values

    # Scale with the scaler fitted on this location's training data
    scaler = scalers[location]
    test_data_loc = scaler.transform(test_data_loc)

    # Add one-hot encoded location to the test_data_loc
    test_location_encoded = encoded_test_locations[test_rows.to_numpy()]
    test_data_loc = np.hstack((test_location_encoded, test_data_loc))

    # Prepare the data for LSTM
    X_test, Y_test, missing_indices = prepare_data_with_missing(test_data_loc, lookback, future)

    if missing_indices:
        # Remember which target values were missing BEFORE anything is
        # filled in, so observed values are never replaced by predictions.
        target_is_missing = np.isnan(test_data_loc[:, target_col])

        # Predict only the windows that need it, rebuilt from their start
        # rows; a position in missing_indices is not a position in X_test.
        # Use the tuned `best_model`, not a `model` from an earlier cell.
        X_missing = np.stack([test_data_loc[start:start + lookback] for start in missing_indices])
        Y_test_missing_pred = best_model.predict(X_missing)

        for missing_index, prediction in zip(missing_indices, Y_test_missing_pred):
            start = missing_index + lookback
            end = start + future
            gap = target_is_missing[start:end]
            test_data_loc[start:end, target_col] = np.where(gap, prediction, test_data_loc[start:end, target_col])

    # Inverse transform only the measurement columns; the scaler never saw
    # the one-hot flags.
    non_location_columns = test_data_loc[:, 17:]
    test_data_loc_real = scaler.inverse_transform(non_location_columns)

    # Create a new DataFrame with real values
    test_data_loc_real_df = pd.DataFrame(test_data_loc_real, columns=['PM2.5', 'tem', 'wind degree', 'wind velocity', 'precipitation', 'humidity'])

    # Get the station and timestamp columns from the original test data
    test_data_loc_observation = sort_data.loc[test_rows, ['지점', '관측시점']].reset_index(drop=True)

    # Put the identifying columns back in front of the measurements
    test_data_loc_real_df = pd.concat([test_data_loc_observation, test_data_loc_real_df], axis=1)

    location_frames.append(test_data_loc_real_df)

# DataFrame.append was removed in pandas 2.0; concatenate all frames once.
all_test_data_real = pd.concat(location_frames, ignore_index=True)

# Sort the DataFrame by station and timestamp
all_test_data_real = all_test_data_real.sort_values(by=['지점', '관측시점']).reset_index(drop=True)

# Save the resulting DataFrame
all_test_data_real.to_csv('/content/new/result_50.csv', index=False)

In [None]:
# Quick look at the imputed data
all_test_data_real.head(100)

# Rename a '일시' column to '관측시점' (columns not present are left untouched
# by rename), then write the frame that the answer sheet is merged with.
new_column = {'일시' : '관측시점'}
result = all_test_data_real.rename(columns=new_column)
result
result.to_csv('/content/new/result_50.csv', index=False)
result.head(50)

In [None]:
# Load every answer-sheet CSV from the sample directory into one DataFrame.
# NOTE: `glob` and `convert_year` are not defined in this cell; they must
# already exist.
os.chdir('/content/sample')

# glob returns matches in arbitrary order; sort so the concatenated answer
# sheet has the same row order on every run.
all_files = sorted(glob.glob('*.csv'))
dfs = [pd.read_csv(file) for file in all_files]

answer = pd.concat(dfs, ignore_index=True)

# Align the station column name with the prediction frame
new_column = {'측정소' : '지점'}
answer = answer.rename(columns= new_column)

# Build the '관측시점' merge key from the converted '연도' value and '일시'
answer['관측시점'] = answer['연도'].apply(convert_year) + '-' + answer['일시']
answer

In [None]:
# Merge the result DataFrame with the answer sheet
# Attach the predicted PM2.5 to each answer-sheet row (left join, so every
# answer row is kept), matching on timestamp and station.
merge_keys = ['관측시점', '지점']
final_data = answer.merge(result[merge_keys + ['PM2.5']], on=merge_keys, how='left')

# Drop the answer sheet's own PM2.5 column (suffixed _x by the merge) and the
# helper timestamp key, then restore the original column names.
final_data = final_data.drop(columns=['PM2.5_x', '관측시점'])
final_data = final_data.rename(columns={'PM2.5_y': 'PM2.5', '지점': '측정소'})
final_data

final_data.to_csv('/content/new/ans_50.csv', index=False)