In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.impute import SimpleImputer

In [2]:
historical_weather = pd.read_csv(r"C:\0A___________________________________\Predictor 1.0\historical_weather.csv")
test_data = pd.read_csv(r"C:\0A___________________________________\Predictor 1.0\submission_key.csv")

In [3]:
historical_weather.drop(columns=['snow_depth_mm'], inplace=True)

In [4]:
historical_weather['date'] = pd.to_datetime(historical_weather['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

In [5]:
historical_weather['city_id'] = historical_weather['city_id'].str.extract('(\d+)').astype(int)
test_data['city_id'] = test_data['city_id'].str.extract('(\d+)').astype(int)

In [6]:
imputer = SimpleImputer(strategy='mean')

In [7]:
historical_weather[['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm','avg_wind_dir_deg', 'avg_wind_speed_kmh']] = imputer.fit_transform(historical_weather[['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm','avg_wind_dir_deg', 'avg_wind_speed_kmh']])

In [8]:
historical_weather['temp_range'] = historical_weather['max_temp_c'] - historical_weather['min_temp_c']
historical_weather['cum_precip'] = historical_weather['precipitation_mm'].cumsum()
historical_weather['temp_precip_interaction'] = historical_weather['avg_temp_c'] * historical_weather['precipitation_mm']

In [9]:
historical_weather['rolling_mean_temp'] = historical_weather['avg_temp_c'].rolling(window=7).mean()
historical_weather['rolling_std_temp'] = historical_weather['avg_temp_c'].rolling(window=7).std()
historical_weather['rolling_sum_precip'] = historical_weather['precipitation_mm'].rolling(window=7).sum()

In [10]:
def create_features(df):
    """
    Create time series features based on time series index.
    """
    df = df.copy()
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    return df

historical_weather = create_features(historical_weather)

In [11]:
def create_lag_features(df, lags, cols):
    for col in cols:
        for lag in lags:
            df[f'{col}_lag_{lag}'] = df.groupby('city_id')[col].shift(lag)
    return df

In [12]:
lag_features = ['avg_temp_c']
lags = [1, 2, 3, 4, 5, 6, 7]  # 1-day, 2-day, 3-day, 7-day, 14-day, and 30-day lags
historical_weather = create_lag_features(historical_weather, lags, lag_features)

In [13]:
historical_weather

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,temp_range,cum_precip,...,dayofyear,dayofmonth,weekofyear,avg_temp_c_lag_1,avg_temp_c_lag_2,avg_temp_c_lag_3,avg_temp_c_lag_4,avg_temp_c_lag_5,avg_temp_c_lag_6,avg_temp_c_lag_7
0,1,2014-01-01,6.6,-1.4,11.6,3.679964,168.000000,6.2,13.0,3.679964,...,1,1,1,,,,,,,
1,1,2014-01-02,9.3,6.3,13.3,3.679964,155.000000,10.0,7.0,7.359929,...,2,2,1,6.6,,,,,,
2,1,2014-01-03,7.6,1.9,14.0,3.679964,175.650289,5.8,12.1,11.039893,...,3,3,1,9.3,6.6,,,,,
3,1,2014-01-04,7.6,3.9,13.3,3.679964,291.000000,11.3,9.4,14.719857,...,4,4,1,7.6,9.3,6.6,,,,
4,1,2014-01-05,8.6,0.5,16.9,3.679964,175.650289,5.0,16.4,18.399821,...,5,5,1,7.6,7.6,9.3,6.6,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182333,112,2018-12-27,22.0,15.9,27.9,0.000000,100.000000,13.2,12.0,670978.229896,...,361,27,52,22.3,21.9,20.6,19.3,18.5,19.2,21.0
182334,112,2018-12-28,21.9,14.9,28.2,0.000000,91.000000,12.6,13.3,670978.229896,...,362,28,52,22.0,22.3,21.9,20.6,19.3,18.5,19.2
182335,112,2018-12-29,22.4,16.3,28.2,0.000000,61.000000,14.2,11.9,670978.229896,...,363,29,52,21.9,22.0,22.3,21.9,20.6,19.3,18.5
182336,112,2018-12-30,21.6,18.5,26.6,1.600000,70.000000,17.0,8.1,670979.829896,...,364,30,52,22.4,21.9,22.0,22.3,21.9,20.6,19.3


In [14]:
historical_weather = historical_weather.dropna()

In [15]:
# scaler = StandardScaler()
# scaler = scaler.fit(historical_weather[['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm','avg_wind_dir_deg', 'avg_wind_speed_kmh']])
# historical_weather_scaled = scaler.transform(historical_weather[['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm','avg_wind_dir_deg', 'avg_wind_speed_kmh']])

In [16]:
# def prepare_data_for_lstm(data, n_past=14, n_future=7):
#     X, Y = [], []
#     for i in range(n_past, len(data) - n_future + 1):
#         X.append(data[i - n_past:i, :])  # past days data
#         Y.append(data[i:i + n_future, 0])  # future days target (avg_temp_c)
#     return np.array(X), np.array(Y)

# trainX, trainY = prepare_data_for_lstm(historical_weather_scaled)

In [17]:
# # Define the LSTM model
# model = Sequential()
# model.add(LSTM(64, activation='relu', input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=True))
# model.add(LSTM(32, activation='relu', return_sequences=False))
# model.add(Dropout(0.2))
# model.add(Dense(trainY.shape[1]))  # Output layer for predicting avg_temp_c

# model.compile(optimizer='adam', loss='mse')
# model.summary()

In [18]:
# history = model.fit(trainX, trainY, epochs=10, batch_size=16, validation_split=0.1, verbose=1)

In [19]:
# plt.plot(history.history['loss'], label='Training loss')
# plt.plot(history.history['val_loss'], label='Validation loss')
# plt.legend()

In [20]:
# predictions = []
# for index, row in test_data.iterrows():
#     city_id = row['city_id']
#     date = row['date']
#     city_data = historical_weather[(historical_weather['city_id'] == city_id) & (historical_weather['date'] <= date)]
#     city_data_scaled = scaler.transform(city_data[['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm','avg_wind_dir_deg', 'avg_wind_speed_kmh']])
    
#     # Reshape data for LSTM input
#     X = np.expand_dims(city_data_scaled[-14:, :], axis=0)  # Using last 14 days data
#     prediction = model.predict(X)
#     predictions.append(prediction[0][0])  # Predicted avg_temp_c for the date

# # Create submission dataframe
# submission = pd.DataFrame({
#     'submission_ID': test_data['submission_ID'],
#     'avg_temp_c': predictions
# })

# # Save submission to CSV file
# submission.to_csv('my_submission.csv', index=False)

In [21]:
# Load and preprocess the data
# df = pd.read_csv('/mnt/data/historical_weather.csv')
# df['date'] = pd.to_datetime(df['date'])
# df = create_features(df)

from sklearn.preprocessing import MinMaxScaler

# Function to prepare data for LSTM
def prepare_lstm_data(df, city_id, sequence_length=30):
    city_df = df[df['city_id'] == city_id].sort_values('date')
    features = ['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm','avg_wind_dir_deg', 'avg_wind_speed_kmh','dayofweek', 'quarter', 'month','year', 'dayofyear','dayofmonth','weekofyear','avg_temp_c_lag_1','avg_temp_c_lag_2',
                'avg_temp_c_lag_3','avg_temp_c_lag_4','avg_temp_c_lag_5','avg_temp_c_lag_6','avg_temp_c_lag_7']
    city_df = city_df[features].dropna()
    
    scaler = MinMaxScaler()
    city_scaled = scaler.fit_transform(city_df)
    
    X, y = [], []
    for i in range(sequence_length, len(city_scaled) - 7):
        X.append(city_scaled[i-sequence_length:i])
        y.append(city_scaled[i:i+7, 0])  # Predict next 7 days of avg_temp_c

    X, y = np.array(X), np.array(y)
    return X, y, scaler

# Function to build and train LSTM model
def train_lstm(X_train, y_train):
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(7))  # Predicting 7 days
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)
    return model

# Function to make predictions
def predict_next_7_days(model, X, scaler, num_features):
    prediction_scaled = model.predict(X[-1].reshape(1, X.shape[1], X.shape[2]))
    padding = np.zeros((7, num_features - 1))
    prediction = scaler.inverse_transform(np.hstack([prediction_scaled.reshape(-1, 1), padding]))[:, 0]
    return prediction

# List to store predictions for all cities
all_cities_predictions = []

# Iterate through all unique city IDs
unique_city_ids = historical_weather['city_id'].unique()

for city_id in unique_city_ids:
    X, y, scaler = prepare_lstm_data(historical_weather, city_id)
    if X.shape[0] > 0:  # Ensure there is enough data for training
        model = train_lstm(X, y)
        num_features = X.shape[2]
        predictions = predict_next_7_days(model, X, scaler, num_features)
        all_cities_predictions.append({
            'city_id': city_id,
            'predictions': predictions
        })

# Display the predictions
all_cities_predictions

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

[{'city_id': 1,
  'predictions': array([9.10085789, 9.2321787 , 9.27823824, 9.40708762, 9.82537316,
         9.82248133, 8.98164006])},
 {'city_id': 2,
  'predictions': array([15.46181645, 15.98717589, 15.89800155, 16.41179485, 15.60803583,
         15.58285034, 15.15273545])},
 {'city_id': 3,
  'predictions': array([26.66934131, 26.6943547 , 26.55779666, 26.56657813, 26.51457437,
         26.61774511, 26.6673884 ])},
 {'city_id': 4,
  'predictions': array([-0.78315079, -0.44990084, -0.70125847, -1.44932657, -0.18708348,
         -1.63010522, -1.94730885])},
 {'city_id': 5,
  'predictions': array([21.94118414, 22.5399091 , 21.86908946, 22.26152099, 21.7989493 ,
         22.02312257, 21.75671896])},
 {'city_id': 7,
  'predictions': array([18.63508084, 18.82743372, 18.11143695, 18.02829984, 18.08641085,
         18.42282801, 17.86191514])},
 {'city_id': 8,
  'predictions': array([3.16078183, 3.03701239, 2.72786849, 3.01703868, 3.1077681 ,
         2.98279572, 1.59576563])},
 {'city_id': 

In [26]:
len(all_cities_predictions)

100

In [27]:
results = []

for city_prediction in all_cities_predictions:
    city_id = city_prediction['city_id']
    predictions = city_prediction['predictions']
    for i, pred in enumerate(predictions):
        results.append({'city_id': city_id, 'day': i + 1, 'predicted_avg_temp_c': pred})

df_predictions = pd.DataFrame(results)

In [28]:
df_predictions

Unnamed: 0,city_id,day,predicted_avg_temp_c
0,1,1,9.100858
1,1,2,9.232179
2,1,3,9.278238
3,1,4,9.407088
4,1,5,9.825373
...,...,...,...
695,112,3,20.838765
696,112,4,20.931866
697,112,5,21.721258
698,112,6,22.215653


In [29]:
submission = pd.DataFrame({
    'submission_ID': test_data['submission_ID'],
    'avg_temp_c': df_predictions['predicted_avg_temp_c']
})

# Save submission to CSV file
submission.to_csv('submission_LSTM_3.csv', index=False)