In [222]:
from datetime import datetime as dt

import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor as dtr
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import GradientBoostingRegressor as gbr

from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Training targets: pull the file id out of the Google Drive share link and
# rebuild it as a `uc?id=<file id>` URL before handing it to pandas.
url = 'https://drive.google.com/file/d/1-4YpXkd2kIOM5viSRw8g7oOQm8sicciB/view?usp=sharing'
drive_file_id = url.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + drive_file_id
y_train = pd.read_csv(url, index_col=0)  # first CSV column becomes the index

# Test rows to predict: same link rewrite, different file.
url = 'https://drive.google.com/file/d/1-7VK3dNry2-AYnfRsxMWsOKhHHMTN_ZA/view?usp=sharing'
drive_file_id = url.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + drive_file_id
test_indices = pd.read_csv(url, index_col=0)

In [223]:
def month_to_season(month: int):
    """Return the quarter-style season label for a month number.

    Months below 4 map to "winter", below 7 to "spring", below 10 to
    "summer" and below 13 to "fall". Values of 13 or more match no
    bucket and yield ``None``.
    """
    season_upper_bounds = (
        (4, "winter"),
        (7, "spring"),
        (10, "summer"),
        (13, "fall"),
    )
    # Bounds are ascending, so the first one exceeding `month` is the bucket.
    for exclusive_upper, season in season_upper_bounds:
        if month < exclusive_upper:
            return season
    return None

In [224]:
# Load the raw weather readings from a local file. The next cell slices the
# first 8 characters of the `timestamp` column as year/month/day, so that
# column is expected to be read as strings.
df = pd.read_csv("weather.csv")

In [225]:
# Rewrite each raw timestamp as "YYYY-MM-DD" using its first 8 characters
# (chars 0-3 = year, 4-5 = month, 6-7 = day); anything after them is dropped.
# Vectorised .str slicing replaces the row-wise apply with `x[0]`, whose
# positional fallback on a label-indexed Series is deprecated in pandas >= 2.1.
# NOTE: not idempotent - re-running this cell on already formatted values
# would slice the dashes and corrupt the dates.
raw_timestamp = df["timestamp"]
df["timestamp"] = raw_timestamp.str[0:4] + "-" + raw_timestamp.str[4:6] + "-" + raw_timestamp.str[6:8]

In [226]:
# Average every column over rows sharing the same "YYYY-MM-DD" timestamp, then
# keep only days on or after 2009-12-28 (zero-padded ISO date strings compare
# in chronological order).
daily_means = df.groupby("timestamp").mean()
on_or_after_start = daily_means.index >= "2009-12-28"
df = daily_means[on_or_after_start]

In [227]:
# Collapse the daily rows into consecutive 7-row windows, one output row per
# window, labelled "<first day>/<last day>" from the window's index values.
days_per_week = 7
# Only complete windows are aggregated: a trailing partial window has no
# `start + 6` label (the original raised IndexError there), and dividing its
# sum by 7 would under-state the average.
n_complete_days = len(df.index) - len(df.index) % days_per_week

# Collect every window first and build the frame once, instead of growing it
# with pd.concat inside the loop (which re-copies all earlier rows each time).
weekly_averages = {}
for start in range(0, n_complete_days, days_per_week):
    week_label = f"{df.index[start]}/{df.index[start + days_per_week - 1]}"
    # sum() / 7 is kept from the original; unlike mean(), missing values
    # contribute 0 to the total instead of being excluded from the divisor.
    weekly_averages[week_label] = df.iloc[start:start + days_per_week].sum() / days_per_week

# Dict keys become columns, so transpose to get one row per week. Passing
# index=df.columns keeps the column set even when there are no complete weeks.
fitted_df = pd.DataFrame(weekly_averages, index=df.columns).transpose()

In [228]:
# Name the weekly index so it can be matched against a "year_weeks" column
# when the test rows are merged with the weather table later on.
fitted_df.index = fitted_df.index.rename("year_weeks")

In [229]:
# Attach the weekly weather columns to the training targets. Only the first
# len(y_train) weather rows are taken; pd.concat(axis=1) then aligns the two
# frames on their index labels.
# NOTE(review): this overwrites y_train, so re-running the cell appends the
# weather columns a second time.
weather_for_train = fitted_df.iloc[:len(y_train.index)]
y_train = pd.concat([y_train, weather_for_train], axis=1).rename_axis("year_weeks")

In [244]:
# Reshape the wide training frame into one row per (week, original column)
# and derive the model features from it.
weather_cols = ["Temperature", "Relative Humidity", "Wind Speed", "Wind Direction"]
train = y_train.reset_index()
melted_train = pd.melt(train, id_vars=["year_weeks"] + weather_cols, var_name="City", value_name="Weight")

# "City" holds the original column name; split it on "_" into its two parts
# (food, district). Vectorised .str access replaces the row-wise `x[0]`
# lookup, whose positional fallback on a label-indexed Series is deprecated
# in pandas >= 2.1. As before, names that do not split into exactly two parts
# make the assignment fail.
melted_train[["food", "district"]] = melted_train["City"].str.split("_", expand=True)

# year_weeks is "<start date>/<end date>"; month and day come from the END date.
week_end_parts = melted_train["year_weeks"].str.split("/").str[1].str.split("-")
melted_train["month"] = week_end_parts.str[1].astype(int)
melted_train["day"] = week_end_parts.str[2].astype(int)
melted_train["season"] = melted_train["month"].map(month_to_season)

# Keep only the columns used downstream. The original list named "year_weeks"
# twice, which produced a duplicate column; it is selected once here.
melted_train = melted_train[["year_weeks", "month", "day", "season"] + weather_cols
                            + ["food", "district", "Weight"]]

# Current feature set: one-hot month, food and district. The commented-out
# entries are alternative features that are not fed to the model.
X_train_df = pd.concat([pd.get_dummies(melted_train["month"]),
                        # pd.get_dummies(melted_train["day"]),
                        # pd.get_dummies(melted_train["season"]),
                        # melted_train["Temperature"],
                        # melted_train["Relative Humidity"],
                        # melted_train["Wind Speed"],
                        # melted_train["Wind Direction"],
                        pd.get_dummies(melted_train["food"]),
                        pd.get_dummies(melted_train["district"])], axis=1)

X_train = X_train_df.to_numpy()
Y_train = melted_train["Weight"].to_numpy()


In [245]:
# Column totals of the training frame; `zeros` lists the names of every
# column whose total is exactly 0, so predictions for them can be forced to 0.
ddd = y_train.sum().to_frame(name="sum")
zeros = ddd.index[ddd["sum"] == 0].tolist()

In [246]:
# 80/20 split with a fixed seed. NOTE: the split arrays are created but the
# booster below is fitted on the full X_train / Y_train, so nothing is held out.
X_train_08, X_test_02, Y_train_08, Y_test_02 = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)


param = {
    'max_depth': 5,
    'eta': 0.74,
    'gamma': 0.2,
    'objective': 'reg:tweedie',
    'nthread': 8,
    # 'auc' is a classification/ranking metric and does not apply to a Tweedie
    # regression objective; 'rmse' matches the RMSE computed after training.
    # No evaluation set is passed to xgb.train, so this does not change the
    # fitted model.
    'eval_metric': 'rmse',
}

# Train on every available row (not on the X_train_08 subset).
dtrain = xgb.DMatrix(X_train, label=Y_train)

num_rounds = 81
bst = xgb.train(param, dtrain, num_rounds)

In [247]:
# NOTE: despite its name, `dval` wraps the full training matrix X_train
# (without labels), so predictions made from it are in-sample.
dval = xgb.DMatrix(X_train)
# bst.predict(dtest)

In [248]:
# In-sample predictions for every training row.
Y_pred = bst.predict(dval)
# Clamp anything that is not >= 0 to 0. The original `np.array(map(...))`
# wrapped a lazy map iterator in a 0-d object array rather than building a
# numeric array; np.where applies the same rule element-wise.
Y_pred = np.where(Y_pred >= 0, Y_pred, 0)
melted_train["pred"] = Y_pred
# Force the prediction to 0 for every "<food>_<district>" series listed in
# `zeros` (series whose training total is 0). Matching on the re-joined name
# is equivalent to splitting each entry on "_" and comparing both parts, and
# it ignores entries without an underscore instead of failing on them.
zero_series_mask = (melted_train["food"] + "_" + melted_train["district"]).isin(zeros)
melted_train.loc[zero_series_mask, "pred"] = 0
Y_pred = melted_train["pred"].to_numpy()

In [249]:
# Root-mean-squared error of the predictions against Y_train. Y_pred was
# produced from the training matrix, so this is an in-sample score.
# The square root is taken explicitly because the `squared=False` argument
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(Y_train, Y_pred))

5616.7362071307425

In [250]:
# Move the index into a regular column, then discard the helper "index"
# column if reset_index produced one.
test_indices = test_indices.reset_index()
test_indices = test_indices.drop(columns=["index"], errors="ignore")

# year_weeks is "<start date>/<end date>"; split the END date into its
# "-"-separated parts once and reuse them for month and day.
week_end_parts = [week.split("/")[1].split("-") for week in test_indices.year_weeks]
test_indices["month"] = [int(parts[1]) for parts in week_end_parts]
test_indices["day"] = [int(parts[2]) for parts in week_end_parts]
test_indices["season"] = [month_to_season(month) for month in test_indices.month]

In [251]:
# Every distinct (food, district) pair seen in training, renumbered from 0.
foodDistrict = (
    melted_train[["food", "district"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [252]:
# One row per (test week, food, district): cross join the test weeks with all
# training pairs, then attach that week's weather.
X_cross = test_indices.merge(foodDistrict, how="cross")
# NOTE: inner join - test weeks with no matching row in fitted_df are dropped.
X_cross = X_cross.merge(fitted_df, on="year_weeks", how="inner")

# Same feature set as the training matrix: one-hot month, food and district.
# The commented-out entries are alternative features that are not used.
X_test_df = pd.concat([pd.get_dummies(X_cross["month"]),
                       # pd.get_dummies(X_cross["day"]),
                       # pd.get_dummies(X_cross["season"]),
                       # X_cross["Temperature"],
                       # X_cross["Relative Humidity"],
                       # X_cross["Wind Speed"],
                       # X_cross["Wind Direction"],
                       pd.get_dummies(X_cross["food"]),
                       pd.get_dummies(X_cross["district"])], axis=1)

# The booster is given positional arrays, so the test columns must match the
# training columns one-to-one. get_dummies only emits categories present in
# its input, so a month or category missing from the test rows would shift
# every later column. When the layouts differ, realign to the training
# columns (absent ones filled with 0) and restore the training dtype.
if not X_test_df.columns.equals(X_train_df.columns):
    X_test_df = X_test_df.reindex(columns=X_train_df.columns, fill_value=0).astype(X_train.dtype)

X_test = X_test_df.to_numpy()

In [253]:
print('predicting')
Y_pred = bst.predict(xgb.DMatrix(X_test))
# Clamp anything that is not >= 0 to 0. The original `np.array(map(...))`
# wrapped a lazy map iterator in a 0-d object array rather than building a
# numeric array; np.where applies the same rule element-wise.
Y_pred = np.where(Y_pred >= 0, Y_pred, 0)
print("predicted")
X_cross["pred"] = Y_pred
print("added to x_cross")
# Force the prediction to 0 for every "<food>_<district>" series listed in
# `zeros` (series whose training total is 0). Matching on the re-joined name
# is equivalent to splitting each entry on "_" and comparing both parts, and
# it ignores entries without an underscore instead of failing on them.
zero_series_mask = (X_cross["food"] + "_" + X_cross["district"]).isin(zeros)
X_cross.loc[zero_series_mask, "pred"] = 0
print("zeros")
Y_pred = X_cross["pred"].to_numpy()

predicting
predicted
added to x_cross
zeros


In [254]:
# Store the final predictions under "Weight", the column name revert() pivots on.
X_cross = X_cross.assign(Weight=Y_pred)

In [255]:
# Display the cross-joined test frame with its predictions (the rendered
# output below elides the middle rows).
X_cross

Unnamed: 0,year_weeks,month,day,season,food,district,Temperature,Relative Humidity,Wind Speed,Wind Direction,pred,Weight
0,2020-01-13/2020-01-19,1,19,winter,other,יהודה ושומרון,5.953981,68.345238,9.402214,173.161423,5.506851,5.506851
1,2020-01-13/2020-01-19,1,19,winter,other,מחוז הדרום,5.953981,68.345238,9.402214,173.161423,2162.872559,2162.872559
2,2020-01-13/2020-01-19,1,19,winter,other,מחוז המרכז,5.953981,68.345238,9.402214,173.161423,1798.610229,1798.610229
3,2020-01-13/2020-01-19,1,19,winter,other,מחוז הצפון,5.953981,68.345238,9.402214,173.161423,2881.233643,2881.233643
4,2020-01-13/2020-01-19,1,19,winter,other,מחוז חיפה,5.953981,68.345238,9.402214,173.161423,294.028931,294.028931
...,...,...,...,...,...,...,...,...,...,...,...,...
24623,2022-07-11/2022-07-17,7,17,summer,תפוחים,מחוז הדרום,23.255172,45.988095,7.823682,153.208377,500.202972,500.202972
24624,2022-07-11/2022-07-17,7,17,summer,תפוחים,מחוז המרכז,23.255172,45.988095,7.823682,153.208377,135.008774,135.008774
24625,2022-07-11/2022-07-17,7,17,summer,תפוחים,מחוז הצפון,23.255172,45.988095,7.823682,153.208377,1035.848755,1035.848755
24626,2022-07-11/2022-07-17,7,17,summer,תפוחים,מחוז חיפה,23.255172,45.988095,7.823682,153.208377,330.787445,330.787445


In [256]:
def revert(df):
    """Pivot long-format predictions back into a wide table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "year_weeks", "food", "district" and
        "Weight".

    Returns
    -------
    pandas.DataFrame
        Indexed by "year_weeks", with one column per "<food>_<district>"
        combination holding the matching "Weight" values. The columns axis
        name is cleared.

    Raises
    ------
    ValueError
        Raised by ``unstack`` when a (year_weeks, food, district) combination
        occurs more than once.
    """
    # Build the "<food>_<district>" key with vectorised string concatenation.
    # The original used a row-wise apply with x[0] / x[1], whose positional
    # fallback on a label-indexed Series is deprecated in pandas >= 2.1.
    # astype(str) mirrors the f-string formatting for non-string values.
    place_food = (df["food"].astype(str) + "_" + df["district"].astype(str)).rename("placeXfood")
    df = pd.concat([df["year_weeks"], place_food, df["Weight"]], axis=1)
    print("concat")
    df = df.set_index(["year_weeks", "placeXfood"])["Weight"].unstack()
    print("stacked")
    # Drop the "placeXfood" label so it is not written as a header by to_csv.
    df.columns.name = None
    return df
# Pass revert() only the four columns it reads and keep the wide result.
submission_cols = ["year_weeks", "food", "district", "Weight"]
finaldf = revert(X_cross[submission_cols])

concat
stacked


In [257]:
# Write the wide prediction table (one row per year_weeks value, one column
# per "<food>_<district>" series), index included, to Answers.csv.
finaldf.to_csv("Answers.csv")