In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tqdm.notebook import tqdm

import matplotlib
from matplotlib import rc

# Global plotting defaults: sans-serif DejaVu font, 200-dpi figures, and axes
# drawn without the top/right spines, with no x margin and a small y margin.
font_settings = {"family": "sans-serif", "sans-serif": "DejaVu Sans"}
axes_settings = {
    "spines.right": False,
    "spines.top": False,
    "xmargin": 0.0,
    "ymargin": 0.05,
}
rc("font", **font_settings)
rc("figure", dpi=200)
rc("axes", **axes_settings)

# Let pandas print up to 200 rows before truncating.
pd.set_option("display.max_rows", 200)


In [None]:
# Training split of the challenge data (includes the `fact_temperature` column
# that is used as the regression target further down).
train_csv_path = '../input/eurecom-aml-2022-challenge-1/public/train.csv'
df = pd.read_csv(train_csv_path, low_memory=True)


In [None]:
# Test split of the challenge data (features only, as far as this notebook uses it).
test_csv_path = '../input/eurecom-aml-2022-challenge-1/public/test_feat.csv'
df_test = pd.read_csv(test_csv_path, low_memory=True)


In [None]:
# Preview the first rows of the freshly loaded training frame.
df.head()


# Data analysis

In this challenge, you are free (and encouraged) to explore in depth the data you have, you can run simple queries on the data, perform exploration and compute statistics.

**NOTE**: finding the right question to ask is difficult! Don't be afraid to complement your analysis with your own questions. This can give you extra points!

**NOTE 2**: the presentation quality is critical in any business-oriented data analysis. Take time to create few but informative plots, rather than endless tables!


In [None]:
# Distinct (latitude, longitude) pairs that occur in each split.
coordinate_columns = ['fact_latitude', 'fact_longitude']
tr_coordinates = df.loc[:, coordinate_columns].drop_duplicates()
te_coordinates = df_test.loc[:, coordinate_columns].drop_duplicates()


In [None]:
# Left-join the test locations onto the train locations. The `_merge`
# indicator column records whether a test location also exists in train
# ('both') or only in test ('left_only').
merged = pd.merge(te_coordinates, tr_coordinates, how='left', indicator=True)
only_in_test = merged['_merge'] == 'left_only'
also_in_train = merged['_merge'] == 'both'
tropic_coordinates = merged[only_in_test]
common_coordinates = merged[also_in_train]


In [None]:
# Two panels sharing both axes: train locations on the left, test locations
# on the right, the latter split by whether the location also occurs in train.
fig, (ax0, ax1) = plt.subplots(
    nrows=1, ncols=2, figsize=[10, 3], dpi=200, sharex=True, sharey=True)

ax0.scatter(
    tr_coordinates['fact_longitude'],
    tr_coordinates['fact_latitude'],
    s=1,
    c='tab:green',
    label='Train data',
)
ax1.scatter(
    common_coordinates['fact_longitude'],
    common_coordinates['fact_latitude'],
    s=1,
    c='tab:orange',
    label='In-domain test data',
)
ax1.scatter(
    tropic_coordinates['fact_longitude'],
    tropic_coordinates['fact_latitude'],
    s=1,
    c='tab:red',
    label='Out-domain test data',
)

ax0.legend()
ax1.legend()
fig.suptitle(
    'Distribution of train and test points in the dataset',
    y=1.01,
    fontsize=14,
)
plt.show()


**Time vs Temperature Analysis**


In [None]:
# Build `timetemp`: one row per observation holding the timestamp, the
# calendar fields derived from it, and the observed temperature.

# `fact_time` is converted with unit='s', i.e. it is read as a number of
# seconds; preview a few raw values before converting.
print(df['fact_time'].head())
b = pd.to_datetime(df['fact_time'], unit='s').to_frame()
targ = df['fact_temperature']

# Derive every calendar field from the already-parsed datetime column; there
# is no need to call `pd.to_datetime` again for each field.
fact_dt = b['fact_time'].dt
timetemp = b.assign(
    Date=fact_dt.date,
    Time=fact_dt.time,
    Month=fact_dt.month,
    Weekday=fact_dt.weekday,  # pandas convention: Monday=0 ... Sunday=6
    Temp=targ,  # aligned on the shared row index; stays the last column
)

# Rich preview instead of printing the whole frame.
timetemp.head()


In [None]:
# Which calendar years and months do the observations span?
z = pd.to_datetime(timetemp['Date']).dt.year
years_present = z.unique()
months_present = timetemp['Month'].unique()

print(years_present)

print(months_present)


In [None]:
# Mean temperature for each day of the week.
day = timetemp.groupby('Weekday').Temp.agg(['mean'])

# `Series.dt.weekday` encodes Monday as 0 and Sunday as 6, so the labels must
# start on Monday. They are looked up from the group index so that a weekday
# missing from the data cannot shift the remaining labels.
weekday_names = ['mon', 'tue', 'wed', 'thurs', 'fri', 'sat', 'sun']
day_w = [weekday_names[weekday] for weekday in day.index]

# seaborn >= 0.12 only accepts x and y as keyword arguments.
ax1 = sns.barplot(x=day_w, y=day['mean'])
ax1.bar_label(ax1.containers[0])

ax1.set_xlabel("Day_of_week", fontsize=10)
ax1.set_ylabel("Mean temperature", fontsize=10)
ax1.set_title('Day_of_week Vs Mean Temperature Analysis');


The mean temperature is almost the same across all days of the week, so we now move on to a deeper analysis.


In [None]:
# Mean temperature for each calendar month.
monthm = timetemp.groupby('Month').Temp.agg(['mean'])

month = timetemp.Month.unique()

# Plot from the grouped frame itself so the x values and the means come from
# the same rows, and pass x/y by keyword (required by seaborn >= 0.12).
ax = sns.barplot(data=monthm.reset_index(), x='Month', y='mean')
ax.bar_label(ax.containers[0])
ax.set_xlabel("Month", fontsize=10)
ax.set_ylabel("Average temperature", fontsize=10)
ax.set_title('Month Vs Mean Temperature Analysis');


The data covers September, October, November and December of 2018, and January and February of 2019. The average temperature is high in September and October, as these months are close to summer. It keeps decreasing from November until January because of winter, and starts increasing again in February with the beginning of spring.


In [None]:
# Per-month mean, minimum and maximum temperature, with `Month` as a column.
data = timetemp.groupby('Month').Temp.agg(['mean', 'min', 'max']).reset_index()
print(data)

months, means = data['Month'], data['mean']
mins, maxs = data['min'], data['max']

# Means as black dots, with asymmetric bars reaching down to the monthly
# minimum and up to the monthly maximum.
distance_to_min = means - mins
distance_to_max = maxs - means
plt.errorbar(months, means, fmt='ok')
plt.errorbar(months, means, [distance_to_min, distance_to_max],
             ecolor='blue', lw=1)
plt.xlim(0, 13)
plt.title('Monthly Analysis')

# Mean line (drawn in white) together with the shaded min-max band.
ax = data.plot(x='Month', y='mean', c='white')
plt.fill_between(x='Month', y1='min', y2='max', data=data)


The minimum, maximum and the average temperature were plotted and the behaviour was as expected as shown in the plots.


In [None]:
def plot_weekday_range(axis, month_frame, month_name):
    """Draw the per-weekday temperature summary of one month on ``axis``.

    Parameters
    ----------
    axis : matplotlib Axes
        Panel to draw on.
    month_frame : pandas.DataFrame
        Rows of ``timetemp`` for a single month; must contain the ``Weekday``
        and ``Temp`` columns.
    month_name : str
        Month name used in the x-axis label.

    Returns
    -------
    pandas.DataFrame
        The per-weekday ``mean``/``min``/``max`` table that was plotted.
    """
    weekday_stats = (
        month_frame.groupby('Weekday').Temp.agg(['mean', 'min', 'max']).reset_index()
    )
    days = weekday_stats['Weekday']
    means = weekday_stats['mean']
    mins = weekday_stats['min']
    maxs = weekday_stats['max']

    # Means as black dots, with bars spanning the observed min-max range.
    axis.errorbar(days, means, fmt='ok')
    axis.errorbar(days, means, [means - mins, maxs - means],
                  ecolor='blue', lw=1)
    axis.set_xlabel(f"{month_name} Days", fontsize=8)
    axis.set_ylabel("Temparature", fontsize=8)
    # Same y-range on every panel so the months can be compared visually.
    axis.set_ylim([-50, 60])
    return weekday_stats


# (month number, month name), in the order the panels are laid out row by row.
month_panels = [
    (9, 'September'),
    (10, 'October'),
    (11, 'November'),
    (12, 'December'),
    (1, 'January'),
    (2, 'February'),
]

fig, ax = plt.subplots(2, 3, sharex=True, figsize=(10, 5))

# One panel per month. Looping (instead of six copy-pasted sections) guarantees
# that every panel, January included, receives the same y-limits.
for axis, (month_number, month_name) in zip(ax.flat, month_panels):
    plot_weekday_range(
        axis, timetemp.loc[timetemp['Month'] == month_number], month_name)

# The x-axis is shared, so setting the limits on one panel applies to all.
ax[0, 0].set_xlim(-1, 7)

fig.suptitle('Monthly Days vs Temperature Analysis');


The temperature for each day of the week was plotted separately for every month, because the aggregate day-of-week plot was not informative. During the weekends, the temperature was at its extremes (highest during the summer months and lowest during the winter months). The remaining days showed behaviour similar to one another within each month.


**Statistical information about the dataset**


In [None]:
# Summary statistics of the training frame.
df.describe()


In [None]:
# Check the dtype and non-null count of every column.
df.info()

**Missing Data per column**


In [None]:
# Number of missing entries in every column of the training frame.
df_missing = df.isnull().sum(axis=0)
df_missing

**SKEW Plot**


In [None]:
from scipy import stats

# Columns whose skewness is computed and plotted below.

testcol60 = [
    'index',
    'wrf_t2_interpolated',
    'wrf_t2_next',
    'gfs_temperature_97500',
    'gfs_temperature_95000',
    'climate_temperature',
    'gfs_temperature_92500',
    'gfs_temperature_90000',
    'gfs_temperature_85000',
    'gfs_temperature_80000',
    'cmc_0_0_6_2',
    'gfs_temperature_75000',
    'gfs_temperature_70000',
    'gfs_temperature_65000',
    'cmc_0_1_0_0',
    'gfs_temperature_60000',
    'gfs_temperature_55000',
    'gfs_temperature_50000',
    'gfs_temperature_45000',
    'gfs_temperature_40000',
    'gfs_2m_dewpoint_next',
    'gfs_2m_dewpoint',
    'gfs_temperature_35000',
    'gfs_temperature_30000',
    'cmc_0_3_5_500',
    'gfs_precipitable_water',
    'gfs_temperature_25000',
    'cmc_0_3_5_700',
    'gfs_temperature_10000',
    'gfs_temperature_15000',
    'fact_time',
    'gfs_temperature_7000',
    'fact_latitude',
    'cmc_0_0_7_2',
    'cmc_0_3_1_0',
    'sun_elevation',
    'cmc_0_3_5_1000',
    'cmc_0_3_5_850',
    'gfs_humidity',
    'cmc_0_2_2_500',
    'cmc_0_2_2_700',
    'cmc_0_1_66_0_next',
    'cmc_0_1_66_0',
    'gfs_temperature_20000',
    'gfs_a_vorticity',
    'wrf_rh2',
    'gfs_total_clouds_cover_middle',
    'gfs_total_clouds_cover_low',
    'gfs_total_clouds_cover_low_next',
    'cmc_0_2_2_850',
    'cmc_0_0_7_1000',
    'gfs_cloudness',
    'topography_bathymetry',
    'fact_longitude',
    'cmc_0_2_2_925',
    'cmc_0_2_2_10',
    'climate_pressure',
    'gfs_r_velocity',
    'cmc_0_3_0_0',
    'cmc_0_1_66_0_grad',
    'gfs_pressure',
    'fact_temperature'
]


# Skewness of each listed column, in the order the columns appear in `testcol60`.
skew_val = list(df.loc[:, testcol60].skew())

# Line plot of the skew scores against the position of the column in the list.
plt.plot(skew_val)
plt.ylim(-3, 21)
plt.xlabel("i(th) Feature\n Skew Plot")
plt.ylabel("Skew Score")
plt.title("Skew Plot of most related features")

# Persist the figure before showing it.
plt.savefig('/kaggle/working/skew_plot.png')
plt.show()


The overall dataset is very slightly skewed with a skew score of -0.077248, with individual columns going as high as 21.0. The skew scores are seen on the plot.


**Distribution**


In [None]:
# One histogram per column of the training frame, drawn on a single large figure.
df.hist(figsize=(40, 40))
plt.show()


The distribution of every feature is checked for normality using the Shapiro-Wilk test. Features that deviate from the normal bell curve have a p-value of 0, while the others have a p-value of 1.0.

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk normality test on every column of the training frame.
# `stat` collects the test statistics and `p` the p-values, in column order.
cols = df.columns
stat = []
p = []
for i in cols:
    # Missing values have not been imputed yet at this point in the notebook,
    # so test only the observed values of the column.
    observed = df[i].dropna()
    if len(observed) < 3:
        # The test needs at least three observations.
        stat_i, p_i = np.nan, np.nan
    else:
        # NOTE(review): SciPy warns that the p-value may be inaccurate for
        # very large samples; treat the result as a rough indicator only.
        stat_i, p_i = shapiro(observed)
    # Store the statistic itself (the column name is already available in `cols`).
    stat.append(stat_i)
    p.append(p_i)

plt.plot(p)
plt.title("Normality Check of All features")

# Add labels

plt.xlabel("i(th) Feature")
plt.ylabel("P-Value")
plt.show()

# Data Pre-processing

The previous step should give you a better understanding of which pre-processing is required for the data. This may include:

- Normalising and standardising the given data;
- Removing outliers;
- Carrying out feature selection, possibly using metrics derived from information theory;
- Handling missing information in the dataset;
- Augmenting the dataset with external information;
- Combining existing features.

Below is a very basic example of pre-processing steps.


**Handling Missing Information**

We resolve all the missing values by putting the mean.


In [None]:
# Replace every missing value by the mean of its column, modifying `df` in place.
column_means = df.mean()
df.fillna(value=column_means, inplace=True)

# An alternative based on sklearn's SimpleImputer(strategy='mean') applied to
# df.iloc[:, 1:-1] was tried here and is not used.


In [None]:
# Preview the training frame again after mean imputation.
df.head()


# **Feature Selection**


**Correlation Matrix**


In [None]:
# Absolute correlation between every pair of columns. The absolute value is
# applied to the correlation coefficients, not to the raw data: taking
# `np.abs` of the data first changes the coefficients of every column that
# contains negative values.
corrmat = df.corr().abs()

# Boolean mask hiding the upper triangle (diagonal included), since the matrix
# is symmetric.
mask = np.triu(np.ones(corrmat.shape, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(30, 30))
    ax = sns.heatmap(corrmat, mask=mask, square=True, cmap="YlGnBu")


**UNIVARIATE Selection**


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Candidate features are every column except the first and the last one; the
# last column is used as the regression target.
# NOTE(review): presumably the first column is the row identifier and the last
# one is `fact_temperature` - confirm against `df.columns`.
feature_frame = df.iloc[:, 1:-1]
target = df.iloc[:, -1]

bestfeatures = SelectKBest(score_func=f_regression, k=60)
fit = bestfeatures.fit(feature_frame, target)
dfscores = pd.DataFrame(fit.scores_)
# Take the names from the frame that was actually scored; using `df.columns`
# here would pair every score with the name of the preceding column.
dfcolumns = pd.DataFrame(feature_frame.columns)
# concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
print(featureScores.nlargest(60, 'Score'))


**Mutual information between the features and the target**


In [None]:
# from sklearn.feature_selection import mutual_info_regression
# mutual_info = mutual_info_regression(df.iloc[:, 1:-1], df.iloc[:, -1])
# mutual_info = pd.Series(mutual_info)
# mutual_info.index = df.iloc[:, 1:-1].columns
# mutual_info.sort_values(ascending=False).plot.bar(figsize=(30, 30))


We select features with:

- High correlation with fact_temperature
- High mutual information between the features and fact_temperature
- Best features from univariate selection


In [None]:
# Columns kept for modelling. The order matters: the cells below slice by
# position, dropping the first column ('index') and using the last one
# ('fact_temperature') as the target.
col_selection = [
    'index',
    'wrf_t2_interpolated',
    'wrf_t2_next',
    'gfs_temperature_97500',
    'gfs_temperature_95000',
    'climate_temperature',
    'gfs_temperature_92500',
    'gfs_temperature_90000',
    'gfs_temperature_85000',
    'gfs_temperature_80000',
    'cmc_0_0_6_2',
    'gfs_temperature_75000',
    'gfs_temperature_70000',
    'gfs_temperature_65000',
    'cmc_0_1_0_0',
    'gfs_temperature_60000',
    'gfs_temperature_55000',
    'gfs_temperature_50000',
    'gfs_temperature_45000',
    'gfs_temperature_40000',
    'gfs_2m_dewpoint_next',
    'gfs_2m_dewpoint',
    'gfs_temperature_35000',
    'gfs_temperature_30000',
    'cmc_0_3_5_500',
    'gfs_precipitable_water',
    'gfs_temperature_25000',
    'cmc_0_3_5_700',
    'gfs_temperature_10000',
    'gfs_temperature_15000',
    'fact_time',
    'gfs_temperature_7000',
    'fact_latitude',
    'cmc_0_0_7_2',
    'cmc_0_3_1_0',
    'sun_elevation',
    'cmc_0_3_5_1000',
    'cmc_0_3_5_850',
    'gfs_humidity',
    'cmc_0_2_2_500',
    'cmc_0_2_2_700',
    'cmc_0_1_66_0_next',
    'cmc_0_1_66_0',
    'gfs_temperature_20000',
    'gfs_a_vorticity',
    'wrf_rh2',
    'gfs_total_clouds_cover_middle',
    'gfs_total_clouds_cover_low',
    'gfs_total_clouds_cover_low_next',
    'cmc_0_2_2_850',
    'cmc_0_0_7_1000',
    'gfs_cloudness',
    'topography_bathymetry',
    'fact_longitude',
    'cmc_0_2_2_925',
    'cmc_0_2_2_10',
    'climate_pressure',
    'gfs_r_velocity',
    'cmc_0_3_0_0',
    'cmc_0_1_66_0_grad',
    'gfs_pressure',
    'fact_temperature'
]


In [None]:
# Keep only complete rows of the selected columns, then split them by
# position: everything between the first and last column is a feature, the
# last column is the target.
selected_rows = df[col_selection].dropna()
X = selected_rows.iloc[:, 1:-1].values
y = selected_rows.iloc[:, -1].values


In [None]:
# Per-feature and target statistics, computed before scaling. The mean and
# standard deviation are reused later to scale the test features and to
# un-standardise predictions.
Xmean, Xstd = X.mean(0), X.std(0)
ymean, ystd = y.mean(), y.std()
Xmin, Xmax = X.min(axis=0), X.max(axis=0)
ymin, ymax = y.min(axis=0), y.max(axis=0)

# Standardise to zero mean and unit variance. This overwrites X and y, so
# re-running the cell on its own would scale already-scaled data.
X = (X - Xmean) / Xstd
y = (y - ymean) / ystd


# Model Selection

Perhaps one of the most important segments of this challenge involves the selection of a model that can successfully handle the given data and yield sensible predictions. Instead of focusing exclusively on your final chosen model, it is also important to share your thought process in this notebook by additionally describing alternative candidate models. There is a wealth of models to choose from, such as decision trees, random forests, (Bayesian) neural networks, Gaussian processes, Lasso regression, and so on. There are several factors which may influence your decision:

- What is the model's complexity?
- Is the model interpretable?
- Is the model capable of handling different data-types?
- Does the model return uncertainty estimates along with predictions?

In this baseline solution, we use the Lasso regression model, which is a linear least-square model with L1 regularization on its parameters. There is a hyper-parameter that should be tuned which is the regularization strength α. Intuitively, this hyper-parameter controls the amount of shrinkage of the parameters of the model: the larger the value of α the greater the amount of shrinkage.

Section 3.4.1 of the book The Elements of Statistical Learning: Data Mining, Inference, and Prediction from Trevor Hastie et al. (https://hastie.su.domains/Papers/ESLII.pdf) is a good reference for some classic regression models.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10000 samples as a validation set; the seed is fixed so the split
# is the same on every run.
Xtr, Xval, ytr, yval = train_test_split(
    X,
    y,
    test_size=10000,
    random_state=1,
)


In [None]:
def compute_rmse(y, ypred, ystd=1.):
    """Root mean squared error between ``y`` and ``ypred``, rescaled by ``ystd``.

    Passing the standard deviation used to standardise the target as ``ystd``
    reports the error in the target's original units.
    """
    residuals = y - ypred
    mean_squared_error = np.mean(residuals ** 2)
    return mean_squared_error ** 0.5 * ystd


Find Best alpha for Lasso model


In [None]:
from sklearn.linear_model import LassoCV

# Pick the Lasso regularisation strength with 10-fold cross-validation
# (cv=10) on the training split.
lasso_modelCV = LassoCV(max_iter=10000, cv=10, random_state=0)
lasso_modelCV.fit(Xtr, ytr)

# Regularisation strength selected by the cross-validation.
alpha = lasso_modelCV.alpha_
print(alpha)


In [None]:
from sklearn.linear_model import Lasso

# Fit a Lasso model with the cross-validated alpha and score it on both splits.
lasso_model = Lasso(alpha=alpha)
lasso_model.fit(Xtr, ytr)

ypred_tr = lasso_model.predict(Xtr)
ypred_val = lasso_model.predict(Xval)

# Passing `ystd` reports the errors in the target's original units.
print(f'Train RMSE: {compute_rmse(ytr, ypred_tr, ystd):.3f}')
print(f'Valid RMSE: {compute_rmse(yval, ypred_val, ystd):.3f}')


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited to depth 10, with a fixed seed.
decison_tree_model = DecisionTreeRegressor(
    max_depth=10,
    random_state=0,
)
decison_tree_model.fit(Xtr, ytr)

# Score on the training and validation splits, in the target's original units.
ypred_tr = decison_tree_model.predict(Xtr)
ypred_val = decison_tree_model.predict(Xval)

print(f'Train RMSE: {compute_rmse(ytr, ypred_tr, ystd):.3f}')
print(f'Valid RMSE: {compute_rmse(yval, ypred_val, ystd):.3f}')


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees of depth 5 with row and column subsampling at 0.8.
xgboost_model = XGBRegressor(
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgboost_model.fit(Xtr, ytr)

# Score on the training and validation splits, in the target's original units.
ypred_tr = xgboost_model.predict(Xtr)
ypred_val = xgboost_model.predict(Xval)

print(f'Train RMSE: {compute_rmse(ytr, ypred_tr, ystd):.3f}')
print(f'Valid RMSE: {compute_rmse(yval, ypred_val, ystd):.3f}')


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 50 bootstrapped trees with a fixed seed.
# `max_features='auto'` was deprecated in scikit-learn 1.1 and removed in 1.3;
# for regressors it meant "consider all features at every split", which is
# written as the fraction 1.0 and behaves the same on older versions.
random_forest_model = RandomForestRegressor(
    n_estimators=50,
    random_state=50,
    bootstrap=True,
    max_depth=80,
    max_features=1.0,
    min_samples_leaf=4,
    min_samples_split=10,
)

random_forest_model.fit(Xtr, ytr)

# Score on the training and validation splits, in the target's original units.
ypred_tr = random_forest_model.predict(Xtr)
ypred_val = random_forest_model.predict(Xval)

print(f'Train RMSE: {compute_rmse(ytr, ypred_tr, ystd):.3f}')
print(f'Valid RMSE: {compute_rmse(yval, ypred_val, ystd):.3f}')


# Parameter Optimisation

Irrespective of your choice, it is highly likely that your model will have one or more parameters that require tuning. There are several techniques for carrying out such a procedure, including cross-validation, Bayesian optimisation, and several others. As before, an analysis into which parameter tuning technique best suits your model is expected before proceeding with the optimisation of your model.

The cells below (currently commented out) demonstrate tuning the `max_depth` hyper-parameter of the random forest and decision tree models by grid search over random train/validation splits.


In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In [None]:
import numpy as np


def get_cv_idx(n, test_size=0.2, n_splits=2, random_state=None):
    """Generate index arrays for repeated random train/test splits.

    Parameters
    ----------
    n : int
        Number of samples to split.
    test_size : float or int
        A float is the fraction of samples placed in the test part; any other
        value is taken as the absolute number of test samples.
    n_splits : int
        Number of independent random splits to generate.
    random_state : int or None
        Seed for reproducible splits. With ``None`` (the default) the global
        NumPy random state is used, as before.

    Returns
    -------
    (train_idx, test_idx) : tuple of lists
        Two lists of ``n_splits`` index arrays each; the i-th train and test
        arrays together form a permutation of ``range(n)``.

    Raises
    ------
    ValueError
        If ``test_size`` does not leave between 0 and ``n`` training samples.
    """
    # The split point is the same for every split, so compute it once.
    if isinstance(test_size, float):
        train_size = int(n * (1 - test_size))
    else:
        train_size = n - test_size
    if not 0 <= train_size <= n:
        raise ValueError(
            f"test_size={test_size!r} is incompatible with n={n}: "
            f"it would leave {train_size} training samples"
        )

    # `np.random` and `RandomState` both expose `permutation`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train_idx, test_idx = [], []
    for _ in range(n_splits):
        idx = rng.permutation(n)
        train_idx.append(idx[:train_size])
        test_idx.append(idx[train_size:])
    return train_idx, test_idx


In [None]:
# Ten random splits of the training data, each holding out 10000 samples.
train_idx, cv_idx = get_cv_idx(len(Xtr), test_size=10000, n_splits=10)


**Random Forest Optimization**


In [None]:
# param_grid = {
#     "max_depth": [30, 40, 60]
# }

# search_random_forest = GridSearchCV(random_forest_model,
#                       param_grid,
#                       n_jobs=-1,
#                       verbose=1,
#                       cv=zip(train_idx, cv_idx),
#                       scoring='neg_root_mean_squared_error').fit(Xtr, ytr)
# print('Done!')


In [None]:
# print("Best parameters set found on cv set:")
# print(search_random_forest.best_params_)
# print()
# print("Grid scores on cv set:")
# means = search_random_forest.cv_results_["mean_test_score"]
# stds = search_random_forest.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, search_random_forest.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (-mean * ystd, (std * ystd) * 2, params))
# print()
# print("Error on the validation set")
# ypred_val = search_random_forest.predict(Xval)
# print(f'Valid RMSE: {compute_rmse(yval, ypred_val, ystd):.3f}')


**Decision Tree Optimization**


In [None]:
# param_grid = {
#     "max_depth": [20, 40, 70]
# }

# search_decison_tree = GridSearchCV(decison_tree_model,
#                       param_grid,
#                       n_jobs=-1,
#                       verbose=1,
#                       cv=zip(train_idx, cv_idx),
#                       scoring='neg_root_mean_squared_error').fit(Xtr, ytr)
# print('Done!')


In [None]:
# print("Best parameters set found on cv set:")
# print(search_decison_tree.best_params_)
# print()
# print("Grid scores on cv set:")
# means = search_decison_tree.cv_results_["mean_test_score"]
# stds = search_decison_tree.cv_results_["std_test_score"]
# for mean, std, params in zip(means, stds, search_decison_tree.cv_results_["params"]):
#     print("%0.3f (+/-%0.03f) for %r" % (-mean * ystd, (std * ystd) * 2, params))
# print()
# print("Error on the validation set")
# ypred_val = search_decison_tree.predict(Xval)
# print(f'Valid RMSE: {compute_rmse(yval, ypred_val, ystd):.3f}')


# Model Evaluation

Some form of pre-evaluation will inevitably be required in the preceding sections in order to both select an appropriate model and configure its parameters appropriately. In this final section, you may evaluate other aspects of the model such as:

- Assessing the running time of your model;
- Determining whether some aspects can be parallelised;
- Training the model with smaller subsets of the data.
- etc.

Remember, the goal of this challenge is to construct a model for predicting the temperature around the globe.


In [None]:
# Load the test features again from the same file as at the top of the notebook.
test_csv_path = '../input/eurecom-aml-2022-challenge-1/public/test_feat.csv'
df_test = pd.read_csv(test_csv_path, low_memory=True)


In [None]:
# Re-use the training column selection for the test set, minus the target
# column, instead of maintaining a second hand-copied list. The test features
# must match `Xmean` / `Xstd` column for column, so they are derived from the
# same list. Filtering (rather than slicing) keeps this cell safe to re-run.
col_selection = [name for name in col_selection if name != 'fact_temperature']

# Drop the leading 'index' column, as was done for the training features.
test_features = df_test[col_selection].iloc[:, 1:]

# The training data had its missing values replaced by column means; give the
# test features the same treatment using the training feature means. This is
# a no-op when the test features contain no missing values.
training_means = pd.Series(Xmean, index=test_features.columns)
test_features = test_features.fillna(training_means)

# Standardise with the training statistics.
Xte = test_features.values
Xte = (Xte - Xmean) / Xstd


First we retrain our best model on the whole dataset, without holding out a validation split (more data should, in theory, mean better performance).


In [None]:
# Refit the random forest on all standardised training data (training + validation).
random_forest_model.fit(X, y)

In [None]:
# Predictions are made in standardised units; map them back to the original
# temperature scale with the training target's mean and standard deviation.
ypred_te_standardised = random_forest_model.predict(Xte)
ypred_te = ypred_te_standardised * ystd + ymean


# Submission

Your submission is a CSV file containing your final model's predictions on the given test data. This file should contain a header and have the following format:

```
index,fact_temperature
1993574,3.9865149124872303
1993575,18.165092058370533
1993576,16.53315442160854
1993577,8.377598784006866
...
```

A leaderboard for this challenge will be ranked using the root mean squared error between the predicted and the observed temperatures. However, you can use other metrics for regression tasks in your presentation notebook to evaluate many aspects of your model, including quantification of the uncertainty in the predictions.

Below is an example of creating a submission file.


In [None]:
# One row per test sample: its identifier and the predicted temperature.
submission_columns = {
    'index': df_test['index'].values,
    'fact_temperature': ypred_te.squeeze(),
}
submission_df = pd.DataFrame(data=submission_columns)

# Write the predictions under `/kaggle/working` so the file can be downloaded
# later; the frame's own row index is not written.
submission_path = "/kaggle/working/submission.csv"
submission_df.to_csv(submission_path, index=False)


In [None]:
# Check the submission file
! head - 6 "/kaggle/working/submission.csv"
