# Loading Data

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from prettytable import PrettyTable
import geopandas as gpd
import matplotlib
import folium
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split ,LeaveOneGroupOut
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor as lgbm
from sklearn.metrics import mean_squared_error ,accuracy_score

In [None]:
# Load the competition train and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e20/train.csv',
        '/kaggle/input/playground-series-s3e20/test.csv',
    )
)

In [None]:
# Preview the first rows of train_data, then the last rows of test_data.
train_preview = train_data.head()
test_preview = test_data.tail()
print(train_preview, test_preview, sep='\n')

In [None]:
# List every column label of the training frame (displayed as the cell output).
train_data.columns

In [None]:
# Print the (rows, columns) shape of train_data, then of test_data.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# train_data info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_data.info()

# test_data info
test_data.info()

In [None]:
# Summary statistics of the training frame (displayed as the cell output).
train_data.describe()

In [None]:
# Summary statistics of the test frame (displayed as the cell output).
test_data.describe()

In [None]:
# Data important info: shape, null cells, non-null cells and total cells per split.
data_table = PrettyTable()
data_table.field_names = ['Label','Data Shape' ,'N_nulls','N_nonNulls','Data Size']
for position, (label, frame) in enumerate((('Train Data', train_data), ('Test Data', test_data))):
    if position:
        # Visual separator row between the two splits.
        data_table.add_row(['--------------','--------------','--------------','--------------','--------------'])
    n_nulls = frame.isna().sum().sum()
    # Nulls are counted per cell, so the non-null count must be taken from the
    # total number of cells (frame.size), not from the number of rows.
    data_table.add_row([label, frame.shape, n_nulls, frame.size - n_nulls, frame.size])
print(data_table)

In [None]:
# Two-column table that holds one descriptive row per measured quantity.
print('Note that`s main columns features that has been metioned in dataset')
columns_description_table = PrettyTable(field_names=['Column Name', 'Description'])
columns_description_table.add_row(['Sulfur dioxide (SO2)',
'Sulfur dioxide (SO2) enters the Earth\'s atmosphere through both natural and anthropogenic processes.\n It plays a role in chemistry on a local and global scale and its impact ranges from short-term pollution to effects on climate.\n Only about 30% of the emitted SO2 comes from natural sources; the majority is of anthropogenic origin.\n SO2 emissions adversely affect human health and air quality.\n SO2 has an effect on climate through radiative forcing, via the formation of sulfate aerosols.\n Volcanic SO2 emissions can also pose a threat to aviation, along with volcanic ash.\n S5P/TROPOMI samples the Earth\'s surface with a revisit time of one day with unprecedented spatial resolution of 3.5 x 7 km which allows the resolution of fine details including the detection of much smaller SO2 plumes.'])
columns_description_table.add_row(['---------------------------------------------------------------------------','---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'])
columns_description_table.add_row(['Carbon monoxide (CO)',
'Carbon monoxide (CO) is an important atmospheric trace gas for understanding tropospheric chemistry.\n In certain urban areas, it is a major atmospheric pollutant.\n Main sources of CO are combustion of fossil fuels, biomass burning, and atmospheric oxidation of methane and other hydrocarbons.\n Whereas fossil fuel combustion is the main source of CO at northern mid-latitudes, the oxidation of isoprene and biomass burning play an important role in the tropics.\n TROPOMI on the Sentinel 5 Precursor (S5P) satellite observes the CO global abundance exploiting clear-sky and cloudy-sky Earth radiance measurements in the 2.3 μm spectral range of the shortwave infrared (SWIR) part of the solar spectrum.\n TROPOMI clear sky observations provide CO total columns with sensitivity to the tropospheric boundary layer.\n For cloudy atmospheres, the column sensitivity changes according to the light path.'])
columns_description_table.add_row(['---------------------------------------------------------------------------','---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'])
columns_description_table.add_row(['Nitrogen oxides (NO2 and NO)',
'Nitrogen oxides (NO2 and NO) are important trace gases in the Earth\'s atmosphere, present in both the troposphere and the stratosphere.\n They enter the atmosphere as a result of anthropogenic activities (notably fossil fuel combustion and biomass burning) and natural processes (wildfires, lightning, and microbiological processes in soils).\n Here, NO2 is used to represent concentrations of collective nitrogen oxides because during daytime, i.e. in the presence of sunlight, a photochemical cycle involving ozone (O3) converts NO into NO2 and vice versa on a timescale of minutes.\n The TROPOMI NO2 processing system is based on the algorithm developments for the DOMINO-2 product and for the EU QA4ECV NO2 reprocessed dataset for OMI, and has been adapted for TROPOMI.\n This retrieval-assimilation-modelling system uses the 3-dimensional global TM5-MP chemistry transport model at a resolution of 1x1 degree as an essential element'])
columns_description_table.add_row(['---------------------------------------------------------------------------','---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'])
columns_description_table.add_row(['Formaldehyde',
'Formaldehyde is an intermediate gas in almost all oxidation chains of non-methane volatile organic compounds (NMVOC), leading eventually to CO2.\n Non-Methane Volatile Organic Compounds (NMVOCs) are, together with NOx, CO and CH4, among the most important precursors of tropospheric O3.\n The major HCHO source in the remote atmosphere is CH4 oxidation. Over the continents, the oxidation of higher NMVOCs emitted from vegetation, fires, traffic and industrial sources results in important and localized enhancements of the HCHO levels.\n The seasonal and inter-annual variations of the formaldehyde distribution are principally related to temperature changes and fire events, but also to changes in anthropogenic activities.\n HCHO concentrations in the boundary layer can be directly related to the release of short-lived hydrocarbons, which mostly cannot be observed directly from space.'])
columns_description_table.add_row(['---------------------------------------------------------------------------','---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'])
columns_description_table.add_row(['UV Aerosol Index (UVAI) , Absorbing Aerosol Index (AAI)',
'The AAI is based on wavelength-dependent changes in Rayleigh scattering in the UV spectral range for a pair of wavelengths.\n The difference between observed and modelled reflectance results in the AAI.\n When the AAI is positive, it indicates the presence of UV-absorbing aerosols like dust and smoke.\n It is useful for tracking the evolution of episodic aerosol plumes from dust outbreaks, volcanic ash, and biomass burning.The wavelengths used have very low ozone absorption, so unlike aerosol optical thickness measurements, AAI can be calculated in the presence of clouds.\n Daily global coverage is therefore possible.'])
columns_description_table.add_row(['---------------------------------------------------------------------------','---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'])
columns_description_table.add_row(['Ozone',
'In the stratosphere, the ozone layer shields the biosphere from dangerous solar ultraviolet radiation.\n In the troposphere, it acts as an efficient cleansing agent, but at high concentration it also becomes harmful to the health of humans, animals, and vegetation.\n Ozone is also an important greenhouse-gas contributor to ongoing climate change.\n Since the discovery of the Antarctic ozone hole in the 1980s and the subsequent Montreal Protocol regulating the production of chlorine-containing ozone-depleting substances, ozone has been routinely monitored from the ground and from space.\nFor this product, there are two algorithms that deliver total ozone: GDP for the near real-time and GODFIT for the offline products.\n GDP is currently being used for generating the operational total ozone products from GOME, SCIAMACHY and GOME-2; while GODFIT is being used in the ESA CCI and the Copernicus C3S projects.'])
columns_description_table.add_row(['---------------------------------------------------------------------------','---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------'])
columns_description_table.add_row(['Cloud',
'The TROPOMI/S5P cloud properties retrieval is based on the OCRA and ROCINN algorithms currently being used in the operational GOME and GOME-2 products.\n OCRA retrieves the cloud fraction using measurements in the UV/VIS spectral regions and ROCINN retrieves the cloud height (pressure) and optical thickness (albedo) using measurements in and around the oxygen A-band at 760 nm.\n Version 3.0 of the algorithms are used, which are based on a more realistic treatment of clouds as optically uniform layers of light-scattering particles.\n Additionally, the cloud parameters are also provided for a cloud model which assumes the cloud to be a Lambertian reflecting boundary.'])
print(columns_description_table)

# EDA

In [None]:
# Visualize data distribution: one panel per column position 1..74, with the
# train histogram in green and the test histogram in orange on the same axes.
fig, axs = plt.subplots(5, 15, figsize=(300, 100))
plt.suptitle(f'Data Numerical Distribution', ha='center', fontweight='bold', fontsize=100)
for column_position, ax in zip(range(1, 5 * 15), axs.flat):
    sns.histplot(train_data.iloc[:, column_position], color='green', kde=True, ax=ax)
    sns.histplot(test_data.iloc[:, column_position], color='orange', kde=True, ax=ax)
plt.tight_layout()


In [None]:
""" 
# data correlation
columns = train_data.columns[1:]
for i in range(15):
  for j in range(15):
    plt.figure(figsize = (15,15))
    train_arr = np.triu(np.ones_like(train_data.corr().iloc[i*5:(i+1)*5,j*5:(j+1)*5], dtype=bool))
    sns.heatmap(train_data.corr().iloc[i*5:(i+1)*5,j*5:(j+1)*5] , mask  = train_arr, annot = True , cmap = 'hot' ,xticklabels = columns[i*5:(i+1)*5] ,
              yticklabels = columns[j*5:(j+1)*5] )
  plt.tight_layout()

"""

In [None]:
# Relation of the target with each feature: one scatter panel per column
# position, skipping the first column and the last one.
fig = plt.figure(figsize=(100, 300))
for slot in range(1, train_data.shape[1] - 1):
    ax = fig.add_subplot(15, 5, slot)
    sns.scatterplot(
        train_data,
        x=train_data.iloc[:, slot],
        y='emission',
        alpha=0.2,
        color='green',
        ax=ax,
    )

In [None]:
# Share of null vs. non-null cells in each split.
fig, axes = plt.subplots(1, 2)
fig.suptitle('Number of Nulls and Not nulls in Data')
for ax, title, frame in zip(axes, ('Train Data', 'Test Data'), (train_data, test_data)):
    n_null = frame.isna().sum().sum()
    ax.set_title(title)
    # The second slice is the count of non-null cells of *this* frame; the
    # previous version used the total cell count (and train rows for test).
    ax.pie(x=[n_null, frame.size - n_null], labels=['null', 'Not_null'], autopct='%0.2f%%')

In [None]:
# Null share of every train_data column that has at least one null.
null_counts = train_data.isna().sum()  # computed once, not once per loop iteration
null_counts = null_counts[null_counts != 0]
columns_have_nulls = list(null_counts.index)
n_rows = train_data.shape[0]

fig = plt.figure(figsize = (200,100))
plt.suptitle('Different Feature with null values\n\n\n' , ha = 'center' , fontsize = 100 , fontweight = 'bold')
# Keep the 14x5 grid, growing it only if more columns than that contain nulls.
grid_rows = max(14, -(-len(columns_have_nulls) // 5))
for slot, (column, n_null) in enumerate(null_counts.items(), start=1):
    ax = fig.add_subplot(grid_rows, 5, slot)
    # Pair each label with its own null count (the old loop indexed the counts
    # by position in columns_have_nulls, reading the wrong column) and compare
    # it against the non-null rows so the percentage is the true null share.
    ax.pie([n_null, n_rows - n_null], autopct = '%0.9f%%', labels = [column, 'Not_null'])
plt.tight_layout()

In [None]:
# Box plot of every column after the first one, to eyeball outliers.
fig  = plt.figure(figsize = (70,20))
plt.suptitle('Data Outliers\n\n\n' , ha = 'center' , fontsize = 100 , fontweight = 'bold')
for slot, column in enumerate(train_data.columns[1:], start=1):
    ax = fig.add_subplot(5, 15, slot)
    sns.boxplot(train_data, x=column, color='red', ax=ax)
# Layout is computed once after all panels exist; doing it inside the loop
# repeated the same work for every panel without changing the final figure.
plt.tight_layout()
plt.subplots_adjust(
    wspace = 3
    ,hspace = 0.6
)

In [None]:
# split year
data_year  = train_data.groupby(['year'])
# Select each year with a boolean mask. Iterating a groupby built from a
# one-element list yields 1-tuple keys such as (2019,) on recent pandas, so
# comparing the key with a bare int never matched and every group fell through
# to the 2021 branch, leaving the 2019 and 2020 frames empty.
data_2019 = train_data[train_data['year'] == 2019]
data_2020 = train_data[train_data['year'] == 2020]
data_2021 = train_data[train_data['year'] == 2021]

# Per-year overview: shape and total number of null cells.
year_table = PrettyTable()
year_table.field_names = ['Year','Data Shape','N_nulls']
year_table.add_row(['2019',data_2019.shape ,data_2019.isna().sum().sum()])
year_table.add_row(['-----------------------------','-----------------------------','-----------------------------'])
year_table.add_row(['2020',data_2020.shape ,data_2020.isna().sum().sum()])
year_table.add_row(['-----------------------------','-----------------------------','-----------------------------'])
year_table.add_row(['2021',data_2021.shape ,data_2021.isna().sum().sum()])
print(year_table)

In [None]:
# Build a "<year>-<week_no>-1" string per row and parse it with %Y-%W-%w,
# i.e. weekday 1 of the given week number.
week_start_text = train_data['year'].astype(str) + '-' + train_data['week_no'].astype(str) + '-1'
train_data['date'] = pd.to_datetime(week_start_text, format='%Y-%W-%w')

In [None]:
# Total emission per date, with calendar year 2020 shaded in red.
train_p = train_data.copy(deep=True)
emission_by_date = train_p.groupby(['date'])['emission'].sum()
ax = emission_by_date.plot(kind='line', figsize=(20, 7), xlabel='Date')

span_start, span_end = pd.Timestamp('2020-01-01'), pd.Timestamp('2021-01-01')
ax.axvspan(span_start, span_end, color='red', alpha=0.1)
for boundary in (span_start, span_end):
    ax.axvline(boundary, linestyle="--", color='red')

ax.text(pd.Timestamp('2020-05-30'), 60000, "Virus", size=20)

ax.set_title('Emission by date', size=15, pad=10)
plt.show()

In [None]:
# Share of all null cells contributed by each year's frame.
yearly_null_totals = [
    frame.isna().sum().sum()
    for frame in (data_2019, data_2020, data_2021)
]
plt.suptitle('Different Data Year Null Values', ha='center', fontsize=20, fontweight='bold')
plt.pie(yearly_null_totals, labels=['2019', '2020', '2021'], autopct='%0.2f%%')

In [None]:
# Highest single emission value observed in each year.
peak_emissions = [
    frame['emission'].max()
    for frame in (data_2019, data_2020, data_2021)
]
plt.suptitle('High Emission of CO2', fontsize=25, fontweight='bold', ha='center')
plt.bar(
    x=['2019', '2020', '2021'],
    height=peak_emissions,
    color='orange',
    label='Number of Emission',
)
plt.grid(True)
plt.legend()

# Feature Engineering & Data Preprocessing

In [None]:
# Treat null values: mean-impute columns with up to 20000 nulls and collect
# columns with more nulls than that for deletion in a later cell.
c_nulls = list(train_data.isna().sum())
columns_delete = []
for column, n_null in zip(train_data.columns, c_nulls):
  if(0 < n_null <= 20000):
     # Assign the filled column back by name. Calling fillna(inplace=True) on
     # the temporary Series returned by .iloc[:, i] is chained assignment and
     # is not guaranteed to write through to the frame (it does not under
     # pandas Copy-on-Write).
     train_data[column] = train_data[column].fillna(train_data[column].mean())
     print(f'Filled with mean :{column}')
  elif(n_null >20000):
    columns_delete.append(column)
print(f'Need to delete :{columns_delete} \nlength : {len(columns_delete)}')

In [None]:
# Same null treatment for the test frame: mean-impute columns with up to 20000
# nulls and record the columns that exceed that count.
c_nulls_t = list(test_data.isna().sum())
columns_delete_t = []
for column, n_null in zip(test_data.columns, c_nulls_t):
  if(0 < n_null <= 20000):
     # Assign the filled column back by name. Calling fillna(inplace=True) on
     # the temporary Series returned by .iloc[:, i] is chained assignment and
     # is not guaranteed to write through to the frame (it does not under
     # pandas Copy-on-Write).
     test_data[column] = test_data[column].fillna(test_data[column].mean())
     print(f'Filled with mean :{column}')
  elif(n_null >20000):
    columns_delete_t.append(column)
print(f'Need to delete :{columns_delete_t} \nlength : {len(columns_delete_t)}')

In [None]:
# Remove every column flagged in columns_delete from both frames, in place.
for flagged_column in columns_delete:
  train_data.drop(columns=flagged_column, inplace=True)
  test_data.drop(columns=flagged_column, inplace=True)
print(f'Train Shape : {train_data.shape}')
print(f'Test Shape : {test_data.shape}')

In [None]:
# Handle the 2020 rows: scale each 2020 emission by the ratio between the
# per-week mean of 2019/2021 and the per-week mean of 2020.
is_2020 = train_data['year'] == 2020
is_2019_or_2021 = train_data['year'].isin((2019,2021))

avg_emission_non_virus = train_data[is_2019_or_2021].groupby('week_no')['emission'].mean()
avg_emission_virus = train_data[is_2020].groupby('week_no')['emission'].mean()
ratios_for_weeks = avg_emission_non_virus/avg_emission_virus

weekly_ratio_2020 = train_data.loc[is_2020, 'week_no'].map(ratios_for_weeks)
train_data.loc[is_2020, 'emission'] = train_data.loc[is_2020, 'emission'] * weekly_ratio_2020

In [None]:
# Total emission per date again, with calendar year 2020 shaded in green.
emission_by_date = train_data.groupby(['date'])['emission'].sum()
ax = emission_by_date.plot(kind='line', figsize=(20, 7), xlabel='Date')

span_start, span_end = pd.Timestamp('2020-01-01'), pd.Timestamp('2021-01-01')
ax.axvspan(span_start, span_end, color='green', alpha=0.1)
for boundary in (span_start, span_end):
    ax.axvline(boundary, linestyle="--", color='green')

ax.text(pd.Timestamp('2020-05-22'), 62000, "No virus", size=17)

ax.set_title('Emission by date', size=15, pad=10)
plt.show()

In [None]:
# Dampen the spike in week 52 of 2020 by raising those emissions to the power 1/1.5.
# NOTE(review): re-running this cell applies the power again to the same rows.
spike_rows = (train_data['year'] == 2020) & (train_data['week_no'] == 52)
train_data.loc[spike_rows, 'emission'] = np.power(train_data.loc[spike_rows, 'emission'], 1/1.5)

In [None]:
# Boolean flag: True when week_no is one of the listed week numbers.
holiday_weeks = [0, 51, 12, 30]
for frame in (train_data, test_data):
    frame['holidays'] = frame['week_no'].isin(holiday_weeks)

In [None]:
# Encode week_no on a circle with a period of 53 so the week feature is cyclic.
for frame in (train_data, test_data):
    week_angle = 2 * np.pi * frame['week_no']/53
    frame['week_sin'] = np.sin(week_angle)
    frame['week_cos'] = np.cos(week_angle)

In [None]:
# Linear mixes of longitude and latitude for 15 and 30 degrees (train frame).
# NOTE(review): the *_y term adds sin * longitude; a strict rotation would
# subtract it. Confirm intent before changing, and keep train/test identical.
for degrees in (15, 30):
    cos_a = np.cos(np.radians(degrees))
    sin_a = np.sin(np.radians(degrees))
    train_data[f'rot_{degrees}_x'] = (cos_a * train_data['longitude']) + (sin_a * train_data['latitude'])
    train_data[f'rot_{degrees}_y'] = (cos_a * train_data['latitude']) + (sin_a * train_data['longitude'])

In [None]:
# Linear mixes of longitude and latitude for 15 and 30 degrees (test frame).
# NOTE(review): the *_y term adds sin * longitude; a strict rotation would
# subtract it. Confirm intent before changing, and keep train/test identical.
for degrees in (15, 30):
    cos_a = np.cos(np.radians(degrees))
    sin_a = np.sin(np.radians(degrees))
    test_data[f'rot_{degrees}_x'] = (cos_a * test_data['longitude']) + (sin_a * test_data['latitude'])
    test_data[f'rot_{degrees}_y'] = (cos_a * test_data['latitude']) + (sin_a * test_data['longitude'])

In [None]:
# One row per distinct (latitude, longitude) pair, wrapped in a GeoDataFrame.
train_coords = train_data.drop_duplicates(subset=['latitude', 'longitude'])
geometry = gpd.points_from_xy(train_coords['longitude'], train_coords['latitude'])
geo_df = gpd.GeoDataFrame(train_coords[["latitude", "longitude"]], geometry=geometry)

# Points are built as (x=longitude, y=latitude); the list stores [y, x] pairs.
geo_df_list = [[point.y, point.x] for point in geo_df.geometry]
all_data_map = folium.Map(prefer_canvas=True)

def rgba_to_hex(color):
    """Convert an (r, g, b, a) tuple of 0-1 floats to a '#rrggbb' string.

    Each colour channel is multiplied by 255 and truncated with int(); the
    alpha component must be present in the 4-tuple but is not used.
    """
    red, green, blue, alpha = color
    channels = (int(channel * 255) for channel in (red, green, blue))
    return "#" + "".join(f"{value:02x}" for value in channels)

# Mean emission per (latitude, longitude), drawn on a folium map and coloured
# on the 'coolwarm' scale by log1p(emission).
temp = train_data.groupby(['latitude', 'longitude']).emission.mean().reset_index()
geometry = gpd.points_from_xy(temp['longitude'], temp['latitude'])

cmap = matplotlib.colormaps['coolwarm']
normalizer = matplotlib.colors.Normalize(
    vmin=np.log1p(temp['emission'].min()),
    vmax=np.log1p(temp['emission'].max()),
)

all_data_map = folium.Map(prefer_canvas=True)
# Points are built as (x=longitude, y=latitude); markers get [y, x] pairs.
geo_df_list = [[point.y, point.x] for point in geometry]

for coordinates, emission in zip(geo_df_list, temp['emission']):
    marker_color = rgba_to_hex(cmap(normalizer(np.log1p(emission))))
    location_marker = folium.CircleMarker(
        location=coordinates,
        radius=1,
        weight=4,
        zoom=10,
        color=marker_color,
    )
    all_data_map.add_child(location_marker)
all_data_map.fit_bounds(all_data_map.get_bounds())

# Hand-picked reference coordinates drawn as larger markers: the first in red,
# the remaining six in blue.
rwanda_center = (-1.9607, 29.9707)
park_biega = (-1.8866, 28.4518)
kirumba = (-0.5658, 29.1714)
massif = (-2.9677, 28.6469)
lake = (-1.9277, 31.4346)
mbarara = (-0.692, 30.602)
muy = (-2.8374, 30.3346)

reference_points = [rwanda_center, park_biega, kirumba, massif, lake, mbarara, muy]
reference_colors = ['red'] + ['blue'] * 6
for color, coors in zip(reference_colors, reference_points):
    all_data_map.add_child(
        folium.CircleMarker(coors, radius=8, color=color, fill_color=color)
    )

all_data_map

In [None]:
# Mean emission per ID joined with that ID's rot_* features, then one scatter
# panel per angle coloured by log1p(emission).
df_mean = train_data.groupby('ID_LAT_LON_YEAR_WEEK', as_index=False)['emission'].mean()
df_mean = df_mean.merge(train_data.drop_duplicates(subset=['ID_LAT_LON_YEAR_WEEK'])[['ID_LAT_LON_YEAR_WEEK', 'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y']], on='ID_LAT_LON_YEAR_WEEK', how='left')
df_mean['log_em'] = np.log1p(df_mean['emission'])

fig = plt.figure(figsize=(15, 6))
for i, col in enumerate(['15', '30']):
    plt.subplot(1,2,i+1)
    plt.title(
        f'{col} degrees rotation', 
        size=20, 
        y=1.05, 
        fontname='Calibri', 
        color='#444444'
    )
    a = sns.scatterplot(
        data=df_mean, 
        x=f'rot_{col}_x', 
        y=f'rot_{col}_y', 
        hue='log_em', 
        palette='coolwarm', 
        s=12
    )
    plt.xticks(size=9)
    plt.yticks(size=9)
    # rot_*_x is built from cos * longitude and rot_*_y from cos * latitude,
    # so x is the longitude-like axis and y the latitude-like axis (the labels
    # were previously swapped).
    plt.xlabel(f'rot_{col}_longitude', labelpad=7, fontsize=11)
    plt.ylabel(f'rot_{col}_latitude', labelpad=7, fontsize=11)

    for j in ['right', 'top']:
        a.spines[j].set_visible(False)
    a.get_legend().remove()
    
plt.show()

In [None]:
# Feature columns fed to every model, in this order.
training_cols = [
    'latitude', 'longitude',
    'year', 'week_sin', 'week_cos', 'holidays',
    'rot_15_x', 'rot_15_y', 'rot_30_x', 'rot_30_y',
]

In [None]:
# Separate the model inputs from the target column.
train_y = train_data['emission']
train_x = train_data[training_cols]

In [None]:
# Normalizing data
# Fit ONE scaler on the training features and reuse it for the test features.
# Fitting a second scaler on the test frame maps the two splits onto different
# scales (a column that is constant in test, such as year, collapses to 0),
# so a model trained on train_x would see inconsistent inputs at predict time.
feature_scaler = MinMaxScaler()
train_x = pd.DataFrame(
    feature_scaler.fit_transform(train_data.loc[:, training_cols]),
    columns=training_cols,
)
x_test = pd.DataFrame(
    feature_scaler.transform(test_data.loc[:, training_cols]),
    columns=training_cols,
)

In [None]:
# Shape of the scaled training feature frame (displayed as the cell output).
train_x.shape

In [None]:
# Scaled features followed by the target as the last column.
train = pd.concat([train_x, train_y], axis='columns')

In [None]:
# Shape of the scaled test feature frame (displayed as the cell output).
x_test.shape

In [None]:
# First rows of the combined features + target frame.
train.head()

# Build Models

In [None]:
# Random forest regressor with a fixed seed.
rfr = RandomForestRegressor(
    n_estimators=250,
    max_depth=100,
    random_state=32,
)

In [None]:
# Single decision tree regressor with a fixed seed.
dtr = DecisionTreeRegressor(
    max_depth=100,
    random_state=48,
)

In [None]:
# XGBoost regressor with a fixed seed.
# NOTE(review): 250 rounds at learning_rate 0.001 is a very small total step;
# confirm this combination is intended.
xgb = XGBRegressor(
    n_estimators=250,
    max_depth=100,
    learning_rate=0.001,
    random_state=26,
)

In [None]:
# LightGBM regressor (imported under the alias lgbm) with a fixed seed.
lgb = lgbm(
    learning_rate=0.01,
    random_state=37,
)

# Train Models & Evaluation

In [None]:
# Leave-one-year-out cross-validation of the random forest: each fold holds
# out all rows of one year and trains on the others.
score_list = []
kf = LeaveOneGroupOut()
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, groups=train.year)):
    # The last column of `train` is the target; everything before it is a feature.
    x_train = train.iloc[idx_tr,:-1]
    y_train = train.iloc[idx_tr]['emission']
    x_val = train.iloc[idx_va,:-1]
    y_val = train.iloc[idx_va]['emission']

    rfr.fit(x_train , y_train)
    y_va_pred = rfr.predict(x_val)
    rmse =np.sqrt(mean_squared_error(y_val, y_va_pred))
    # `train.year` was min-max scaled, so it would print 0.0/0.5/1.0. Read the
    # calendar year from the unscaled frame instead; `train` was built from
    # `train_data` row by row, so positions line up.
    held_out_year = train_data['year'].iloc[idx_va[0]]
    print(f"Fold {fold} year {held_out_year}: rmse = {rmse:.2f}")
    score_list.append(rmse)

# Unweighted mean of the per-fold RMSE values.
rmse = sum(score_list) / len(score_list)
print(f"Overall RMSE: {rmse:.2f}")

In [None]:
# Leave-one-year-out cross-validation of the decision tree: each fold holds
# out all rows of one year and trains on the others.
score_list = []
kf = LeaveOneGroupOut()
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, groups=train.year)):
    # The last column of `train` is the target; everything before it is a feature.
    x_train = train.iloc[idx_tr,:-1]
    y_train = train.iloc[idx_tr]['emission']
    x_val = train.iloc[idx_va,:-1]
    y_val = train.iloc[idx_va]['emission']

    dtr.fit(x_train , y_train)
    y_va_pred = dtr.predict(x_val)
    rmse =np.sqrt(mean_squared_error(y_val, y_va_pred))
    # `train.year` was min-max scaled, so it would print 0.0/0.5/1.0. Read the
    # calendar year from the unscaled frame instead; `train` was built from
    # `train_data` row by row, so positions line up.
    held_out_year = train_data['year'].iloc[idx_va[0]]
    print(f"Fold {fold} year {held_out_year}: rmse = {rmse:.2f}")
    score_list.append(rmse)

# Unweighted mean of the per-fold RMSE values.
rmse = sum(score_list) / len(score_list)
print(f"Overall RMSE: {rmse:.2f}")

In [None]:
# Leave-one-year-out cross-validation of the XGBoost model: each fold holds
# out all rows of one year and trains on the others.
score_list = []
kf = LeaveOneGroupOut()
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, groups=train.year)):
    # The last column of `train` is the target; everything before it is a feature.
    x_train = train.iloc[idx_tr,:-1]
    y_train = train.iloc[idx_tr]['emission']
    x_val = train.iloc[idx_va,:-1]
    y_val = train.iloc[idx_va]['emission']

    xgb.fit(x_train , y_train)
    y_va_pred = xgb.predict(x_val)
    rmse =np.sqrt(mean_squared_error(y_val, y_va_pred))
    # `train.year` was min-max scaled, so it would print 0.0/0.5/1.0. Read the
    # calendar year from the unscaled frame instead; `train` was built from
    # `train_data` row by row, so positions line up.
    held_out_year = train_data['year'].iloc[idx_va[0]]
    print(f"Fold {fold} year {held_out_year}: rmse = {rmse:.2f}")
    score_list.append(rmse)

# Unweighted mean of the per-fold RMSE values.
rmse = sum(score_list) / len(score_list)
print(f"Overall RMSE: {rmse:.2f}")

In [None]:
# Leave-one-year-out cross-validation of the LightGBM model: each fold holds
# out all rows of one year and trains on the others.
score_list = []
kf = LeaveOneGroupOut()
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, groups=train.year)):
    # The last column of `train` is the target; everything before it is a feature.
    x_train = train.iloc[idx_tr,:-1]
    y_train = train.iloc[idx_tr]['emission']
    x_val = train.iloc[idx_va,:-1]
    y_val = train.iloc[idx_va]['emission']

    lgb.fit(x_train , y_train)
    y_va_pred = lgb.predict(x_val)
    rmse =np.sqrt(mean_squared_error(y_val, y_va_pred))
    # `train.year` was min-max scaled, so it would print 0.0/0.5/1.0. Read the
    # calendar year from the unscaled frame instead; `train` was built from
    # `train_data` row by row, so positions line up.
    held_out_year = train_data['year'].iloc[idx_va[0]]
    print(f"Fold {fold} year {held_out_year}: rmse = {rmse:.2f}")
    score_list.append(rmse)

# Unweighted mean of the per-fold RMSE values.
rmse = sum(score_list) / len(score_list)
print(f"Overall RMSE: {rmse:.2f}")

In [None]:
# Load the sample submission file; its 'emission' column is overwritten with predictions later.
output = pd.read_csv('/kaggle/input/playground-series-s3e20/sample_submission.csv')

In [None]:
# Preview the random forest predictions for the test features (displayed as the cell output).
rfr.predict(x_test)

In [None]:
# After the cross-validation loop `rfr` only holds the fit from the final fold,
# which excluded one whole year. Refit on every training row (all columns but
# the trailing target) so the submission model has seen all the data.
rfr.fit(train.iloc[:, :-1], train['emission'])
output['emission'] = rfr.predict(x_test)

In [None]:
# Write the predictions to disk without the index column.
# NOTE(review): the file name is spelled 'ouput.csv' — confirm this is the name the submission step expects.
output.to_csv('ouput.csv',index = False)