# Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer

# Load Dataset

In [None]:
traindf = pd.read_csv('normalised_df.csv')

In [None]:
X = traindf.drop('emission', axis=1)

In [None]:
y = traindf['emission']

# Split the data into training and testing

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 2)

# Random Forest Model (Regression)

In [None]:
RFR_model = RandomForestRegressor()

In [None]:
min_rmse = float('inf')
depth = 0
for i in range(6, 21, 1):
    RFR_model = RandomForestRegressor(max_depth= i)
    print("depth : "+ str(i))
    RFR_model.fit(X_train, y_train)
    y_pred = RFR_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(rmse)
    if(rmse < min_rmse):
        min_rmse = rmse
        depth = i

print(f"Min RMSE = {str(min_rmse)}, depth = {str(depth)}")


In [None]:
RFR_model = RandomForestRegressor(max_depth= 21, n_estimators= 250)
RFR_model.fit(X_train, y_train)
y_pred = RFR_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

# Catboost Regression

In [None]:
CBR_model = CatBoostRegressor()

In [78]:
CBR_model = CatBoostRegressor(
    iterations=5000,  # You can adjust the number of iterations
    early_stopping_rounds=50,  # Stop if performance doesn't improve for 50 rounds
    eval_metric='RMSE'  # Evaluation metric
)

# Train the model
CBR_model.fit(X_train, y_train, eval_set=(X_test, y_test),verbose= 100)

300:	learn: 59.1658146	test: 57.8013072	best: 57.8013072 (300)	total: 9.29s	remaining: 2m 25s


KeyboardInterrupt: 

In [None]:
y_pred_cbr = CBR_model.predict(X_test)

In [None]:
y_pred

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
rmse

# Test DataSet

In [None]:
features = traindf.columns

In [None]:
features = features.delete(-1)

In [None]:
features

In [None]:
testdf = pd.read_csv('test.csv', index_col= 'ID_LAT_LON_YEAR_WEEK')

In [None]:
testdf.shape

In [None]:
test = testdf[features]

In [None]:
test.shape

In [None]:
imputer = KNNImputer(n_neighbors=3)  # Choose the number of nearest neighbors
test_imputed_data = imputer.fit_transform(test)
test_imputed_df = pd.DataFrame(test_imputed_data, columns=test.columns)

In [None]:
test_imputed_df.head()

In [None]:
test_imputed_df['year'] = test_imputed_df['year'].astype('Int64')
test_imputed_df['week_no'] = test_imputed_df['week_no'].astype('Int64')

In [None]:
test_imputed_df.head()

In [None]:
test_imputed_df.to_csv('test_imputed_df.csv', index = True)

In [None]:
# import folium
# map_center = [test_imputed_df['latitude'].mean(), test_imputed_df['longitude'].mean()]
# m = folium.Map(location=map_center, zoom_start=10)

# # Add markers for each latitude and longitude
# for index, row in test_imputed_df.iterrows():
#     folium.Marker([row['latitude'], row['longitude']]).add_to(m)

# m  # Display the map

In [None]:
for col in test_imputed_df.columns:
    if col == "emission" or col == "latitude" or col == "longitude" or col == "year" or col == "week_no":
        print()
    else:
        print(col)
        test_imputed_df[col] = (test_imputed_df[col] - test_imputed_df[col].mean())/test_imputed_df[col].std()

In [None]:
test_imputed_df

In [None]:
y_predictions = RFR_model.predict(test_imputed_df)

In [None]:
y_predictions

In [None]:
submission = pd.DataFrame({'ID_LAT_LON_YEAR_WEEK': testdf.index, 'emission':y_predictions,})

In [None]:
submission

In [None]:
submission.to_csv('submission1.csv', index= False)