In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor

from datetime import date

from xgboost import XGBRegressor

from google.colab import drive

# Data

In [2]:
drive.mount('/content/drive')

# Define file paths
folder_path = "/content/drive/MyDrive/PRAMA/"
train_path = folder_path + "Train.csv"
test_path = folder_path + "Test.csv"
submission_path = folder_path + "SampleSubmission.csv"
# Load datasets
data = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submission = pd.read_csv(submission_path)


Mounted at /content/drive


In [3]:
data = pd.read_csv("Train.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Train.csv'

# Understand data

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
data.isnull().sum(axis=1)

In [None]:
sns.histplot(data['target'])

In [None]:
data.info()

# Change data (must run all)

In [None]:
data.drop(columns=["L3_CH4_CH4_column_volume_mixing_ratio_dry_air","L3_CH4_aerosol_height","L3_CH4_aerosol_optical_depth","L3_CH4_sensor_azimuth_angle","L3_CH4_sensor_zenith_angle","L3_CH4_solar_azimuth_angle","L3_CH4_solar_zenith_angle"], inplace=True)
test.drop(columns=["L3_CH4_CH4_column_volume_mixing_ratio_dry_air","L3_CH4_aerosol_height","L3_CH4_aerosol_optical_depth","L3_CH4_sensor_azimuth_angle","L3_CH4_sensor_zenith_angle","L3_CH4_solar_azimuth_angle","L3_CH4_solar_zenith_angle"], inplace=True)

In [None]:
data.drop(columns=["target_min","target_max","target_variance","target_count"], inplace=True)

In [None]:
num_filas_completas = data.dropna().shape[0]

print(f"Número de filas sin valores faltantes train: {num_filas_completas}")

data.dropna(inplace=True)

In [None]:
data["Date"] = pd.to_datetime(data["Date"])
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

print("Different values 'Year':", data["Year"].nunique())
print("Different values 'Month':", data["Month"].nunique())
print("Different values 'Day':", data["Day"].nunique())

data.drop(columns=["Date","Place_ID X Date", "Date","Year","Month","Day"], inplace=True)

In [None]:
count_categories = data['Place_ID'].value_counts()

print(count_categories)

#print("Different Places:", data["Place_ID"].unique())

In [None]:
place_id_to_mean_target = data.groupby('Place_ID')['target'].mean().to_dict()

In [None]:
target_mean = data.groupby('Place_ID')['target'].mean()

data['Place_ID_target_encoded'] = data['Place_ID'].map(target_mean)

data.drop(columns='Place_ID', inplace=True)

In [None]:
test.head()

In [None]:
data.head()


In [None]:
print("Different values 'place id target encoded':", data["Place_ID_target_encoded"].nunique())

In [None]:
print(data.columns)


# Correlation

In [None]:
corr = data.corr()
top_corr_features = corr.index

plt.figure(figsize=(40,40))
g=sns.heatmap(data[top_corr_features].corr(),annot=True,cmap="RdYlGn")

# Feature selection

regression
el modelo eleje las variables
backward forward stepwise

random forest

random busting

In [None]:
var_y = 'target'
var_X_known = [col for col in data.columns if col != 'target']
var_X_new = [col for col in data.columns if col != 'Place_ID_target_encoded' and col != 'target']


In [None]:
#MODEL FOR KNOWN CITIES
X = data[var_X_known]
y = data[var_y]

X = sm.add_constant(X)

linreg_model = sm.OLS(y, X)
model_known = linreg_model.fit()

print("Results for model:")
print(model_known.summary())

In [None]:
#MODEL FOR NEW CITIES
X = data[var_X_new]
y = data[var_y]

X = sm.add_constant(X)

linreg_model = sm.OLS(y, X)
model_known = linreg_model.fit()

print("Results for model:")
print(model_known.summary())

In [None]:
def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    included = list(initial_list)
    while True:
        changed = False

        # Forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print(f'Agregado: {best_feature} con p-valor {best_pval:.6f}')

        # Backward step
        model = sm.OLS(y, sm.add_constant(X[included])).fit()
        pvalues = model.pvalues.iloc[1:]  # sin el intercept
        worst_pval = pvalues.max()
        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True
            if verbose:
                print(f'Removido: {worst_feature} con p-valor {worst_pval:.6f}')

        if not changed:
            break

    return included

In [None]:
X_known = data[var_X_known]
y = data["target"]

selected_vars_known = stepwise_selection(X_known, y)

X_train_selected = X_known[selected_vars_known]


print("\nVariables seleccionadas:")
print(selected_vars_known)

In [None]:
X_new = data[var_X_new]
y = data["target"]

selected_vars_new = stepwise_selection(X_new, y)

print("\nVariables seleccionadas:")
print(selected_vars_new)

In [None]:
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train_selected, y)

In [None]:
mape_error_synt['Random forest'] = mape(Y_prediction8,Ytest)
rmse_error_synt['Random forest'] = rmse(Y_prediction8,Ytest)
print("RMSE={} \nMAPE={}".format(rmse(Y_prediction8,Ytest),mape(Y_prediction8,Ytest)))

# Submission

In [None]:
test['Place_ID_target_encoded'] = test['Place_ID'].map(place_id_to_mean_target)

In [None]:
known_test = test[test['Place_ID_target_encoded'].notna()].copy()
new_test = test[test['Place_ID_target_encoded'].isna()].copy()

In [None]:
#Predction for known cities
X_known_test = known_test[selected_vars_known.columns]
y_pred_known = model.predict(X_known_test)

In [None]:
X_test_selected = test[selected_vars_known]

In [None]:
y_pred_test = rf_model.predict(X_test_selected)

submission["target"] = y_pred_test
submission.to_csv("submission.csv", index=False)
