In [370]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [371]:
# Load the donor records from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer="./data/datos_donantes.csv")

In [372]:
def modificar_method_pay(row):
    """Return the 'method_pay' value to store for a single row.

    Rows whose 'donation_type' is 2, 3 or 4 get 0; every other row keeps
    its current 'method_pay' value.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'donation_type' and 'method_pay'
        (for example a DataFrame row passed by ``DataFrame.apply``).

    Returns
    -------
    The integer 0, or the row's existing 'method_pay' value.
    """
    tipos_sin_metodo = [2, 3, 4]
    return 0 if row['donation_type'] in tipos_sin_metodo else row['method_pay']

# Recompute 'method_pay' row by row with modificar_method_pay.
df['method_pay'] = df.apply(modificar_method_pay, axis='columns')

In [373]:
# Keep only rows whose 'donation_date' compares below the cutoff '2024-04-10'.
# NOTE(review): this is a plain comparison against a string; it orders dates
# correctly only if 'donation_date' holds ISO-formatted values — confirm.
df = df.loc[df['donation_date'] < '2024-04-10', :]

In [374]:
# Collapse the frame to one row per 'donation_date' by summing every column.
# NOTE(review): sum() is applied to all remaining columns, not just 'amount';
# columns such as 'method_pay' look like category codes, so their sums may not
# be meaningful on dates with several rows — confirm this is intended.
df = (
    df
    .groupby(by='donation_date')
    .sum()
)

In [375]:
# Preview the first rows of the frame after grouping by 'donation_date'.
df.head()

Unnamed: 0_level_0,name,email,date,is_partner,company,role_company,donation_frecuency,suscription_status,donation_type,amount,method_pay
donation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2006-11-29,Robin Wilkins,robin.wilkins@gmail.com,2006-11-29,0,0,0,1,1,1,9035.86,3
2006-12-02,Karen Howell,karen.howell@gmail.com,2010-04-01,0,0,0,0,0,1,4517.93,3
2006-12-20,Morgan Evans,morgan.evans@gmail.com,2012-07-22,0,0,0,0,0,1,9035.86,1
2006-12-23,Peter Peterson,peter.peterson@gmail.com,2006-12-23,0,0,0,1,1,1,1807.17,2
2006-12-29,Robin Wilkins,robin.wilkins@gmail.com,2006-11-29,0,0,0,1,1,1,9035.86,3


In [376]:
# Expose the index (the grouping key) as a datetime column named 'fecha'.
df = df.assign(fecha=pd.to_datetime(df.index))

In [377]:
# Build a daily calendar spanning the smallest to the largest index value.
fecha_inicio, fecha_fin = df.index.min(), df.index.max()
rango_fechas = pd.date_range(fecha_inicio, fecha_fin, freq='D')

In [378]:
# Create a frame with one row per calendar day in 'rango_fechas'.
serie_temporal = pd.DataFrame({'fecha': rango_fechas})

In [379]:
# Left-join the donation data onto the calendar so every day is kept, with
# missing values on days that have no matching 'fecha' in df.
serie_temporal = serie_temporal.merge(df, how='left', on='fecha')

In [380]:
# Replace missing 'amount' values (calendar days with no matching row after the
# left merge) with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection: an inplace call on a column
# selection raises a FutureWarning in pandas 2.x and leaves the parent frame
# unchanged when Copy-on-Write is enabled.
serie_temporal['amount'] = serie_temporal['amount'].fillna(0)

In [381]:
# Make sure 'fecha' is datetime-typed, then derive year / month / day columns
# from it. String keys are used so the column names are exactly as written.
serie_temporal = serie_temporal.assign(
    fecha=lambda tabla: pd.to_datetime(tabla['fecha'])
)
serie_temporal = serie_temporal.assign(**{
    'año': lambda tabla: tabla['fecha'].dt.year,
    'mes': lambda tabla: tabla['fecha'].dt.month,
    'dia': lambda tabla: tabla['fecha'].dt.day,
})

In [382]:
# Inspect column dtypes and non-null counts of the merged daily frame.
serie_temporal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6339 entries, 0 to 6338
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fecha               6339 non-null   datetime64[ns]
 1   name                2921 non-null   object        
 2   email               2921 non-null   object        
 3   date                2921 non-null   object        
 4   is_partner          2921 non-null   float64       
 5   company             2921 non-null   object        
 6   role_company        2921 non-null   object        
 7   donation_frecuency  2921 non-null   float64       
 8   suscription_status  2921 non-null   float64       
 9   donation_type       2921 non-null   float64       
 10  amount              6339 non-null   float64       
 11  method_pay          2921 non-null   float64       
 12  año                 6339 non-null   int32         
 13  mes                 6339 non-null   int32       

In [383]:
# Discard the columns that are not used as model inputs or target.
serie_temporal = serie_temporal.drop(
    columns=[
        'name',
        'fecha',
        'email',
        'date',
        'company',
        'role_company',
        'donation_frecuency',
        'is_partner',
        'suscription_status',
        'donation_type',
    ]
)

In [384]:
# Replace every remaining missing value in the frame with 0.
serie_temporal = serie_temporal.fillna(0)

In [385]:
# Separate the feature matrix (X) from the target variable (y).
X = serie_temporal.loc[:, ['año', 'mes', 'dia', 'method_pay']]
y = serie_temporal.loc[:, 'amount']

In [386]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable.
# NOTE(review): train_test_split shuffles rows by default, so test days are
# interleaved with training days — confirm this is intended for a daily series.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [387]:
# from sklearn.ensemble import ExtraTreesRegressor
# from sklearn.model_selection import RandomizedSearchCV

# # Definir la cuadrícula de hiperparámetros
# param_grid = {
#     'n_estimators': [100, 150, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt'],
#     'bootstrap': [True, False]
# }

# # Inicializar el ExtraTreesRegressor
# etr = ExtraTreesRegressor()

# # Inicializar RandomizedSearchCV
# random_search_etr = RandomizedSearchCV(estimator=etr, param_distributions=param_grid, n_iter=100, cv=5, verbose=1, random_state=42, n_jobs=-1)

# # Ajustar RandomizedSearchCV a los datos
# random_search_etr.fit(X_train, y_train)

# Best hyperparameters found by the search (random_search_etr.best_params_): {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}

In [388]:
# Extra-trees regressor with 500 trees, squared-error criterion and a fixed seed.
# NOTE(review): the search result recorded earlier in this notebook lists
# n_estimators=100, min_samples_split=5 and max_features='sqrt'; those values
# are not used here — confirm which configuration is intended.
model = ExtraTreesRegressor(
    n_estimators=500,
    criterion='squared_error',
    random_state=24,
)

In [389]:
# Fit the regressor on the training split.
model.fit(X_train, y_train)

In [390]:
# Predict the target for the held-out test features.
predictions = model.predict(X_test)

In [391]:
# Score the test-set predictions: mean squared error (reported as its square
# root, the RMSE) and the coefficient of determination R^2.
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("RMSE:", np.sqrt(mse))
print("Coeficiente de determinación R^2:", r2)

RMSE: 1141.3723673227896
Coeficiente de determinación R^2: 0.8693893647184564
