In [276]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import ExtraTreesRegressor
from utils.funciones import BaseLineRegressor

In [277]:
# Load the raw donor records from the project's data folder
ruta_datos = "./data/datos_donantes.csv"
df = pd.read_csv(ruta_datos)

In [278]:
def modificar_method_pay(row):
    """Return the payment-method code to store for one donation row.

    Rows whose ``donation_type`` is 2, 3 or 4 get code 0; every other row
    keeps its current ``method_pay`` value.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            ``donation_type`` and ``method_pay`` keys.

    Returns:
        0 when ``donation_type`` is 2, 3 or 4, otherwise ``row['method_pay']``.
    """
    tipos_a_cero = (2, 3, 4)
    return 0 if row['donation_type'] in tipos_a_cero else row['method_pay']

# Set method_pay to 0 for donation types 2, 3 and 4; every other row keeps
# its current value. This is the vectorised equivalent of applying
# modificar_method_pay row by row, without one Python call per row.
df['method_pay'] = df['method_pay'].mask(df['donation_type'].isin([2, 3, 4]), 0)

In [279]:
# Keep only the donations dated strictly before the cutoff
fecha_corte = '2024-04-10'
df = df[df['donation_date'] < fecha_corte]

In [280]:
# Collapse to one row per donation_date: amounts are summed, while the
# remaining columns keep the first value found for that date.
agregaciones = {
    'amount': 'sum',
    'is_partner': 'first',
    'company': 'first',
    'donation_frecuency': 'first',
    'method_pay': 'first',
}
df = df.groupby('donation_date').agg(agregaciones)

In [281]:
# Preview the first rows of the per-date aggregate (indexed by donation_date)
df.head()

Unnamed: 0_level_0,amount,is_partner,company,donation_frecuency,method_pay
donation_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-11-29,9035.86,0,,1,3
2006-12-02,4517.93,0,,0,3
2006-12-20,9035.86,0,,0,1
2006-12-23,1807.17,0,,1,2
2006-12-29,9035.86,0,,1,3


In [282]:
# (rows, columns) of the aggregate: one row per distinct donation_date
df.shape

(2921, 5)

In [283]:
# Expose the donation_date index as a datetime column to merge on later
df['fecha'] = pd.to_datetime(df.index)

# Daily calendar running from the first to the last date present in the index
fecha_inicio, fecha_fin = df.index.min(), df.index.max()
rango_fechas = pd.date_range(fecha_inicio, fecha_fin, freq='D')

In [284]:
# Frame holding every calendar day of the range, one row per day
serie_temporal = pd.DataFrame({'fecha': rango_fechas})

# Left-join the donation data onto the calendar so that days without
# donations are kept as rows (their donation columns come out as NaN)
df = serie_temporal.merge(df, how='left', on='fecha')

In [285]:
df.shape # one row per calendar day in the range, i.e. roughly 365 days * 17 years

(6339, 6)

In [286]:
# Fill missing amounts with 0 -> this column is the target.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['amount'] is chained assignment, which warns on recent pandas versions
# and leaves df unchanged when Copy-on-Write is enabled.
df['amount'] = df['amount'].fillna(0)

In [287]:
# Preview the daily frame after filling the missing amounts
df.head(5)

Unnamed: 0,fecha,amount,is_partner,company,donation_frecuency,method_pay
0,2006-11-29,9035.86,0.0,,1.0,3.0
1,2006-11-30,0.0,,,,
2,2006-12-01,0.0,,,,
3,2006-12-02,4517.93,0.0,,0.0,3.0
4,2006-12-03,0.0,,,,


In [288]:
# Make sure 'fecha' is datetime, then derive the calendar features from it
fechas = pd.to_datetime(df['fecha'])
df['fecha'] = fechas
df['año'], df['mes'], df['dia'] = fechas.dt.year, fechas.dt.month, fechas.dt.day

In [289]:
# Inspect dtypes and non-null counts after adding the date features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6339 entries, 0 to 6338
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fecha               6339 non-null   datetime64[ns]
 1   amount              6339 non-null   float64       
 2   is_partner          2921 non-null   float64       
 3   company             446 non-null    object        
 4   donation_frecuency  2921 non-null   float64       
 5   method_pay          2921 non-null   float64       
 6   año                 6339 non-null   int32         
 7   mes                 6339 non-null   int32         
 8   dia                 6339 non-null   int32         
dtypes: datetime64[ns](1), float64(4), int32(3), object(1)
memory usage: 371.6+ KB


In [290]:
# Discard the 'company' column; reassigning instead of mutating in place
df = df.drop(columns=['company'])

In [291]:
# Distinct payment-method codes currently present (NaN included)
df['method_pay'].unique()

array([ 3., nan,  1.,  2.,  0.])

In [292]:
# Replace the remaining NaNs with per-column sentinel values.
# A single DataFrame-level fillna, assigned back, replaces the original
# df.<column>.fillna(..., inplace=True) calls: those are chained assignment,
# which warns on recent pandas versions and leaves df unchanged when
# Copy-on-Write is enabled.
# NOTE(review): 0 is also the code modificar_method_pay assigns for
# donation_type 2/3/4, so missing method_pay values share that code here —
# confirm this is intended.
df = df.fillna({'method_pay': 0, 'donation_frecuency': -1, 'is_partner': -1})

In [293]:
# One subset per payment-method code (0 to 3), in code order
df_0, df_1, df_2, df_3 = (df[df['method_pay'] == codigo] for codigo in range(4))

In [294]:
# Peek at the first rows of the method_pay == 1 subset
df_1.head(2)

Unnamed: 0,fecha,amount,is_partner,donation_frecuency,method_pay,año,mes,dia
21,2006-12-20,9035.86,0.0,0.0,1.0,2006,12,20
49,2007-01-17,1807.17,0.0,0.0,1.0,2007,1,17


In [295]:
# Row count, dtypes and non-null counts of the method_pay == 1 subset
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 993 entries, 21 to 6331
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fecha               993 non-null    datetime64[ns]
 1   amount              993 non-null    float64       
 2   is_partner          993 non-null    float64       
 3   donation_frecuency  993 non-null    float64       
 4   method_pay          993 non-null    float64       
 5   año                 993 non-null    int32         
 6   mes                 993 non-null    int32         
 7   dia                 993 non-null    int32         
dtypes: datetime64[ns](1), float64(4), int32(3)
memory usage: 58.2 KB


In [296]:
# Features (X) and target (y) for the method_pay == 1 subset
columnas_modelo = ['año', 'mes', 'dia', 'donation_frecuency', 'is_partner']
X = df_1[columnas_modelo]
y = df_1['amount']

# Hold out 20% of the rows for testing (seeded shuffle, reproducible).
# NOTE(review): the rows are dated observations and the split is random —
# confirm that a chronological split is not required for this evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [297]:
# Create the ExtraTreesRegressor model: 100 trees, fixed seed.
# (The previous comment named DecisionTreeRegressor, which is not what this
# cell builds.)
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor(n_estimators=100, random_state=42)

In [298]:
# Train the model on the training split
model.fit(X=X_train, y=y_train)

In [299]:
# Predict the amounts for the held-out test rows
predictions = model.predict(X=X_test)

In [300]:
# Score the test predictions: RMSE (square root of the MSE) and R^2
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
print("Coeficiente de determinación R^2:", r2)

RMSE: 2257.7572278809894
Coeficiente de determinación R^2: 0.6334341203545186


*****

In [301]:
# Peek at the first rows of the method_pay == 2 subset
df_2.head(2)

Unnamed: 0,fecha,amount,is_partner,donation_frecuency,method_pay,año,mes,dia
24,2006-12-23,1807.17,0.0,1.0,2.0,2006,12,23
41,2007-01-09,9035.86,0.0,0.0,2.0,2007,1,9


In [302]:
# Features (X) and target (y) for the method_pay == 2 subset
columnas_modelo = ['año', 'mes', 'dia', 'donation_frecuency', 'is_partner']
X = df_2[columnas_modelo]
y = df_2['amount']

# Hold out 20% of the rows for testing (seeded shuffle, reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [303]:
# Create the ExtraTreesRegressor for the method_pay == 2 subset
from sklearn.ensemble import ExtraTreesRegressor
model2 = ExtraTreesRegressor(
    n_estimators=100,
    random_state=42,
)

In [304]:
# Train the model on the training split
model2.fit(X=X_train, y=y_train)

In [305]:
# Predict the amounts for the held-out test rows
predictions = model2.predict(X=X_test)

In [306]:
# Score the test predictions: RMSE (square root of the MSE) and R^2
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
print("Coeficiente de determinación R^2:", r2)

RMSE: 2366.844078307995
Coeficiente de determinación R^2: 0.6585326563577573


*****

In [307]:
# Features (X) and target (y) for the method_pay == 3 subset
columnas_modelo = ['año', 'mes', 'dia', 'donation_frecuency', 'is_partner']
X = df_3[columnas_modelo]
y = df_3['amount']

# Hold out 20% of the rows for testing (seeded shuffle, reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [308]:
# Create the ExtraTreesRegressor for the method_pay == 3 subset
from sklearn.ensemble import ExtraTreesRegressor
model3 = ExtraTreesRegressor(
    n_estimators=100,
    random_state=42,
)

In [309]:
# Train the model on the training split
model3.fit(X=X_train, y=y_train)

In [310]:
# Predict the amounts for the held-out test rows
predictions = model3.predict(X=X_test)

In [311]:
# Score the test predictions: RMSE (square root of the MSE) and R^2
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
print("Coeficiente de determinación R^2:", r2)

RMSE: 1969.5714093476824
Coeficiente de determinación R^2: 0.7064944105267139
