In [639]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

In [640]:
df = pd.read_csv("./data/datos_donantes.csv")

In [641]:
def modificar_method_pay(row):
    if row['donation_type'] in [2, 3, 4]:
        return 0
    else:
        return row['method_pay']

# Aplicar la función a la columna 'method_pay'
df['method_pay'] = df.apply(modificar_method_pay, axis=1)

In [642]:
df = df[df['donation_date'] < '2024-04-10']

In [643]:
df.columns

Index(['name', 'email', 'date', 'is_partner', 'company', 'role_company',
       'donation_frecuency', 'suscription_status', 'donation_type', 'amount',
       'method_pay', 'donation_date'],
      dtype='object')

In [644]:
df.head()

Unnamed: 0,name,email,date,is_partner,company,role_company,donation_frecuency,suscription_status,donation_type,amount,method_pay,donation_date
0,Tara Young,tara.young@gmail.com,2017-10-22,1,Nike,CTO,1,1,4,0.0,0,2021-11-14
1,Bobby Acevedo,bobby.acevedo@gmail.com,2019-02-18,0,,,0,0,1,1355.38,3,2022-05-01
2,James Payne,james.payne@gmail.com,2013-03-06,0,,,0,0,1,8132.27,3,2014-10-18
3,Heather Mooney,heather.mooney@gmail.com,2013-06-02,1,Jonhsons,Employee,0,1,2,0.0,0,2018-06-11
4,Joseph Anderson,joseph.anderson@gmail.com,2024-01-21,1,The Oberoi,Employee,1,1,2,0.0,0,2024-01-30


In [645]:
df.shape

(3860, 12)

In [646]:
df = df.groupby('donation_date').agg({'amount': 'sum', 'is_partner': 'first', 'company': 'first', 'donation_frecuency': 'first','method_pay': 'first'})

In [647]:
df['fecha'] = pd.to_datetime(df.index)

In [648]:
fecha_inicio = df.index.min()
fecha_fin = df.index.max()
rango_fechas = pd.date_range(start=fecha_inicio, end=fecha_fin, freq='D')

In [649]:
serie_temporal.shape

(6339, 36)

In [650]:
# Crear un DataFrame con todas las fechas posibles
serie_temporal = pd.DataFrame()
serie_temporal['fecha'] = rango_fechas

In [651]:
# Unir los datos de donaciones al DataFrame de fechas
serie_temporal = pd.merge(serie_temporal, df, on='fecha', how='left')

In [652]:
serie_temporal.shape

(6339, 6)

In [653]:
# Rellenar los valores faltantes con 0
serie_temporal['amount'].fillna(0, inplace=True)

In [654]:
serie_temporal['fecha'] = pd.to_datetime(serie_temporal['fecha'])
serie_temporal['año'] = serie_temporal['fecha'].dt.year
serie_temporal['mes'] = serie_temporal['fecha'].dt.month
serie_temporal['dia'] = serie_temporal['fecha'].dt.day

In [655]:
serie_temporal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6339 entries, 0 to 6338
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fecha               6339 non-null   datetime64[ns]
 1   amount              6339 non-null   float64       
 2   is_partner          2921 non-null   float64       
 3   company             446 non-null    object        
 4   donation_frecuency  2921 non-null   float64       
 5   method_pay          2921 non-null   float64       
 6   año                 6339 non-null   int32         
 7   mes                 6339 non-null   int32         
 8   dia                 6339 non-null   int32         
dtypes: datetime64[ns](1), float64(4), int32(3), object(1)
memory usage: 371.6+ KB


- Tendencia mensual: Puedes crear una característica para cada mes del año, donde cada característica indicaría si la observación ocurrió en ese mes o no. Por ejemplo, para el mes de enero, tendrías una columna que es 1 si la observación ocurrió en enero y 0 en caso contrario.

In [656]:
# for i in range(1, 13):
#     serie_temporal[f'mes_{i}'] = (serie_temporal['mes'] == i).astype(int)
    
for i in range(1, 32):
    serie_temporal[f'dia_{i}'] = (serie_temporal['dia'] == i).astype(int)

serie_temporal['años_transcurridos'] = serie_temporal['año'] - serie_temporal['año'].min()

In [657]:
serie_temporal.head(10)

Unnamed: 0,fecha,amount,is_partner,company,donation_frecuency,method_pay,año,mes,dia,dia_1,...,dia_23,dia_24,dia_25,dia_26,dia_27,dia_28,dia_29,dia_30,dia_31,años_transcurridos
0,2006-11-29,9035.86,0.0,,1.0,3.0,2006,11,29,0,...,0,0,0,0,0,0,1,0,0,0
1,2006-11-30,0.0,,,,,2006,11,30,0,...,0,0,0,0,0,0,0,1,0,0
2,2006-12-01,0.0,,,,,2006,12,1,1,...,0,0,0,0,0,0,0,0,0,0
3,2006-12-02,4517.93,0.0,,0.0,3.0,2006,12,2,0,...,0,0,0,0,0,0,0,0,0,0
4,2006-12-03,0.0,,,,,2006,12,3,0,...,0,0,0,0,0,0,0,0,0,0
5,2006-12-04,0.0,,,,,2006,12,4,0,...,0,0,0,0,0,0,0,0,0,0
6,2006-12-05,0.0,,,,,2006,12,5,0,...,0,0,0,0,0,0,0,0,0,0
7,2006-12-06,0.0,,,,,2006,12,6,0,...,0,0,0,0,0,0,0,0,0,0
8,2006-12-07,0.0,,,,,2006,12,7,0,...,0,0,0,0,0,0,0,0,0,0
9,2006-12-08,0.0,,,,,2006,12,8,0,...,0,0,0,0,0,0,0,0,0,0


In [658]:
serie_temporal = serie_temporal.drop(['fecha', 'company', 'method_pay', 'donation_frecuency', 'is_partner'], axis=1)

In [659]:
serie_temporal.columns

Index(['amount', 'año', 'mes', 'dia', 'dia_1', 'dia_2', 'dia_3', 'dia_4',
       'dia_5', 'dia_6', 'dia_7', 'dia_8', 'dia_9', 'dia_10', 'dia_11',
       'dia_12', 'dia_13', 'dia_14', 'dia_15', 'dia_16', 'dia_17', 'dia_18',
       'dia_19', 'dia_20', 'dia_21', 'dia_22', 'dia_23', 'dia_24', 'dia_25',
       'dia_26', 'dia_27', 'dia_28', 'dia_29', 'dia_30', 'dia_31',
       'años_transcurridos'],
      dtype='object')

In [660]:
# Dividir los datos en características (X) y la variable objetivo (y)
X = serie_temporal.drop(columns='amount')
y = serie_temporal['amount']

In [661]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 random_search_etr.best_params_ : {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}

In [662]:
model = ExtraTreesRegressor(n_estimators=300, random_state=43)
model.fit(X_train, y_train)

In [663]:
# Hacer predicciones en el conjunto de prueba
predictions = model.predict(X_test)

In [664]:
# Calcular el error cuadrático medio
mse = mean_squared_error(y_test, predictions)
print("RMSE:", np.sqrt(mse))
r2 = r2_score(y_test, predictions)
print("Coeficiente de determinación R^2:", r2)

RMSE: 1654.101427266724
Coeficiente de determinación R^2: 0.7256859156720645
