In [7]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import r2_score

In [41]:
# Load each raw ECI table from its CSV file under data/.

# Customer table.
eci_customer_data = pd.read_csv(filepath_or_buffer='data/eci_customer_data.csv')

# Product tables.
eci_product_groups = pd.read_csv(filepath_or_buffer='data/eci_product_groups.csv')
eci_product_master = pd.read_csv(filepath_or_buffer='data/eci_product_master.csv')

# Store tables (note: eci_stores_data is read from eci_stores.csv).
eci_stores_clusters = pd.read_csv(filepath_or_buffer='data/eci_stores_clusters.csv')
eci_stores_data = pd.read_csv(filepath_or_buffer='data/eci_stores.csv')

# Transactions table.
eci_transactions = pd.read_csv(filepath_or_buffer='data/eci_transactions.csv')

Luego del Análisis Exploratorio de Datos (`EDA.ipynb`), hacemos ingeniería de atributos.

### 1. eci_customer_data

In [42]:
# Normalize every spelling of 'loyalty_member' in eci_customer_data to a bool.
mapping = {
    'Yes': True,
    'Y': True,
    'True': True,
    '1': True,
    'No': False,
    'N': False,
    'False': False,
    '0': False
}

# Series.map does an exact-key lookup and every key above is a string, so the
# raw values are first converted to stripped strings. This keeps the cell
# idempotent: re-running it on the already-converted column (True/False become
# 'True'/'False') yields the same booleans instead of turning the whole column
# into NaN. Values without an entry in `mapping` (missing values included)
# still come out as NaN.
eci_customer_data['loyalty_member'] = (
    eci_customer_data['loyalty_member']
    .astype(str)
    .str.strip()
    .map(mapping)
)

In [43]:
# Drop the irrelevant contact columns.
# DataFrame.drop returns a new frame rather than modifying the original, so the
# result has to be assigned back; otherwise the columns stay in
# eci_customer_data and only the displayed copy lacks them.
# errors='ignore' lets the cell be re-run after the columns are already gone.
eci_customer_data = eci_customer_data.drop(
    columns=['email_address', 'phone_number'],
    errors='ignore',
)

# Last expression of the cell: display the resulting frame.
eci_customer_data

Unnamed: 0,client_id,city,state,zip_code,education_level,occupation,loyalty_member,loyalty_number,loyalty_points
0,500001,Spokane,WA,99201,High School,,False,,339
1,500002,Pittsburgh,PA,15222,High School,Sales Representative,True,LP525082,406
2,500003,Fort Collins,CO,80526,College Graduate,Analyst,True,950139,14
3,500004,Charleston,SC,29401,Some College,Supervisor,False,,
4,500005,Aurora,CO,80012,Associates,,True,,
...,...,...,...,...,...,...,...,...,...
801918,1521947,Akron,OH,44312,Bachelors,Analyst,True,894546,261
801919,1521948,Columbus,OH,43229,,Administrative Assistant,False,,
801920,1521949,Seattle,WA,98115,Some College,Assistant,True,LP134711,40941
801921,1521950,Norman,OK,73069,HS,Homemaker,True,LP892060,554


### 2. eci_product_groups

In [44]:
# Remove the 'price_group_name' column here; 'price_group_id' is one-hot
# encoded in the cells that follow.

# Lookup from price_group_id to its display name.
# Not referenced by the cells visible in this part of the notebook.
price_group_names = dict(
    BACK_TO_SCHOOL_01='Back to School Essentials',
    CASUAL_BOTTOMS_01='Casual Bottoms',
    COMP_SETUP_01='Computer Setup Essentials',
    FITNESS_COMBO_01='Complete Fitness Package',
    GIFT_UNDER_50_01='Gift Ideas Under $50',
    MOBILE_DEVICES_01='Personal Computing Devices',
    MOVIE_NIGHT_01='Movie Night Snacks',
    WELLNESS_BUNDLE_01='Wellness & Self-Care',
)

eci_product_groups = eci_product_groups.drop('price_group_name', axis=1)

In [None]:
# One-hot encode 'price_group_id'. dummy_na=True also emits a
# 'price_group_id_nan' indicator column for missing values.
# drop_first is left at its default (False), so every level keeps its own
# column; drop_first=True would be the option to avoid multicollinearity.
price_group_ids = pd.get_dummies(
    eci_product_groups['price_group_id'],
    prefix='price_group_id',
    dummy_na=True,
)

# Replace the original column with its indicator columns, appended on the right.
eci_product_groups = pd.concat(
    [eci_product_groups.drop(columns=['price_group_id']), price_group_ids],
    axis=1,
)

In [None]:
# Names of the indicator columns created by the price_group_id one-hot encoding.
encoded_columns = price_group_ids.columns

# Cast all indicator columns to int in one astype call
# (get_dummies may return them as bools).
eci_product_groups = eci_product_groups.astype(dict.fromkeys(encoded_columns, int))

In [50]:
# One-hot encode the 'group_type' column. dummy_na=True also emits a
# 'group_type_nan' indicator column for missing values.
# drop_first is left at its default (False), so every level keeps its own
# column; drop_first=True would be the option to avoid multicollinearity.
group_types = pd.get_dummies(
    eci_product_groups['group_type'],
    prefix='group_type',
    dummy_na=True,
)

# Replace the original column with its indicator columns, appended on the right.
eci_product_groups = pd.concat(
    [eci_product_groups.drop(columns=['group_type']), group_types],
    axis=1,
)

In [51]:
# Names of the indicator columns created by the group_type one-hot encoding.
encoded_columns = group_types.columns

# Cast all indicator columns to int in one astype call
# (get_dummies may return them as bools).
eci_product_groups = eci_product_groups.astype(dict.fromkeys(encoded_columns, int))

In [53]:
# Preview the first five rows to check the encoded column layout.
eci_product_groups.head(n=5)

Unnamed: 0,sku,product_name,price_group_id_BACK_TO_SCHOOL_01,price_group_id_CASUAL_BOTTOMS_01,price_group_id_COMP_SETUP_01,price_group_id_FITNESS_COMBO_01,price_group_id_GIFT_UNDER_50_01,price_group_id_MOBILE_DEVICES_01,price_group_id_MOVIE_NIGHT_01,price_group_id_WELLNESS_BUNDLE_01,price_group_id_nan,group_type_Bundle,group_type_Lifestyle,group_type_Promotional,group_type_Seasonal,group_type_Substitute,group_type_nan
0,BOOEDTE001,PageTurn Essential Textbooks,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,BOOEDTE002,Bookworm Advanced Textbooks,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,BOOEDTE004,PageTurn Essential Textbooks,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,CLOMESH002,DressRight Shirts Collection,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,CLOMESH005,StyleX Relaxed Shirts,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [None]:
# Disabled submission export (kept commented out).
# NOTE(review): `submission` and `prediccion` are not defined in the cells
# visible here — confirm they exist before re-enabling this cell.
# NOTE(review): the column label 'TOTAL_SALES:' has a colon inside the string,
# which looks like a typo — verify against the expected submission header.
# results = pd.DataFrame({'STORE_SUBGROUP_DATE_ID': submission['Unnamed: 0'], 'TOTAL_SALES:': prediccion})
# results.to_csv('submits/submit1.csv', index=False)