## Modificacion de los datos

In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [38]:
df_usuarios = pd.read_csv('../data/raw/megaline_users.csv')
df_calls = pd.read_csv('../data/raw/megaline_calls.csv')
df_messages = pd.read_csv('../data/raw/megaline_messages.csv')
df_internet = pd.read_csv('../data/raw/megaline_internet.csv')
df_plans = pd.read_csv('../data/raw/megaline_plans.csv')

Manejo de formatos:

In [39]:
# Convertir reg_date y churn_date en df_usuarios a tipo datetime
df_usuarios['reg_date'] = pd.to_datetime(df_usuarios['reg_date'], errors='coerce')
df_usuarios['churn_date'] = pd.to_datetime(df_usuarios['churn_date'], errors='coerce')

# Convertir call_date en df_llamadas a tipo datetime
df_calls['call_date'] = pd.to_datetime(df_calls['call_date'], errors='coerce')

# Convertir message_date en df_mensajes a tipo datetime
df_messages['message_date'] = pd.to_datetime(df_messages['message_date'], errors='coerce')

# Convertir session_date en df_internet a tipo datetime
df_internet['session_date'] = pd.to_datetime(df_internet['session_date'], errors='coerce')

print("Tipos de datos después de conversión:\n")
print("USUARIOS:\n", df_usuarios.dtypes)
print("\nLLAMADAS:\n", df_calls.dtypes)
print("\nMENSAJES:\n", df_messages.dtypes)
print("\nINTERNET:\n", df_internet.dtypes)
print("\nPLANES:\n", df_plans.dtypes)

Tipos de datos después de conversión:

USUARIOS:
 user_id                int64
first_name            object
last_name             object
age                    int64
city                  object
reg_date      datetime64[ns]
plan                  object
churn_date    datetime64[ns]
dtype: object

LLAMADAS:
 id                   object
user_id               int64
call_date    datetime64[ns]
duration            float64
dtype: object

MENSAJES:
 id                      object
user_id                  int64
message_date    datetime64[ns]
dtype: object

INTERNET:
 id                      object
user_id                  int64
session_date    datetime64[ns]
mb_used                float64
dtype: object

PLANES:
 messages_included          int64
mb_per_month_included      int64
minutes_included           int64
usd_monthly_pay            int64
usd_per_gb                 int64
usd_per_message          float64
usd_per_minute           float64
plan_name                 object
dtype: object


In [None]:

# Duración total por usuario
agg_calls = df_calls.groupby('user_id')['duration'].sum().reset_index()
agg_calls.rename(columns={'duration': 'total_minutes'}, inplace=True)

# Número de mensajes por usuario
agg_msgs = df_messages.groupby('user_id').size().reset_index(name='total_messages')

# MB usados por usuario
agg_internet = df_internet.groupby('user_id')['mb_used'].sum().reset_index()
agg_internet.rename(columns={'mb_used': 'total_mb'}, inplace=True)

Unir:

In [None]:
df_merged = df_usuarios.copy()

# Unir llamadas
df_merged = pd.merge(df_merged, agg_calls, on='user_id', how='left')

# Unir mensajes
df_merged = pd.merge(df_merged, agg_msgs, on='user_id', how='left')

# Unir internet
df_merged = pd.merge(df_merged, agg_internet, on='user_id', how='left')

# Rellenar con 0 en caso de usuarios que no tengan registros de llamadas/mensajes/internet
df_merged['total_minutes'].fillna(0, inplace=True)
df_merged['total_messages'].fillna(0, inplace=True)
df_merged['total_mb'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['total_minutes'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['total_messages'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

In [None]:
# df_usuarios tiene la columna 'plan', que corresponde a 'plan_name' en df_planes
df_merged = pd.merge(df_merged, df_plans, left_on='plan', right_on='plan_name', how='left')

In [None]:
print("\nInformación final del DataFrame unificado:")
print(df_merged.info())


Información final del DataFrame unificado:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   user_id                500 non-null    int64         
 1   first_name             500 non-null    object        
 2   last_name              500 non-null    object        
 3   age                    500 non-null    int64         
 4   city                   500 non-null    object        
 5   reg_date               500 non-null    datetime64[ns]
 6   plan                   500 non-null    object        
 7   churn_date             34 non-null     datetime64[ns]
 8   total_minutes          500 non-null    float64       
 9   total_messages         500 non-null    float64       
 10  total_mb               500 non-null    float64       
 11  messages_included      500 non-null    int64         
 12  mb_per_month_include

In [44]:
display(df_merged.head())

Unnamed: 0,user_id,first_name,last_name,age,city,reg_date,plan,churn_date,total_minutes,total_messages,total_mb,messages_included,mb_per_month_included,minutes_included,usd_monthly_pay,usd_per_gb,usd_per_message,usd_per_minute,plan_name
0,1000,Anamaria,Bauer,45,"Atlanta-Sandy Springs-Roswell, GA MSA",2018-12-24,ultimate,NaT,116.83,11.0,1901.47,1000,30720,3000,70,7,0.01,0.01,ultimate
1,1001,Mickey,Wilkerson,28,"Seattle-Tacoma-Bellevue, WA MSA",2018-08-13,surf,NaT,1640.46,207.0,80437.94,50,15360,500,20,10,0.03,0.03,surf
2,1002,Carlee,Hoffman,36,"Las Vegas-Henderson-Paradise, NV MSA",2018-10-21,surf,NaT,777.13,88.0,40293.33,50,15360,500,20,10,0.03,0.03,surf
3,1003,Reynaldo,Jenkins,52,"Tulsa, OK MSA",2018-01-28,surf,NaT,1041.0,50.0,27044.14,50,15360,500,20,10,0.03,0.03,surf
4,1004,Leonila,Thompson,40,"Seattle-Tacoma-Bellevue, WA MSA",2018-05-23,surf,NaT,2618.95,177.0,156352.81,50,15360,500,20,10,0.03,0.03,surf


In [None]:
# Guarda el resultado como CSV
df_merged.to_csv('../data/processed/df_merged.csv', index=False)
print("\nArchivo guardado en 'data/clean/df_merged.csv'")


Archivo guardado en 'data/clean/df_merged.csv'
