In [22]:
# Modulos y datos

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Configuremos el acceso a la infraestructura de kaggle
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

# Definir un diccionario para almacenar el nombre del usuario y la llave de acceso
api_token_26feb2025 = {"username":"robintux","key":"ca9634802e098162a09e1006da3dbe94"}

import json
with open("/root/.kaggle/kaggle.json", "w") as file:
  json.dump(api_token_26feb2025, file)

# Asignemos unos permisos adecuados al archivo kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

# Dataset
!kaggle datasets download manishkc06/usa-census-income-data

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Dataset URL: https://www.kaggle.com/datasets/manishkc06/usa-census-income-data
License(s): unknown
Downloading usa-census-income-data.zip to /content
  0% 0.00/6.24M [00:00<?, ?B/s]
100% 6.24M/6.24M [00:00<00:00, 196MB/s]


In [23]:
# Descromprimimos el archivo descargado de kaggle
!unzip usa-census-income-data.zip

Archive:  usa-census-income-data.zip
  inflating: Training_set_census.csv  


In [24]:
# Cargamos el dataset en memoria ram
income = pd.read_csv("Training_set_census.csv")

In [25]:
income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 41 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   age                               200000 non-null  int64 
 1   class_of_worker                   200000 non-null  object
 2   industry_code                     200000 non-null  int64 
 3   occupation_code                   200000 non-null  int64 
 4   education                         200000 non-null  object
 5   wage_per_hour                     200000 non-null  int64 
 6   enrolled_in_edu_inst_lastwk       200000 non-null  object
 7   marital_status                    200000 non-null  object
 8   major_industry_code               200000 non-null  object
 9   major_occupation_code             200000 non-null  object
 10  race                              200000 non-null  object
 11  hispanic_origin                   199408 non-null  object
 12  se

In [26]:
# Valores faltantes
income.isnull().sum().sort_values(ascending = False)*100/income.shape[0]

Unnamed: 0,0
migration_msa,33.3365
migration_reg,33.3365
migration_within_reg,33.3365
migration_sunbelt,33.3365
country_father,2.261
country_mother,2.068
country_self,1.156
hispanic_origin,0.296
state_of_previous_residence,0.2385
num_person_Worked_employer,0.0


In [27]:
# Que hacer con los valores faltantes

  # Elimino todas las columnas con valores faltantes

  # Eliminar las columnas que tienen mas del 30% de valores faltantes y utilizar algunas estrategia para
  # rellenar los valores faltantes del resto de columnas



In [28]:
# Implementemos la primera estrategia : Elimino todas las columnas con valores faltantes
mv = income.isnull().sum().sort_values(ascending = False)*100/income.shape[0]
lista_columnas_eliminar1 = list(mv[mv != 0 ].index)
income_estra1 = income.drop(lista_columnas_eliminar1, axis = 1)
income_estra1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 32 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   age                               200000 non-null  int64 
 1   class_of_worker                   200000 non-null  object
 2   industry_code                     200000 non-null  int64 
 3   occupation_code                   200000 non-null  int64 
 4   education                         200000 non-null  object
 5   wage_per_hour                     200000 non-null  int64 
 6   enrolled_in_edu_inst_lastwk       200000 non-null  object
 7   marital_status                    200000 non-null  object
 8   major_industry_code               200000 non-null  object
 9   major_occupation_code             200000 non-null  object
 10  race                              200000 non-null  object
 11  sex                               200000 non-null  object
 12  me

In [29]:
# Implementemos la segunda estrategia para trabajar/procesar los valores faltantes
lista_columnas_eliminar2 = list(mv[mv > 30].index)
income_estra2 = income.drop(lista_columnas_eliminar2, axis = 1)
rellenar_na = income_estra2.isnull().sum().sort_values(ascending= False)
# income_estra2[list(rellenar_na[rellenar_na!= 0].index)].info()

for col in list(rellenar_na[rellenar_na!= 0].index):
  income_estra2[col] = income_estra2[col].fillna(value =income_estra2[col].mode()[0] )

income_estra2.isnull().sum().sum()




0

In [30]:
income_estra2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   age                               200000 non-null  int64 
 1   class_of_worker                   200000 non-null  object
 2   industry_code                     200000 non-null  int64 
 3   occupation_code                   200000 non-null  int64 
 4   education                         200000 non-null  object
 5   wage_per_hour                     200000 non-null  int64 
 6   enrolled_in_edu_inst_lastwk       200000 non-null  object
 7   marital_status                    200000 non-null  object
 8   major_industry_code               200000 non-null  object
 9   major_occupation_code             200000 non-null  object
 10  race                              200000 non-null  object
 11  hispanic_origin                   200000 non-null  object
 12  se

In [31]:
# Transformar la informacion de naturaleza cualitativa a informacion de naturaleza cuantitativa
cols_object = income_estra1.select_dtypes(["object"]).columns
list(cols_object)

['class_of_worker',
 'education',
 'enrolled_in_edu_inst_lastwk',
 'marital_status',
 'major_industry_code',
 'major_occupation_code',
 'race',
 'sex',
 'member_of_labor_union',
 'reason_for_unemployment',
 'full_parttime_employment_stat',
 'tax_filer_status',
 'region_of_previous_residence',
 'd_household_family_stat',
 'd_household_summary',
 'live_1_year_ago',
 'family_members_under_18',
 'citizenship',
 'fill_questionnaire_veteran_admin']

In [32]:
pd.get_dummies(income_estra1["class_of_worker"],dtype= "int")

Unnamed: 0,Federal government,Local government,Never worked,Not in universe,Private,Self-employed-incorporated,Self-employed-not incorporated,State government,Without pay,Federal government.1,Local government.1,Never worked.1,Not in universe.1,Private.1,Self-employed-incorporated.1,Self-employed-not incorporated.1,State government.1,Without pay.1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
199996,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
199997,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
199998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [33]:
# Iniciamos con un dataframe en blanco
data_cuali = pd.DataFrame()

# COnsideremos utilizar solo el 50% de los datos de income_estra1
income_estra1 = income_estra1.sample(frac = 0.5)


for col_obj in list(cols_object):
  data_cuali = pd.concat([data_cuali, pd.get_dummies(income_estra1[col_obj],dtype= "int")], axis = 1)

data_cuali.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 114188 to 4726
Columns: 341 entries,  Federal government to Yes
dtypes: int64(341)
memory usage: 260.9 MB


In [38]:
# de income_estra1 debo remover las variables de tipo object
income_estra1 = income_estra1.drop(list(cols_object), axis = 1)

# juntar/concatenar income_estra1 con data_cuali
income_estra1 = pd.concat([income_estra1, data_cuali], axis = 1)

income_estra1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 114188 to 4726
Columns: 354 entries, age to Yes
dtypes: int64(354)
memory usage: 270.8 MB


In [39]:
# Guardemos en disco duro income_estra1
income_estra1.to_csv("income_estra1.csv")

In [42]:
# Ya podemos construir un primer modelo regresional y un primer modelo
# de tipo arbol de decision

# Definamos las variables independientes y la variable dependiente (income_level)
y = income_estra1.income_level
X = income_estra1.drop(["income_level"], axis = 1)



In [44]:
# Primer Paso : Particionar los datos
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.15)

# Segundo Paso : Instanciamos las clases a modelar
model_base_reg = LogisticRegression(max_iter=10**4)
model_base_tree = DecisionTreeClassifier()

# Tercer Paso : Ajustamos los modelos
model_base_reg.fit(X_train, y_train)
model_base_tree.fit(X_train, y_train)

# Cuarto Paso : Etapa1
y_forecast_base_reg = model_base_reg.predict(X_test)
y_forecast_base_tree = model_base_tree.predict(X_test)

# Cuarto Paso : Etapa2
acc_base_reg = metrics.accuracy_score(y_test, y_forecast_base_reg)
acc_base_tree = metrics.accuracy_score(y_test, y_forecast_base_tree)

# Mostremos estos KPI calculados
print("""
  Modelo de regresion logistica : %f
  Modelo de Arbol de Decision : %f

""" %(acc_base_reg, acc_base_tree))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



  Modelo de regresion logistica : 0.944133
  Modelo de Arbol de Decision : 0.924933




In [None]:
# Analisis de la estabilidad del modelo : regresional

In [None]:
# Analisis de la estabilidad del modelo : Arbol de decision

In [None]:
# MOdificar y analizar el poder predicto de los arboles de decision considerando la
# profundidad


In [None]:
# Tambien construir modelos de regresion y de tipo arbol de decision con el
# dataset income_estra2

# Bosques Aleatorios

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
income_estra2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 37 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   age                               200000 non-null  int64 
 1   class_of_worker                   200000 non-null  object
 2   industry_code                     200000 non-null  int64 
 3   occupation_code                   200000 non-null  int64 
 4   education                         200000 non-null  object
 5   wage_per_hour                     200000 non-null  int64 
 6   enrolled_in_edu_inst_lastwk       200000 non-null  object
 7   marital_status                    200000 non-null  object
 8   major_industry_code               200000 non-null  object
 9   major_occupation_code             200000 non-null  object
 10  race                              200000 non-null  object
 11  hispanic_origin                   200000 non-null  object
 12  se

In [None]:
# Con este dataset construir un modelo de regresion logistica (base)

In [49]:
# Construyamos nuestro primer bosque aleatorio

# Primer paso : Particionamos los datos
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25)

# Segundo Paso : Instanceamos la clase RandomForestClassifier
model_base_RF = RandomForestClassifier()

# Tercer Paso : Ajuste del modelo
model_base_RF.fit(X_train, y_train)

# Cuarto Paso : Etapa1
y_forecast_base_RF = model_base_RF.predict(X_test)

# Cuarto Paso : Etapa2
acc_base_RF = metrics.accuracy_score(y_test, y_forecast_base_RF)
acc_base_RF



0.94484

In [None]:
# Analicemos el modelo de tipo bosque aleatorio
dir(model_base_RF)

In [53]:
model_base_RF.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [58]:
# Implementemos una funcion que nos permita instanciar la clase RandomForestClassifier modificando
# el numero de estimadores (numero de arbolitos)
# Tambien permitamos que la funcion modifique el porcentaje de datos de testeo

def RF_models(num_arboles, ts = 0.25):
  # Primer paso : Particionamos los datos
  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = ts)

  # Segundo Paso : Instanceamos la clase RandomForestClassifier
  model_base_RF = RandomForestClassifier(n_estimators= num_arboles)

  # Tercer Paso : Ajuste del modelo
  model_base_RF.fit(X_train, y_train)

  # Cuarto Paso : Etapa1
  y_forecast_base_RF = model_base_RF.predict(X_test)

  # Cuarto Paso : Etapa2
  acc_base_RF = metrics.accuracy_score(y_test, y_forecast_base_RF)
  return acc_base_RF



In [56]:
RF_models(150)

0.94996

In [59]:
RF_models(250, 0.3)

0.9469

In [62]:
import sklearn.ensemble
dir(sklearn.ensemble)

['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'HistGradientBoostingClassifier',
 'HistGradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_bagging',
 '_base',
 '_forest',
 '_gb',
 '_gradient_boosting',
 '_hist_gradient_boosting',
 '_iforest',
 '_stacking',
 '_voting',
 '_weight_boosting']

In [63]:
help(sklearn.ensemble.ExtraTreesClassifier)

Help on class ExtraTreesClassifier in module sklearn.ensemble._forest:

class ExtraTreesClassifier(ForestClassifier)
 |  ExtraTreesClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None)
 |  
 |  An extra-trees classifier.
 |  
 |  This class implements a meta estimator that fits a number of
 |  randomized decision trees (a.k.a. extra-trees) on various sub-samples
 |  of the dataset and uses averaging to improve the predictive accuracy
 |  and control over-fitting.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : int, default=100
 |      The number of trees in the forest.
 |  
 |      .. versionchanged:: 0.22
 |     

In [64]:
# XgBoost
import xgboost

In [None]:
# Documentacion del modulo xgboost
help(xgboost)

In [None]:
#
dir(xgboost)

# Para problemas de clasificacion : XGBClassifier y XGBRFClassifier

In [67]:
help(xgboost.XGBClassifier)

Help on class XGBClassifier in module xgboost.sklearn:

class XGBClassifier(sklearn.base.ClassifierMixin, XGBModel)
 |  XGBClassifier(*, objective: Union[str, xgboost.sklearn._SklObjWProto, Callable[[Any, Any], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'binary:logistic', **kwargs: Any) -> None
 |  
 |  Implementation of the scikit-learn API for XGBoost classification.
 |  See :doc:`/python/sklearn_estimator` for more information.
 |  
 |  Parameters
 |  ----------
 |  
 |      n_estimators : Optional[int]
 |          Number of boosting rounds.
 |  
 |      max_depth :  typing.Optional[int]
 |  
 |          Maximum tree depth for base learners.
 |  
 |      max_leaves : typing.Optional[int]
 |  
 |          Maximum number of leaves; 0 indicates no limit.
 |  
 |      max_bin : typing.Optional[int]
 |  
 |          If using histogram-based algorithm, maximum number of bins per feature
 |  
 |      grow_policy : typing.Optional[str]
 |  
 |          Tree growing policy.
 |  
 |   