In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.svm import SVC
import joblib


In [2]:
#leemos los datos
data = pd.read_csv("../dataset/data_evaluacion.csv")
data.head(5)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [3]:
#Asiganmos nombres a las columnas sin etiquetas
data.columns = [
    'edad',                # age
    'clase_laboral',       # workclass
    'fnlwgt',              # continuous (final weight)
    'educacion',           # education
    'anios_educacion_dedicados',       # continuous (education-num)
    'estado_civil',        # marital-status
    'ocupacion',           # occupation
    'relacion',            # relationship
    'raza',                # race
    'sexo',                # sex
    'ganancia_capital',    # continuous (capital-gain)
    'perdida_capital',     # continuous (capital-loss)
    'horas_por_semana',    # continuous (hours-per-week)
    'pais_origen',         # native-country
    'ingreso'              # income
]
data.head(5)

Unnamed: 0,edad,clase_laboral,fnlwgt,educacion,anios_educacion_dedicados,estado_civil,ocupacion,relacion,raza,sexo,ganancia_capital,perdida_capital,horas_por_semana,pais_origen,ingreso
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
#verificamos la cantidad de filas y columnas

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48841 entries, 0 to 48840
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   edad                       48841 non-null  int64 
 1   clase_laboral              48841 non-null  object
 2   fnlwgt                     48841 non-null  int64 
 3   educacion                  48841 non-null  object
 4   anios_educacion_dedicados  48841 non-null  int64 
 5   estado_civil               48841 non-null  object
 6   ocupacion                  48841 non-null  object
 7   relacion                   48841 non-null  object
 8   raza                       48841 non-null  object
 9   sexo                       48841 non-null  object
 10  ganancia_capital           48841 non-null  int64 
 11  perdida_capital            48841 non-null  int64 
 12  horas_por_semana           48841 non-null  int64 
 13  pais_origen                48841 non-null  object
 14  ingres

In [5]:
#verificamos datos nulos
data.isnull().sum()

edad                         0
clase_laboral                0
fnlwgt                       0
educacion                    0
anios_educacion_dedicados    0
estado_civil                 0
ocupacion                    0
relacion                     0
raza                         0
sexo                         0
ganancia_capital             0
perdida_capital              0
horas_por_semana             0
pais_origen                  0
ingreso                      0
dtype: int64

In [6]:
#Mínimos y máximos de categorías numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
edad,48841.0,38.643578,13.71065,17.0,28.0,37.0,48.0,90.0
fnlwgt,48841.0,189666.430786,105603.887256,12285.0,117555.0,178147.0,237646.0,1490400.0
anios_educacion_dedicados,48841.0,10.078029,2.570965,1.0,9.0,10.0,12.0,16.0
ganancia_capital,48841.0,1079.045208,7452.0937,0.0,0.0,0.0,0.0,99999.0
perdida_capital,48841.0,87.504105,403.008483,0.0,0.0,0.0,0.0,4356.0
horas_por_semana,48841.0,40.422391,12.391571,1.0,40.0,40.0,45.0,99.0


In [7]:
data.head(5)

Unnamed: 0,edad,clase_laboral,fnlwgt,educacion,anios_educacion_dedicados,estado_civil,ocupacion,relacion,raza,sexo,ganancia_capital,perdida_capital,horas_por_semana,pais_origen,ingreso
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [8]:
categorical = ['clase_laboral','educacion','estado_civil','ocupacion','relacion','raza','sexo','pais_origen']
label_encoder = LabelEncoder()
for col in categorical:
    label_encoder.fit(data[col])
    data[col] = label_encoder.transform(data[col])
    

data.head(5) 

Unnamed: 0,edad,clase_laboral,fnlwgt,educacion,anios_educacion_dedicados,estado_civil,ocupacion,relacion,raza,sexo,ganancia_capital,perdida_capital,horas_por_semana,pais_origen,ingreso
0,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
1,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
2,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
3,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K
4,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,<=50K


In [9]:
#Caracteristicas y variable objetivo
x = data.drop(columns=['ingreso'], axis=1)
y = data['ingreso']


In [None]:
joblib.dump(x.columns, 'x_columns.pkl')

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [11]:
scaler = StandardScaler()

x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train), columns = x.columns)
x_test_scaled = pd.DataFrame(scaler.transform(x_test), columns = x.columns)

In [None]:
svc_linear = SVC(kernel='linear', random_state=1)
svc_linear.fit(x_train, y_train)
y_pred_linear = svc_linear.predict(x_test)
print("Linear")
print('Train Accuracy : %.5f' % svc_linear.score(x_train_scaled, y_train))
print('Test Accuracy : %.5f' % svc_linear.score(x_test_scaled, y_test))
print("Precisión del modelo:", accuracy_score(y_test, y_pred_linear))
print(classification_report(y_test,y_pred_linear))


In [None]:
svc_rbf = SVC(kernel='rbf', random_state=1)
svc_rbf.fit(x_train, y_train)
y_pred_rbf = svc_rbf.predict(x_test)
print("RBF ")
print('Train Accuracy : %.5f' % svc_rbf.score(x_train_scaled, y_train))
print('Test Accuracy : %.5f' % svc_rbf.score(x_test_scaled, y_test))
print("Precisión del modelo:", accuracy_score(y_test, y_pred_rbf))
print(classification_report(y_test,y_pred_rbf))

In [None]:
svc_poly = SVC(kernel='poly', random_state=1)
svc_poly.fit(x_train, y_train)
y_pred_poly = svc_poly.predict(x_test)
print("Polynomial")
print('Train Accuracy : %.5f' % svc_poly.score(x_train_scaled, y_train))
print('Test Accuracy : %.5f' % svc_poly.score(x_test_scaled, y_test))
print("Precisión del modelo:", accuracy_score(y_test, y_pred_poly))
print(classification_report(y_test,y_pred_poly))


In [None]:
svc_sigmoid = SVC(kernel='sigmoid', random_state=1)
svc_sigmoid.fit(x_train, y_train)
y_pred_sigmoid = svc_sigmoid.predict(x_test)
print("Sigmoid ")
print('Train Accuracy : %.5f' % svc_sigmoid.score(x_train_scaled, y_train))
print('Test Accuracy : %.5f' % svc_sigmoid.score(x_test_scaled, y_test))
print("Precisión del modelo:", accuracy_score(y_test, y_pred_sigmoid))
print(classification_report(y_test,y_pred_sigmoid))


Aditimulye. (2021, July 4). Adult Income Dataset | From scratch. https://www.kaggle.com/code/aditimulye/adult-income-dataset-from-scratch