#**ChatGPT para Data Science (DS) e Machine Learning (ML)**
Será usado o ChatGPT para gerar código em Python para aplicarmos o Aprendizado de Máquina na base de dados conhecida como `Census.csv`. O objetivo é fazer a previsão da renda (income) se é menor ou igual, ou maior que U$ 50k/ano.

## Visualizar a base de dados

In [5]:
import pandas as pd

# Carregar o arquivo "census.csv", usando o Pandas
file_path = '/content/census.csv'
data = pd.read_csv(file_path)

# Visualizar o dataframe
data

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Transformar as colunas categóricas em numéricas - LabelEncoder

In [7]:
from sklearn.preprocessing import LabelEncoder

# Obter as colunas categóricas do DataFrame
categorical_cols = data.select_dtypes(include=['object']).columns

# Inicializar o LabelEncoder
label_encoder = LabelEncoder()

# Iterar sobre cada coluna categórica e aplicar o LabelEncoder
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Exibir as primeiras linhas do DataFrame após a transformação
print(data.head())

data

   age  workclass  final-weight  education  education-num  marital-status  \
0   39          7         77516          9             13               4   
1   50          6         83311          9             13               2   
2   38          4        215646         11              9               0   
3   53          4        234721          1              7               2   
4   28          4        338409          9             13               2   

   occupation  relationship  race  sex  capital-gain  capital-loos  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   
2           6             1     4    1             0             0   
3           6             0     2    1             0             0   
4          10             5     2    0             0             0   

   hour-per-week  native-country  income  
0             40              39       0  
1             13              

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,0,38,39,0
32557,40,4,154374,11,9,2,7,0,4,1,0,0,40,39,1
32558,58,4,151910,11,9,6,1,4,4,0,0,0,40,39,0
32559,22,4,201490,11,9,4,1,3,4,1,0,0,20,39,0


## Balancear os dados da classe (income) - SMOTE

In [9]:
# Considerando que 'data' é o DataFrame que contém os dados do arquivo 'census.csv'
unique_incomes = data['income'].value_counts()

print(unique_incomes)

0    24720
1     7841
Name: income, dtype: int64


In [10]:
from imblearn.over_sampling import SMOTE

# Separar os dados em atributos (X) e variável alvo (y)
X = data.drop('income', axis=1)
y = data['income']

# Inicializar o SMOTE
smote = SMOTE(random_state=42)

# Aplicar o SMOTE para gerar dados sintéticos
X_resampled, y_resampled = smote.fit_resample(X, y)

# Mostrar a contagem de classes após a aplicação do SMOTE
print(y_resampled.value_counts())

0    24720
1    24720
Name: income, dtype: int64


In [14]:
X_resampled

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49435,42,4,139001,14,10,2,5,0,3,1,0,0,50,39
49436,45,4,333899,9,13,2,10,2,4,0,0,0,40,39
49437,40,5,57233,9,10,2,4,0,4,1,0,0,47,39
49438,58,1,150390,9,13,4,2,0,4,0,0,0,40,39


In [42]:
y_resampled

0        0
1        0
2        0
3        0
4        0
        ..
49435    1
49436    1
49437    1
49438    1
49439    1
Name: income, Length: 49440, dtype: int64

## Tratar colunas categóricas - OneHotEncorder

In [40]:
# Selecione apenas as colunas categóricas para aplicar o One-Hot Encoding
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

# Aplique o One-Hot Encoding usando get_dummies
data_encoded = pd.get_dummies(X_resampled, columns=categorical_columns)

# Agora, data_encoded contém as colunas categóricas convertidas em representação one-hot

In [41]:
data_encoded

Unnamed: 0,age,final-weight,education-num,capital-gain,capital-loos,hour-per-week,workclass_-2.7914713714064336,workclass_-2.0593365381809634,workclass_-1.3272017049554927,workclass_-0.5950668717300224,...,native-country_-0.6514095191287459,native-country_-0.5153833719772927,native-country_-0.3793572248258395,native-country_-0.2433310776743863,native-country_-0.10730493052293312,native-country_0.028721216628520087,native-country_0.1647473637799733,native-country_0.3007735109314265,native-country_0.4367996580828797,native-country_0.5728258052343329
0,-0.106552,-1.067722,1.035088,0.013732,-0.257101,-0.177589,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.779290,-1.012399,1.035088,-0.196484,-0.257101,-2.532570,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.187083,0.250965,-0.604169,-0.196484,-0.257101,-0.177589,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1.020883,0.433069,-1.423798,-0.196484,-0.257101,-0.177589,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.992394,1.422948,1.035088,-0.196484,-0.257101,-0.177589,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49435,0.135041,-0.480742,-0.194355,-0.196484,-0.257101,0.694627,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
49436,0.376634,1.379893,1.035088,-0.196484,-0.257101,-0.177589,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
49437,-0.026021,-1.261358,-0.194355,-0.196484,-0.257101,0.432962,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
49438,1.423539,-0.372015,1.035088,-0.196484,-0.257101,-0.177589,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


## Transformar a escala nas colunas numéricas - Padronização

In [43]:
from sklearn.preprocessing import StandardScaler

# Crie um objeto StandardScaler
scaler = StandardScaler()

# Selecione apenas as colunas numéricas para a padronização (excluindo a coluna alvo, se houver)
numeric_columns = data_encoded.select_dtypes(include=['number']).columns

# Aplique a padronização às colunas numéricas
data_encoded[numeric_columns] = scaler.fit_transform(data_encoded[numeric_columns])

In [44]:
data_encoded

Unnamed: 0,age,final-weight,education-num,capital-gain,capital-loos,hour-per-week,workclass_-2.7914713714064336,workclass_-2.0593365381809634,workclass_-1.3272017049554927,workclass_-0.5950668717300224,...,native-country_-0.6514095191287459,native-country_-0.5153833719772927,native-country_-0.3793572248258395,native-country_-0.2433310776743863,native-country_-0.10730493052293312,native-country_0.028721216628520087,native-country_0.1647473637799733,native-country_0.3007735109314265,native-country_0.4367996580828797,native-country_0.5728258052343329
0,-0.106552,-1.067722,1.035088,0.013732,-0.257101,-0.177589,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
1,0.779290,-1.012399,1.035088,-0.196484,-0.257101,-2.532570,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
2,-0.187083,0.250965,-0.604169,-0.196484,-0.257101,-0.177589,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
3,1.020883,0.433069,-1.423798,-0.196484,-0.257101,-0.177589,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
4,-0.992394,1.422948,1.035088,-0.196484,-0.257101,-0.177589,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,-2.678545,-0.038978,-0.017992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49435,0.135041,-0.480742,-0.194355,-0.196484,-0.257101,0.694627,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
49436,0.376634,1.379893,1.035088,-0.196484,-0.257101,-0.177589,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
49437,-0.026021,-1.261358,-0.194355,-0.196484,-0.257101,0.432962,-0.209779,-0.192974,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992
49438,1.423539,-0.372015,1.035088,-0.196484,-0.257101,-0.177589,-0.209779,5.182038,-0.286074,-0.183174,...,-0.056801,-0.069989,-0.047436,-0.067766,-0.067916,-0.064369,-0.063892,0.373337,-0.038978,-0.017992


## Treinar e Testar o modelo final

In [54]:
from sklearn.model_selection import train_test_split

# Supondo que X_resampled contém os atributos e y_resampled contém a variável alvo (classe "income")
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Exibindo o tamanho dos conjuntos de treinamento e teste
print("Tamanho do conjunto de treinamento:", len(X_train))
print("Tamanho do conjunto de teste:", len(X_test))

Tamanho do conjunto de treinamento: 34608
Tamanho do conjunto de teste: 14832


In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Lista de classificadores
classifiers = [
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Neural Network', MLPClassifier(max_iter=1000)),  # Aumentando o número máximo de iterações para o MLPClassifier
    ('AdaBoost', AdaBoostClassifier()),
    ('XGBoost', XGBClassifier())
]

# Dicionários para armazenar as métricas de avaliação de cada algoritmo
metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score
}

# Dicionários para armazenar os resultados de cada métrica por classificador
results = {metric: {clf_name: [] for clf_name, _ in classifiers} for metric in metrics}

# Iteração sobre os classificadores
for clf_name, clf in classifiers:
    # Treinamento do modelo
    clf.fit(X_train, y_train)

    # Predição nos dados de teste
    y_pred = clf.predict(X_test)

    # Cálculo das métricas de avaliação
    for metric_name, metric_func in metrics.items():
        results[metric_name][clf_name] = metric_func(y_test, y_pred)

# Apresentação dos resultados
for metric_name, result in results.items():
    print(f"------ {metric_name} ------")
    for clf_name, value in result.items():
        print(f"{clf_name}: {value}")
    print("\n")


------ Accuracy ------
Logistic Regression: 0.8596278317152104
Support Vector Machine: 0.8718985976267529
Decision Tree: 0.8511326860841424
Random Forest: 0.8887540453074434
K-Nearest Neighbors: 0.8400755124056095
Naive Bayes: 0.8118257820927723
Gradient Boosting: 0.8845738942826321
Neural Network: 0.8663025889967637
AdaBoost: 0.8700782092772384
XGBoost: 0.9009573894282632


------ Precision ------
Logistic Regression: 0.8623394185260311
Support Vector Machine: 0.8743413052290231
Decision Tree: 0.8507842874379944
Random Forest: 0.893356880858579
K-Nearest Neighbors: 0.8436610169491525
Naive Bayes: 0.8667192429022083
Gradient Boosting: 0.8969100734377164
Neural Network: 0.881145251396648
AdaBoost: 0.8766393442622951
XGBoost: 0.9120894286502899


------ Recall ------
Logistic Regression: 0.8570084666039511
Support Vector Machine: 0.8696411772611208
Decision Tree: 0.8528423598978632
Random Forest: 0.8837521838462572
K-Nearest Neighbors: 0.8361779330735116
Naive Bayes: 0.7384760112888052
G

## Gerar o modelo final


In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# Divisão dos dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Treinamento do modelo Random Forest com os melhores parâmetros
best_rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)  # Coloque os melhores parâmetros aqui

best_rf.fit(X_train, y_train)

# Salvando o modelo treinado no disco
joblib.dump(best_rf, 'random_forest_model.pkl')


['random_forest_model.pkl']

## Carregar e Simular o uso do modelo final

### Usando o modelo com dados do X_teste

In [59]:
# Carregando o modelo salvo
loaded_rf = joblib.load('random_forest_model.pkl')

# Dados para fazer a previsão (substitua isso pelos dados que você deseja prever)
new_data = X_test.iloc[0].values.reshape(1, -1)  # Exemplo com a primeira linha dos dados de teste

# Fazendo a previsão com o modelo carregado
prediction = loaded_rf.predict(new_data)

# Imprimindo a previsão
print(f'A previsão para os dados fornecidos é: {prediction}')


A previsão para os dados fornecidos é: [0]




### Usando o modelo com dados sugeridos - new_data

In [61]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# Suponhamos que você já tenha treinado e salvo o modelo como 'random_forest_model.pkl'
# Carregar o modelo previamente treinado
loaded_rf = joblib.load('random_forest_model.pkl')

# Dados para fazer a previsão (substitua isso pelos dados que você deseja prever)
# Exemplo de dados para previsão (devem ter o mesmo formato que os dados de treinamento)
# Dados para fazer a previsão
new_data = [[30, 1, 2, 0, 150000, 1, 0, 0, 0, 0, 40, 0, 1, 0]]


# Fazendo a previsão com o modelo carregado
prediction = loaded_rf.predict(new_data)

# Imprimindo a previsão
if prediction == 1:
    print("O modelo prevê que a pessoa terá um alto rendimento.")
else:
    print("O modelo prevê que a pessoa terá um baixo rendimento.")


O modelo prevê que a pessoa terá um alto rendimento.


