In [246]:
# !pip install optuna

In [247]:
# Importando bibliotecas
# import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

In [248]:
# Carregando dados de treino e teste
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

In [249]:
# Verificando estatísticas descritivas e valores nulos
train_statistics = train_data.describe()
print(train_data.info())
print("Valores nulos no conjunto de treino:\n", train_data.isnull().sum())
print("Valores nulos no conjunto de teste:\n", test_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
Valores nulos no conjunto de treino:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin      

In [250]:
# Criando funções para processamento de dados
def preprocess_data(data):
    # Mapeando valores categóricos para numéricos
    data['IsFemale'] = data['Sex'].map({'female': 1, 'male': 0})
    
    # Preenchendo valores nulos em 'Fare', 'Age' e 'Embarked'
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data['Embarked'] = data['Embarked'].fillna('S')
    
    # Mapeando valores categóricos de 'Embarked' para numéricos
    data['Port'] = data['Embarked'].map({'S': 1, 'C': 2, 'Q': 3})
    
    # Criando a feature 'Child'
    data['Child'] = np.where(data['Age'] < 12, 1, 0)

    # Tamanho da Família
    data['FamilySize'] = data['SibSp'] + data['Parch']

    # Titulo do passageiro
    # title_mapping = {
    #     "Mr": "Mr",
    #     "Miss": "Miss",
    #     "Mrs": "Mrs",
    #     "Master": "Master",
    #     "Dr": "Dr",
    #     "Rev": "Rev",
    #     "Col": "Col",
    #     "Major": "Major",
    #     "Mlle": "Miss",
    #     "Mme": "Mrs"
    # }
    # data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.')
    # data['Title'] = data['Title'].map(title_mapping)
    # data['Title'] = data['Title'].fillna('Other')

    # Comprimento do Nome
    data['NameLength'] = data['Name'].apply(len)

    # Grupos Etários
    bins = [0, 12, 18, 30, 50, 200]
    labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
    data['AgeGroup'] = pd.cut(data['Age'], bins=bins, labels=labels)
    
    # Informação sobre Cabine
    data['CabinInfo'] = data['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

    return data

In [251]:
# Aplicando a função de preprocessamento nos dados de treino e teste
X_train = preprocess_data(train_data.drop(['PassengerId', 'Survived'], axis=1))
X_test = preprocess_data(test_data.drop(['PassengerId'], axis=1))

In [252]:
# Definindo as features categóricas e numéricas
categorical_features = ['AgeGroup']
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'IsFemale', 'Port', 'Child', 'FamilySize', 'NameLength', 'CabinInfo']
# numeric_features = ['CabinInfo', 'NameLength', 'IsFemale', 'Port', 'Child']

In [253]:
# Criando o ColumnTransformer com OneHotEncoder para features categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ])

In [254]:
X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IsFemale,Port,Child,FamilySize,NameLength,AgeGroup,CabinInfo
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,1,0,1,23,Young Adult,0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,2,0,1,51,Adult,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,0,0,22,Young Adult,0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,1,0,1,44,Adult,1
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1,0,0,24,Adult,0


In [255]:
# Aplicando o ColumnTransformer aos dados de treino e teste
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Target variable
y_train = train_data['Survived']

In [256]:
# Criando e treinando modelo de Random Forest
random_forest_model = RandomForestClassifier(criterion='entropy', n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=0)
cross_val_scores = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=10)

# Calculando e imprimindo a média dos scores de validação cruzada
print("Média dos scores de validação cruzada (Random Forest):", np.mean(cross_val_scores))

Média dos scores de validação cruzada (Random Forest): 0.8316729088639201


In [257]:
# Treinando o modelo de Random Forest para calcular a importância das features
random_forest_model.fit(X_train_scaled, y_train)

# Obtendo a importância das features do modelo
feature_importances = random_forest_model.feature_importances_

# Criando um DataFrame para mostrar as importâncias das features
feature_importance_df = pd.DataFrame({
    'Feature': numeric_features + list(preprocessor.transformers_[0][1].get_feature_names_out(categorical_features)),
    'Importance': feature_importances
})

# Ordenando as features por importância em ordem decrescente
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Mostrando as melhores features
print("Melhores features:")
print(feature_importance_df)

# Selecionando as top N melhores features (por exemplo, as 8 melhores features)
top_features = feature_importance_df.head(10)['Feature'].tolist()

print(top_features)

# Selecionando apenas as melhores features nos conjuntos de treino e teste
X_train_selected = X_train_scaled[:, feature_importance_df.index.isin(top_features)]
X_test_selected = X_test_scaled[:, feature_importance_df.index.isin(top_features)]

Melhores features:
                 Feature  Importance
10             CabinInfo    0.356867
9             NameLength    0.119559
14         AgeGroup_Teen    0.105844
5               IsFemale    0.093367
6                   Port    0.077845
15  AgeGroup_Young Adult    0.066974
13       AgeGroup_Senior    0.057837
7                  Child    0.030595
11        AgeGroup_Adult    0.023002
1                    Age    0.016609
8             FamilySize    0.014961
12        AgeGroup_Child    0.014657
0                 Pclass    0.008676
3                  Parch    0.004445
4                   Fare    0.004393
2                  SibSp    0.004368
['CabinInfo', 'NameLength', 'AgeGroup_Teen', 'IsFemale', 'Port', 'AgeGroup_Young Adult', 'AgeGroup_Senior', 'Child', 'AgeGroup_Adult', 'Age']


In [258]:
def objective(trial):
    # Define os espaços de busca para os parâmetros
    criterion = trial.suggest_categorical('criterion', ['entropy', 'gini'])
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

    # Cria o modelo com os parâmetros sugeridos
    model_rf_best = RandomForestClassifier(
        criterion=criterion,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
        n_jobs=-1
    )

    # Avalia o modelo usando validação cruzada
    score = cross_val_score(model_rf_best, X_train_scaled, y_train, cv=10)
    mean_score = np.mean(score)

    return mean_score

# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=0))
# study.optimize(objective, n_trials=30)

# Obtém os melhores parâmetros encontrados
# best_params = study.best_params
# print("Melhores parâmetros encontrados:", best_params)

In [259]:
# Criando e treinando modelo de Random Forest
random_forest_model_final = RandomForestClassifier(criterion='entropy', n_estimators=100, max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=0)

# Treinando o modelo final
random_forest_model_final.fit(X_train_scaled, y_train)

# Fazendo previsões nos dados de treino
y_train_pred = random_forest_model_final.predict(X_train_scaled)

# Calculando a matriz de confusão e pontuação nos dados de treino
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
print("Matriz de Confusão (Treino):\n", confusion_matrix_train)

train_score = random_forest_model_final.score(X_train_scaled, y_train)
print("Acurácia nos dados de treino:", train_score)

# Fazendo previsões nos dados de teste
y_test_pred = random_forest_model_final.predict(X_test_scaled)

# Criando DataFrame de submissão
submission = pd.DataFrame(test_data['PassengerId'])
submission['Survived'] = y_test_pred

# Salvando o arquivo de submissão
submission.to_csv('submission_6.csv', index=False)

Matriz de Confusão (Treino):
 [[523  26]
 [ 94 248]]
Acurácia nos dados de treino: 0.8653198653198653
