In [91]:
import pandas as pd

url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

df = pd.read_csv(url)
df.dtypes

Unnamed: 0,0
PassengerId,int64
Survived,int64
Pclass,int64
Name,object
Sex,object
Age,float64
SibSp,int64
Parch,int64
Ticket,object
Fare,float64


In [92]:
# Removendo as colunas irrelevantes

df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)
df.dtypes

Unnamed: 0,0
Survived,int64
Pclass,int64
Sex,object
Age,float64
SibSp,int64
Parch,int64
Fare,float64
Embarked,object


In [93]:
# Tratando valores ausentes

df["Age"] = df["Age"].fillna(df["Age"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df.dtypes

Unnamed: 0,0
Survived,int64
Pclass,int64
Sex,object
Age,float64
SibSp,int64
Parch,int64
Fare,float64
Embarked,object


In [94]:
# Transformando as colunas

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["Sex"] = le.fit_transform(df["Sex"])
df["Embarked"] = le.fit_transform(df["Embarked"])

In [95]:
# Definindo os features e os targets

X = df.drop("Survived", axis=1)
y = df["Survived"]

In [96]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [97]:
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [98]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

X

array([[ 0.82737724,  0.73769513, -0.56573646, ..., -0.47367361,
        -0.50244517,  0.58595414],
       [-1.56610693, -1.35557354,  0.66386103, ..., -0.47367361,
         0.78684529, -1.9423032 ],
       [ 0.82737724, -1.35557354, -0.25833709, ..., -0.47367361,
        -0.48885426,  0.58595414],
       ...,
       [ 0.82737724, -1.35557354, -0.1046374 , ...,  2.00893337,
        -0.17626324,  0.58595414],
       [-1.56610693,  0.73769513, -0.25833709, ..., -0.47367361,
        -0.04438104, -1.9423032 ],
       [ 0.82737724,  0.73769513,  0.20276197, ..., -0.47367361,
        -0.49237783, -0.67817453]])

In [99]:
# Dividindo os conjuntos de treinamento (80%) e teste (20%)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [100]:
# Treinando e testando com modelos ensemble

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("RandomForest:", accuracy_score(y_test, rf.predict(X_test)))

RandomForest: 0.8156424581005587


In [101]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
print("GradientBoosting:", accuracy_score(y_test, gb.predict(X_test)))

GradientBoosting: 0.8100558659217877


In [102]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# AdaBoost
ab = AdaBoostClassifier(random_state=42)
ab.fit(X_train, y_train)
print("AdaBoost:", accuracy_score(y_test, ab.predict(X_test)))

AdaBoost: 0.7988826815642458


In [103]:
# Testando com voting Classifiers

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

classificador1 = LogisticRegression(random_state=42, max_iter=1000)
classificador2 = RandomForestClassifier(random_state=42)
classificador3 = SVC(probability=True, random_state=42)

voting_hard = VotingClassifier(estimators=[('lr', classificador1), ('rf', classificador2), ('svc', classificador3)], voting='hard')
voting_hard.fit(X_train, y_train)
hard_acc = accuracy_score(y_test, voting_hard.predict(X_test))

voting_soft = VotingClassifier(estimators=[('lr', classificador1), ('rf', classificador2), ('svc', classificador3)], voting='soft')
voting_soft.fit(X_train, y_train)
soft_acc = accuracy_score(y_test, voting_soft.predict(X_test))

print('Voting Hard', hard_acc)
print('Voting Soft', soft_acc)

Voting Hard 0.8100558659217877
Voting Soft 0.8156424581005587


In [104]:
# Testando com Stacking Classifiers

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

base_learners = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('ab', AdaBoostClassifier(random_state=42))
]

meta_model = LogisticRegression(random_state=42, max_iter=1000)

stacking = StackingClassifier(estimators=base_learners, final_estimator=meta_model, cv=5)
stacking.fit(X_train, y_train)
stack_acc = accuracy_score(y_test, stacking.predict(X_test))

print('Stacking', stack_acc)

Stacking 0.8268156424581006
