# Titanic Disaster - Classification

In [86]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pd.set_option('future.no_silent_downcasting', True)


In [88]:
#%% abrir o datase de treino e teste

train = pd.read_csv('./train.csv')
test  = pd.read_csv('./test.csv')


In [89]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [90]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [17]:
def oheInAttr(X, nameAttr):
    encoder = OneHotEncoder(sparse_output=False)
    X_Pclass = X[[nameAttr]]
    # Aplica o encoder e cria um dataframe com as colunas resultantes
    encoded_pclass = pd.DataFrame(encoder.fit_transform(X_Pclass), 
                                  columns=encoder.get_feature_names_out([nameAttr]))
    # Junta as colunas encodadas ao dataframe original
    X = pd.concat([X, encoded_pclass], axis=1)
    # Opcional: Remova a coluna original 'Pclass'
    X.drop(nameAttr, axis=1, inplace=True)
    return X

In [91]:
# Implementando o cross validation

# data and target
data, target = train.drop(columns = ['PassengerId', 'Survived']), train['Survived']
data_test = test.drop(columns = ['PassengerId'])

In [93]:
X_train = data.copy()
y_test = data_test.copy()


def features_create(X):    
    X['Age'] = X['Age'].fillna(X['Age'].mean())
    
    subs = {'female':1, 'male':0}
    X['mulher'] = X['Sex'].replace(subs)
    
    X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])
    
    subs = {'S':1, 'C':2, 'Q':3}
    X['porto'] = X['Embarked'].replace(subs)
    
    X['crianca'] = 1
    X['crianca'] = np.where(X['Age'] < 12, 1, 0)
    
    X['caroOrBarato'] = 1
    X['caroOrBarato'] = np.where(X['Fare'] > 50, 1, 0)

    
    # X = oheInAttr(X, "Sex")
    # X = oheInAttr(X, "Embarked")
    X = oheInAttr(X, "Pclass")
    
    
    featureDrop = ["Name", "Ticket", "Cabin", 'Embarked', "Sex"]
    X = X.drop(columns = featureDrop, axis=0)
    
    return X


X_train = features_create(X_train)
y_test  = features_create(y_test)

# Remove a coluna Fare
X_train = X_train.drop(columns = ['Fare'], axis=0)
y_test = y_test.drop(columns = ['Fare'], axis=0)


In [94]:
print(X_train.info())

print()

#verificar valores nulos ou NAN
print(X_train.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           891 non-null    float64
 1   SibSp         891 non-null    int64  
 2   Parch         891 non-null    int64  
 3   mulher        891 non-null    object 
 4   porto         891 non-null    object 
 5   crianca       891 non-null    int64  
 6   caroOrBarato  891 non-null    int64  
 7   Pclass_1      891 non-null    float64
 8   Pclass_2      891 non-null    float64
 9   Pclass_3      891 non-null    float64
dtypes: float64(4), int64(4), object(2)
memory usage: 69.7+ KB
None

Age             0
SibSp           0
Parch           0
mulher          0
porto           0
crianca         0
caroOrBarato    0
Pclass_1        0
Pclass_2        0
Pclass_3        0
dtype: int64


In [100]:
print(y_test.info())

print()

#verificar valores nulos ou NAN
print(y_test.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Age           418 non-null    float64
 1   SibSp         418 non-null    int64  
 2   Parch         418 non-null    int64  
 3   mulher        418 non-null    object 
 4   porto         418 non-null    object 
 5   crianca       418 non-null    int64  
 6   caroOrBarato  418 non-null    int64  
 7   Pclass_1      418 non-null    float64
 8   Pclass_2      418 non-null    float64
 9   Pclass_3      418 non-null    float64
dtypes: float64(4), int64(4), object(2)
memory usage: 32.8+ KB
None

Age             0
SibSp           0
Parch           0
mulher          0
porto           0
crianca         0
caroOrBarato    0
Pclass_1        0
Pclass_2        0
Pclass_3        0
dtype: int64


In [82]:
X_train.describe()

Unnamed: 0,Age,SibSp,Parch,crianca,caroOrBarato,Pclass_1,Pclass_2,Pclass_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,29.699118,0.523008,0.381594,0.076319,0.179574,0.242424,0.20651,0.551066
std,13.002015,1.102743,0.806057,0.265657,0.384047,0.42879,0.405028,0.497665
min,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.699118,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,35.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,80.0,8.0,6.0,1.0,1.0,1.0,1.0,1.0


In [101]:
y_test.describe()

Unnamed: 0,Age,SibSp,Parch,crianca,caroOrBarato,Pclass_1,Pclass_2,Pclass_3
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,30.27259,0.447368,0.392344,0.055024,0.191388,0.255981,0.222488,0.521531
std,12.634534,0.89676,0.981429,0.2283,0.393865,0.436934,0.416416,0.500135
min,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,30.27259,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,35.75,1.0,0.0,0.0,0.0,1.0,0.0,1.0
max,76.0,8.0,9.0,1.0,1.0,1.0,1.0,1.0


In [110]:
# KNN classifier model
knn = KNeighborsClassifier()

# K-fold (k=5)
scores = cross_val_score(knn, X_train, target, cv=5, scoring='accuracy')

print(scores.mean())

# Results
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

knn.fit(X_train, target)

predictions = knn.predict(y_test) 

print(y_test.sum())
print(predictions)

# Resultado com oneHotEncoder
# 0.6981357102504551
# Accuracy: 0.70 (+/- 0.05)

# Resultado com atribuição
# 0.6835791852363317
# Accuracy: 0.68 (+/- 0.09)

# Sem a coluna Fare e inclusão da criança
# 0.7677044755508129
# Accuracy: 0.77 (+/- 0.05)

export_prediction(predictions, "submission_knn_1.csv")

0.7766744083861653
Accuracy: 0.78 (+/- 0.05)
Age             12653.942771
SibSp                    187
Parch                    164
mulher                   152
porto                    612
crianca                   23
caroOrBarato              80
Pclass_1               107.0
Pclass_2                93.0
Pclass_3               218.0
dtype: object
[0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 1 1 0 1 0 0 1 0 1 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 1 1 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0 1 0
 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 

In [117]:
# X = pd.get_dummies(X_train[features])
# X_test = pd.get_dummies(X_train[features])


model_rfc = RandomForestClassifier(criterion = 'entropy', 
                                  n_estimators = 500, 
                                  max_depth = 5, 
                                  min_samples_split = 3, 
                                  min_samples_leaf = 1, 
                                  random_state = 0)

# K-fold (k=5)
scores = cross_val_score(model_rfc, X_train, target, cv=5, scoring='accuracy')

print(scores.mean())

# Results
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

model_rfc.fit(X_train, target)


predictions = model_rfc.predict(X_train) 

accuracy = accuracy_score(target, predictions)
print("Accuracy Score:", accuracy)

#predictions_test = model_rfc.predict(y_test) 

# print(y_test.sum())
# print(predictions)


# confusion_matrix(y_test, predictions)
#print(accuracy_score(y_test, predictions))

# Result
# 0.8215805661917018
# Accuracy: 0.82 (+/- 0.06)

# print(np.mean(score))

# model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# model.fit(X, y)

# predictions = model.predict(X_test)

# print(predictions)

#export_prediction(predictions, "submission_rfc_1.csv")

0.8215805661917018
Accuracy: 0.82 (+/- 0.06)
Accuracy Score: 0.8395061728395061


In [None]:
def export_prediction(prediction, name):
    output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': prediction})
    output.to_csv(name, index=False)
    print("Your submission was successfully saved!")