In [1]:
import numpy as np
import pandas as pd

In [3]:
trainx = pd.read_csv('../input/titanic/train.csv')

In [4]:
testx = pd.read_csv('../input/titanic/test.csv')

In [5]:
trainy = np.array(trainx['Survived'].copy())

In [6]:
testid = np.array(testx['PassengerId'].copy())

In [7]:
trainx.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
labels = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

In [9]:
trainx = trainx[labels]

In [10]:
testx = testx[labels]

In [11]:
trainx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [12]:
testx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [13]:
trainx.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


In [14]:
from sklearn.pipeline import Pipeline

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
from sklearn.impute import SimpleImputer

In [17]:
pipelinenum = Pipeline([
    ('num_imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
pipelinecat = Pipeline([
    ('cat_imputer', SimpleImputer(strategy="most_frequent")),
    ('encode', OneHotEncoder())
])

In [20]:
from sklearn.compose import ColumnTransformer

In [21]:
from sklearn.preprocessing import PolynomialFeatures

In [22]:
labelnum = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

In [23]:
labelcat = ["Sex", "Embarked"]

In [24]:
pipelinefull = ColumnTransformer([
    ("num_trans", pipelinenum, labelnum),
    ("cat_trans", pipelinecat, labelcat)
])

In [25]:
trainxfinal = pipelinefull.fit_transform(trainx)

In [26]:
def modelfit(model):
    model.fit(trainxfinal, trainy)
    pred = model.predict(trainxfinal) 
    print("Train accuracy: ", accuracy_score(pred, trainy))
    cv_scores = cross_val_score(model, trainxfinal, trainy, cv=3, scoring="accuracy")
    print("Validation accuracy: ", cv_scores.mean())

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
from sklearn.model_selection import cross_val_score

In [30]:
reg = LogisticRegression()

In [31]:
modelfit(reg)

Train accuracy:  0.7991021324354658
Validation accuracy:  0.7867564534231201


In [32]:
polyreg = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3, include_bias=False)),
    ("reg", LogisticRegression(penalty='elasticnet', l1_ratio=0.7, max_iter=100000, solver='saga')),
])

In [33]:
modelfit(polyreg)

Train accuracy:  0.8338945005611672
Validation accuracy:  0.8148148148148149


In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
forest = RandomForestClassifier()

In [36]:
modelfit(forest)

Train accuracy:  0.9797979797979798
Validation accuracy:  0.7934904601571269


In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
grid = [
    {'n_estimators':[150, 200, 250], 'max_features':[5, 10], 'max_depth':[5, 10, 20], 
     'min_samples_leaf':[5, 10, 20], 'min_samples_split':[5, 10, 20]}
]

In [39]:
forest = RandomForestClassifier()

In [40]:
gridsearch = GridSearchCV(forest, grid, cv=3, scoring='accuracy', return_train_score=True)

In [41]:
gridsearch.fit(trainxfinal, trainy)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid=[{'max_depth': [5, 10, 20], 'max_features': [5, 10],
                          'min_samples_leaf': [5, 10, 20],
                          'min_samples_split': [5, 10, 20],
                          'n_estimators': [150, 200, 250]}],
             return_train_score=True, scoring='accuracy')

In [42]:
bestsearch = gridsearch.best_estimator_

In [43]:
print(bestsearch)

RandomForestClassifier(max_depth=10, max_features=10, min_samples_leaf=5,
                       min_samples_split=5, n_estimators=150)


In [44]:
gridsearch.best_params_

{'max_depth': 10,
 'max_features': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'n_estimators': 150}

In [45]:
modelfit(bestsearch)

Train accuracy:  0.8843995510662177
Validation accuracy:  0.8249158249158249


In [46]:
from sklearn.neighbors import KNeighborsClassifier

In [47]:
kneighbors = KNeighborsClassifier(20)

In [48]:
modelfit(kneighbors)

Train accuracy:  0.8249158249158249
Validation accuracy:  0.8159371492704827


In [49]:
from sklearn.ensemble import VotingClassifier

In [51]:
vote = VotingClassifier(
    estimators=[('poly_logres', polyreg), ('grid_randomforest', bestsearch), 
                ('kneighbors', kneighbors)],
    voting='hard'
)

In [52]:
vote.fit(trainxfinal, trainy)

VotingClassifier(estimators=[('poly_logres',
                              Pipeline(steps=[('poly_features',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('reg',
                                               LogisticRegression(l1_ratio=0.7,
                                                                  max_iter=100000,
                                                                  penalty='elasticnet',
                                                                  solver='saga'))])),
                             ('grid_randomforest',
                              RandomForestClassifier(max_depth=10,
                                                     max_features=10,
                                                     min_samples_leaf=5,
                                                     min_samples_split=5,
 

In [53]:
predicti = vote.predict(trainxfinal)

In [54]:
print(accuracy_score(predicti, trainy))

0.8619528619528619


In [55]:
cvscores = cross_val_score(vote, trainxfinal, trainy, cv=3, scoring="accuracy")

In [56]:
cvscores.mean()

0.8294051627384961

In [57]:
testxfinal = pipelinefull.transform(testx)

In [58]:
vote.fit(trainxfinal, trainy)

VotingClassifier(estimators=[('poly_logres',
                              Pipeline(steps=[('poly_features',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('reg',
                                               LogisticRegression(l1_ratio=0.7,
                                                                  max_iter=100000,
                                                                  penalty='elasticnet',
                                                                  solver='saga'))])),
                             ('grid_randomforest',
                              RandomForestClassifier(max_depth=10,
                                                     max_features=10,
                                                     min_samples_leaf=5,
                                                     min_samples_split=5,
 

In [59]:
test_pred = vote.predict(testxfinal)

In [60]:
submit = pd.DataFrame({
    'PassengerId':testid,
    'Survived':test_pred
})

In [61]:
submit.to_csv("results.csv", index=False)