In [None]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(file_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [None]:
# Read the training and test CSV files into DataFrames, then display the
# training frame as the cell output.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Count the missing values in each column of the training data.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Count the missing values in each column of the test data.
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# Columns removed from both frames before modelling.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']

# Target and feature matrix for training; the target column is removed from
# the features as well.
y_train = train_data['Survived']
x_train = train_data.drop(columns=['Survived'] + unused_columns)

# Keep the test passenger ids aside for the submission file.
ids = test_data['PassengerId']
x_test = test_data.drop(columns=unused_columns)

x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [None]:
# Encode the two categorical features as integer codes. Each encoder is fitted
# on the training column and reused on the test column so that both frames
# share the same category-to-code mapping.
labelEncoder1 = LabelEncoder()
x_train['Sex'] = labelEncoder1.fit_transform(x_train['Sex'])
x_test['Sex'] = labelEncoder1.transform(x_test['Sex'])

# The training data has missing Embarked values. LabelEncoder has no
# missing-value handling, so a NaN would not survive encoding as NaN and the
# imputer further down would never treat it as missing. Fill the gaps with the
# most frequent training value first; the same value is used for the test frame
# so no statistic is taken from test data.
embarked_mode = x_train['Embarked'].mode()[0]
x_train['Embarked'] = x_train['Embarked'].fillna(embarked_mode)
x_test['Embarked'] = x_test['Embarked'].fillna(embarked_mode)

labelEncoder2 = LabelEncoder()
x_train['Embarked'] = labelEncoder2.fit_transform(x_train['Embarked'])
x_test['Embarked'] = labelEncoder2.transform(x_test['Embarked'])
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.2500,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.9250,2
3,1,0,35.0,1,0,53.1000,2
4,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,2
887,1,0,19.0,0,0,30.0000,2
888,3,0,,1,2,23.4500,2
889,1,1,26.0,0,0,30.0000,0


In [None]:
# Rescale Age and Fare to the [0, 1] range seen in the training data.
# The scaler is fitted on the training frame only and then applied unchanged to
# the test frame: refitting on the test data would put the two frames on
# different scales and discard the training statistics.
# Missing values are ignored when fitting and stay NaN in the output, so they
# are still available for imputation afterwards.
scaled_columns = ["Age", "Fare"]
minmax = MinMaxScaler()
x_train[scaled_columns] = minmax.fit_transform(x_train[scaled_columns])
x_test[scaled_columns] = minmax.transform(x_test[scaled_columns])

In [None]:
# Fill the remaining missing values. The imputer is fitted on the training
# features and then applied to the test features. Its output is a plain array,
# so the column names are re-attached when rebuilding the DataFrames.
feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
imputer = IterativeImputer(random_state=42, verbose=1)

train_filled = imputer.fit_transform(x_train)
test_filled = imputer.transform(x_test)

train_imputed = pd.DataFrame(train_filled, columns=feature_names)
test_imputed = pd.DataFrame(test_filled, columns=feature_names)

[IterativeImputer] Completing matrix with shape (891, 7)
[IterativeImputer] Change: 0.4633551783232223, scaled tolerance: 0.008 
[IterativeImputer] Change: 0.0, scaled tolerance: 0.008 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (418, 7)


In [None]:
# Optional hyper-parameter search for the random forest.
# Disabled by default; set run_gs to True to fit every combination in
# parameter_grid with 5-fold stratified cross-validation.
run_gs = False

if run_gs:
    parameter_grid = {
        'max_depth': [2, 4, 6],
        'n_estimators': [100, 50],
        'criterion': ['entropy', 'gini'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 3, 6],
    }
    # Seed the forest like every other estimator in this notebook so that a
    # repeated search reports the same best score and parameters.
    forest = RandomForestClassifier(random_state=42)
    cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        forest,
        scoring='accuracy',
        param_grid=parameter_grid,
        cv=cross_validation,
        verbose=1,
    )

    grid_search.fit(train_imputed, y_train)
    # Keep the fitted search and its winning parameters available to later cells.
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Recursive feature elimination with cross-validation: drop one feature per
# step and score each subset with a seeded random forest on 5 stratified folds.
rfe_forest_settings = dict(
    n_estimators=50,
    criterion='entropy',
    max_depth=6,
    min_samples_split=2,
    min_samples_leaf=6,
    bootstrap=True,
    random_state=42,
)
estimator = RandomForestClassifier(**rfe_forest_settings)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

selector = RFECV(estimator, step=1, cv=cv, min_features_to_select=1).fit(train_imputed, y_train)

# Boolean mask over the input columns: True marks a feature that was kept.
selector.support_

array([ True,  True,  True,  True, False,  True, False])

In [None]:
# Remove the features the RFECV selector rejected, taken from its support mask
# rather than a hand-copied list so the two cannot drift apart.
# The names come from x_train, which still carries the seven columns (in the
# same order) that the selector was fitted on. errors='ignore' keeps the cell
# safe to re-run after the columns are already gone.
rejected_features = x_train.columns[~selector.support_]
train_imputed = train_imputed.drop(columns=rejected_features, errors='ignore')
test_imputed = test_imputed.drop(columns=rejected_features, errors='ignore')

In [None]:
# Train the final random forest on the selected training features and predict
# survival for the test passengers.
final_forest_settings = {
    'n_estimators': 50,
    'criterion': 'entropy',
    'max_depth': 6,
    'min_samples_split': 2,
    'min_samples_leaf': 6,
    'bootstrap': True,
    'random_state': 42,
}
clf = RandomForestClassifier(**final_forest_settings)
clf.fit(train_imputed, y_train)

predictions = clf.predict(test_imputed)
print(predictions)

[0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1
 1 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 0]


In [None]:
# Pair each test passenger id with its predicted label and write the result,
# without the index column, to the submission file.
output = ids.to_frame(name='PassengerId').assign(Survived=predictions)
output.to_csv('submission.csv', index=False)