In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

In [2]:
titanic = pd.read_csv('/kaggle/input/titanic/train.csv')
sub_set = pd.read_csv('/kaggle/input/titanic/test.csv')

In [3]:
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
titanic.select_dtypes(include=['int64', 'float64']).describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
titanic.select_dtypes(include=['object']).describe()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


## **1 - Handling missing data**

There's missing data in 3 columns : 
- Age
- Cabin
- Embarked

There's too much missing data for Cabin, we will just drop the column, for Embarked, we will fill the NaN's by the most frequent value (or search for people with the same family name as the people with missing value)
For Age, we will replace the NaN's by the average of age.

In [7]:
titanic_cab = titanic
titanic_cab['Cabin'] = titanic_cab['Cabin'].isnull().astype('int64')

#She seems to be french

In [8]:
titanic_cab.loc[61, 'Embarked'] = 'C'

In [9]:
name_filter = titanic_cab['Name'].str.contains('Stone')
titanic_cab[name_filter]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
319,320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,0,C
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,0,


In [10]:
titanic_cab.loc[829, 'Embarked'] = titanic_cab['Embarked'].mode()[0]

In [11]:
titanic_cab['Age'] = titanic_cab['Age'].fillna(value = titanic_cab['Age'].mean())

In [12]:
titanic_cab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    int64  
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 83.7+ KB


In [13]:
dummies_sex = pd.get_dummies(titanic_cab['Sex'], dtype=int)
dummies_pclass = pd.get_dummies(titanic_cab['Pclass'], dtype=int)
dummy_cabin = pd.get_dummies(titanic_cab['Cabin'], dummy_na=True, dtype=int)

titanic_prp = pd.concat([titanic_cab, dummies_sex, dummies_pclass], axis=1)

titanic_prp['Embarked'] = pd.factorize(titanic_prp['Embarked'])[0] + 1

In [14]:
titanic_prp = titanic_prp.drop(columns=['Ticket', 'Name', 'Sex', 'Pclass', 'Cabin'])

In [15]:
def scale(df, column, method='normalize'):
    if method == 'scale':
        return (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    return (df[column] - df[column].mean()) / df[column].std()

titanic_prp['Age'] = scale(titanic_prp, 'Age', method='scale')
titanic_prp['Fare'] = scale(titanic_prp, 'Fare', method='scale')

In [16]:
titanic_prp

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Embarked,female,male,1,2,3
0,1,0,0.271174,1,0,0.014151,1,0,1,0,0,1
1,2,1,0.472229,1,0,0.139136,2,1,0,1,0,0
2,3,1,0.321438,0,0,0.015469,1,1,0,0,0,1
3,4,1,0.434531,1,0,0.103644,1,1,0,1,0,0
4,5,0,0.434531,0,0,0.015713,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,0.334004,0,0,0.025374,1,0,1,0,1,0
887,888,1,0.233476,0,0,0.058556,1,1,0,1,0,0
888,889,0,0.367921,1,2,0.045771,1,1,0,0,0,1
889,890,1,0.321438,0,0,0.058556,2,0,1,1,0,0


In [17]:
X = np.array(titanic_prp.drop(columns=['PassengerId', 'Survived']))
y = np.array(titanic_prp['Survived'])

In [18]:
X_train, X_, y_train, y_ = train_test_split(X, y, test_size=0.1)
X_val, X_test, y_val, y_test = train_test_split(X_, y_, test_size=0.5)

In [19]:
def feature_importance(model, features_names):
    return pd.DataFrame({'Feature': features_names, 'Importance': model.coef_.tolist()[0]}).set_index('Feature')

In [20]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier

boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.06, max_depth=5, random_state=0)

boost = boost.fit(X_train, y_train)

print(boost.score(X_test, y_test), cross_val_score(boost, X_val, y_val))

0.8222222222222222 [0.66666667 0.55555556 0.66666667 0.88888889 1.        ]


In [21]:
hist = HistGradientBoostingClassifier()

hist = hist.fit(X_train, y_train)

print(hist.score(X_test, y_test), cross_val_score(hist, X_val, y_val))

0.8444444444444444 [0.77777778 0.66666667 0.66666667 0.66666667 0.66666667]


In [22]:
forest = RandomForestClassifier(n_estimators=10000, max_depth=6, random_state=0)

forest = forest.fit(X_train, y_train)

print(forest.score(X_test, y_test), cross_val_score(forest, X_val, y_val))

0.8666666666666667 [0.77777778 0.66666667 0.77777778 0.77777778 1.        ]


In [23]:
sub_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [24]:
sub_set_p = sub_set
sub_set_p['Cabin'] = sub_set_p['Cabin'].isnull().astype('int64')
sub_set_p['Age'] = sub_set_p['Age'].fillna(value = titanic_cab['Age'].mean())
sub_set_p['Fare'] = sub_set_p['Fare'].fillna(value = titanic_cab['Fare'].mean())
dummies_sex = pd.get_dummies(sub_set_p['Sex'], dtype=int)
dummies_pclass = pd.get_dummies(sub_set_p['Pclass'], dtype=int)
dummy_cabin = pd.get_dummies(sub_set_p['Cabin'], dummy_na=True, dtype=int)


sub_set_p = pd.concat([sub_set_p, dummies_sex, dummies_pclass], axis=1)

sub_set_p['Embarked'] = pd.factorize(sub_set_p['Embarked'])[0] + 1

sub_set_p = sub_set_p.drop(columns=['Ticket', 'Name', 'Sex', 'Pclass', 'Cabin'])

sub_set_p['Age'] = scale(sub_set_p, 'Age', method='scale')
sub_set_p['Fare'] = scale(sub_set_p, 'Fare', method='scale')

In [25]:
sub_set_p

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Embarked,female,male,1,2,3
0,892,0.452723,0,0,0.015282,1,0,1,0,0,1
1,893,0.617566,1,0,0.013663,2,1,0,0,0,1
2,894,0.815377,0,0,0.018909,1,0,1,0,1,0
3,895,0.353818,0,0,0.016908,2,0,1,0,0,1
4,896,0.287881,1,1,0.023984,2,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0.389412,0,0,0.015713,2,0,1,0,0,1
414,1306,0.512066,0,0,0.212559,3,1,0,1,0,0
415,1307,0.505473,0,0,0.014151,2,0,1,0,0,1
416,1308,0.389412,0,0,0.015713,2,0,1,0,0,1


In [26]:
id = np.array(sub_set_p['PassengerId'])

In [27]:
sub_set_p.drop(['PassengerId'], axis=1, inplace=True)

In [28]:
X_sub = np.array(sub_set_p)

In [29]:
forest = RandomForestClassifier(n_estimators=10000, max_depth=6, random_state=0)

model = forest.fit(X, y)

predictions = model.predict(X_sub)

In [30]:
submission = pd.DataFrame({'PassengerId': id, 'Survived': predictions}).set_index('PassengerId')
submission.to_csv('titanic.csv')