In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import Packages

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn

In [61]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, mean_squared_error
import xgboost

### Import Data

In [4]:
train = pd.read_csv('D:/Users/owner/Desktop/Kaggle/train.csv')
test = pd.read_csv('D:/Users/owner/Desktop/Kaggle/test.csv')

In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [43]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
PassengerId        891 non-null int64
Survived           891 non-null int64
Pclass             891 non-null int64
Name               891 non-null object
Sex                891 non-null int32
Age                891 non-null float64
SibSp              891 non-null int64
Parch              891 non-null int64
Ticket             891 non-null object
Fare               891 non-null float64
Cabin              204 non-null object
Embarked           891 non-null int32
Title              891 non-null int32
Age_category       891 non-null int32
Family             891 non-null int64
Alone              891 non-null int64
fare_category      891 non-null int32
fare_per_person    891 non-null float64
cabin_or_not       891 non-null int64
dtypes: float64(3), int32(5), int64(8), object(3)
memory usage: 114.9+ KB


In [44]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 18 columns):
PassengerId        418 non-null int64
Pclass             418 non-null int64
Name               418 non-null object
Sex                418 non-null int64
Age                418 non-null float64
SibSp              418 non-null int64
Parch              418 non-null int64
Ticket             418 non-null object
Fare               417 non-null float64
Cabin              91 non-null object
Embarked           418 non-null int32
Title              418 non-null int32
Age_category       418 non-null int32
Family             418 non-null int64
Alone              418 non-null int64
fare_category      418 non-null int32
fare_per_person    417 non-null float64
cabin_or_not       418 non-null int64
dtypes: float64(3), int32(4), int64(8), object(3)
memory usage: 52.3+ KB


### Data Preprocessing

In [7]:
# Name

# collecting titles from their names
# import regular expression
# .group() 匹配的整個表達式的字符串，group() 可以一次輸入多個組號，在這種情況下它將返回一個包含那些組所對應值的元組。

import re
title = []
for i in range(0, len(train)):
    title.append(re.search(' ([A-Za-z]+)\.', train['Name'][i]).group(1))
title = pd.Series(title)

title=title.replace(['Countess', 'Capt', 'Col', 'Don', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Others')
title=title.replace('Mlle','Miss')
title=title.replace('Lady','Miss')
title=title.replace('Ms','Miss')
title=title.replace('Mme','Mrs')

title=title.replace('Sir','Mr')
title=title.replace('Dr','Mr')

title.value_counts()

Mr        525
Miss      186
Mrs       126
Master     40
Others     14
dtype: int64

In [8]:
# Give different title number
# .map(function, iterable,)
train['Title'] = title
dic_title = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Others": 5}
train['Title'] = train['Title'].map(dic_title).astype(int)

In [9]:
# Sex
dic_sex = {"female": 0, "male": 1}
train['Sex'] = train['Sex'].map(dic_sex).astype(int)

# Age
train['Age'].fillna(train['Age'].mean(),inplace=True)
# Divid Age to 6 group
# .qcut()
train['Age_category'] = pd.qcut(train['Age'], 6)
train['Age_category'] = train['Age_category'].astype('category').cat.codes.astype('int')

In [10]:
# Family_number - use "SibSp、Parch"
train['Family'] = train['SibSp'] + train['Parch'] + 1

# Passenger going alone
train['Alone'] = 0
train.loc[train['Family'] == 1, 'Alone'] = 1

In [11]:
# Ticket、Fare
# dividing fare into 5 catogories
train['fare_category'] = pd.qcut(train['Fare'], 5)
train['fare_category'] = train['fare_category'].astype('category').cat.codes.astype('int')

train['fare_per_person'] = train['Fare']/train['Family']

# Cabin - passenger has a cabin or not
train['cabin_or_not'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

# Embarked
# fillna all Na to 'S' - because S is most
train['Embarked'].fillna('S', inplace=True)

dic_embarked = {'S': 0, 'C': 1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(dic_embarked).astype(int)

In [13]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age_category,Family,Alone,fare_category,fare_per_person,cabin_or_not
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,0,1,1,2,0,0,3.625,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,3,4,2,0,4,35.64165,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,0,2,2,1,1,1,7.925,0


Use the same way to clean the test data

In [14]:
import re
title = []
for i in range(0, len(test)):
    title.append(re.search(' ([A-Za-z]+)\.', test['Name'][i]).group(1))
title = pd.Series(title)

title=title.replace(['Countess', 'Capt', 'Col', 'Don', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Others')
title=title.replace('Mlle','Miss')
title=title.replace('Lady','Miss')
title=title.replace('Ms','Miss')
title=title.replace('Mme','Mrs')
title=title.replace('Sir','Mr')
title=title.replace('Dr','Mr')

test['Title'] = title
test['Title'] = test['Title'].map(dic_title).astype(int)

In [15]:
test['Sex'] = test['Sex'].map(dic_sex)

In [16]:
test['Age'].fillna(test['Age'].mean(),inplace=True)
test['Age_category'] = pd.qcut(test['Age'], 6, duplicates="drop")
test['Age_category'] = test['Age_category'].astype('category').cat.codes.astype('int')

In [17]:
test['Family'] = test['SibSp'] + test['Parch'] + 1
test['Alone'] = 0
test.loc[test['Family'] == 1, 'Alone'] = 1

In [18]:
test['fare_category'] = pd.qcut(test['Fare'], 5)
test['fare_category'] = test['fare_category'].astype('category').cat.codes.astype('int')

In [19]:
test['fare_per_person'] = test['Fare']/test['Family']

In [20]:
test['cabin_or_not'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [21]:
test['Embarked'].fillna('S', inplace=True)
test['Embarked'] = test['Embarked'].map(dic_embarked).astype(int)

In [23]:
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Age_category,Family,Alone,fare_category,fare_per_person,cabin_or_not
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,2,1,3,1,1,1,7.8292,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,0,3,4,2,0,0,3.5,0
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,2,1,4,1,1,1,9.6875,0


Drop the data don't use, Define the final data

In [24]:
# Drop the data don't use, Define the final data
y_train = train['Survived']

In [25]:
X_train = train.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
X_test = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [26]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age_category,Family,Alone,fare_category,fare_per_person,cabin_or_not
0,3,1,22.0,1,0,7.25,0,1,1,2,0,0,3.625,0
1,1,0,38.0,1,0,71.2833,1,3,4,2,0,4,35.64165,1
2,3,0,26.0,0,0,7.925,0,2,2,1,1,1,7.925,0
3,1,0,35.0,1,0,53.1,0,3,4,2,0,4,26.55,1
4,3,1,35.0,0,0,8.05,0,1,4,1,1,1,8.05,0


In [27]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Age_category,Family,Alone,fare_category,fare_per_person,cabin_or_not
0,3,1,34.5,0,0,7.8292,2,1,3,1,1,1,7.8292,0
1,3,0,47.0,1,0,7.0,0,3,4,2,0,0,3.5,0
2,2,1,62.0,0,0,9.6875,2,1,4,1,1,1,9.6875,0
3,3,1,27.0,0,0,8.6625,0,1,2,1,1,1,8.6625,0
4,3,0,22.0,1,1,12.2875,0,3,1,3,0,2,4.095833,0


### Preprocessing Pipeline

In [28]:
num_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'Age_category',
           'Family', 'Alone', 'fare_category', 'fare_per_person', 'cabin_or_not']

# imputer
median_imputer = SimpleImputer(strategy='median')
frequent_imputer = SimpleImputer(strategy='most_frequent')

# scaler
std_scaler = StandardScaler()

# pipeline
num_pip = Pipeline([
    ('imputer', median_imputer),
    ('scaler', std_scaler)
])

# column trasnformer
col_transformer = ColumnTransformer([
    ('num', num_pip, num_cols),])

# transform X_train, X_test
X_train_tf = col_transformer.fit_transform(X_train)
X_test_tf = col_transformer.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape)

(891, 14) (418, 14)


### Modeling

In [29]:
X_train_tf, X_valid, y_train, y_valid = train_test_split(X_train_tf, y_train, shuffle=True)

#### Logistic Regression

In [57]:
log_model = LogisticRegression(C=25).fit(X_train_tf, y_train)
print(log_model.coef_)

[[-0.45053826 -1.1771197  -1.34542134 -0.5254053  -0.21003642  0.24381363
   0.18762934  0.43982341  0.83207481 -0.46402718 -0.30046618  0.14610232
  -0.12878098  0.3561598 ]]


In [58]:
accuracy_score(y_valid, log_model.predict(X_valid))

0.8026905829596412

In [60]:
# make predictions for test data -> Output
y_pred_log = log_model.predict(X_test_tf)
pd.DataFrame(y_pred_log).to_csv('D:/Users/owner/Desktop/y_pred_log.csv')

#### XGBoost

In [32]:
xgboost_grid_params = {
    'n_estimators':[650, 670, 700],
    'gamma': [0.001, 0.005, 0.01],
    'learning_rate':[0.01, 0.015],
    'max_depth':[4],
    'min_child_weight':[1]
}
model = GridSearchCV(
    xgboost.XGBClassifier(),
    param_grid = xgboost_grid_params,
    cv = 5,
    n_jobs=-1
)
model.fit(X_train_tf, y_train)

GridSearchCV(cv=5, estimator=XGBClassifier(), n_jobs=-1,
             param_grid={'gamma': [0.001, 0.005, 0.01],
                         'learning_rate': [0.01, 0.015], 'max_depth': [4],
                         'min_child_weight': [1],
                         'n_estimators': [650, 670, 700]})

In [33]:
xgboost_model = model.best_estimator_
print('XGBoost Model:')
print(model.best_params_)
print(model.best_score_)

XGBoost Model:
{'gamma': 0.001, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 670}
0.8397598473796432


In [35]:
accuracy_score(y_valid, model.predict(X_valid))

0.8026905829596412

In [45]:
# make predictions for test data -> Output
y_pred_xgboost = model.predict(X_test_tf)
pd.DataFrame(y_pred_xgboost).to_csv('D:/Users/owner/Desktop/y_pred_xgboost.csv')

Neural Networks