In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# The three competition CSVs are read from the same input folder.
titanic_dir = '/kaggle/input/titanic'
sub, train_df, test_df = (
    pd.read_csv(f'{titanic_dir}/{csv_name}')
    for csv_name in ('gender_submission.csv', 'train.csv', 'test.csv')
)

In [None]:
# Preview the first rows of the frame loaded from gender_submission.csv.
sub.head()

In [None]:
# Number of training rows in each Survived class.
train_df.Survived.value_counts()

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Stack train and test so feature engineering is applied to both at once.
# ignore_index=True gives every row a unique label. Keeping the original
# labels would repeat 0..len(test_df)-1, which makes label-based selection and
# index alignment ambiguous. Train rows still come first, so positional slices
# such as df[:len(train_df)] are unaffected.
df = pd.concat([train_df, test_df], ignore_index=True)
df.info()

In [None]:
# Tally how many columns of each dtype the three frames contain.
for header, frame in (
    ('==train_df==\n', train_df),
    ('\n==test_df==\n', test_df),
    ('\n==df==\n', df),
):
    print(header, frame.dtypes.value_counts())

In [None]:
#Function to output the shape of the data frame
def df_shape(df):
    """Print a missing-value summary of ``df``.

    Reports the overall share of missing cells, the original shape, and how
    many rows / columns would survive ``dropna(axis=0)`` / ``dropna(axis=1)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    total_cells = np.prod(df.shape)
    total_missing = df.isnull().sum().sum()
    # An empty frame has zero cells; report 0% instead of dividing by zero.
    missing_pct = total_missing / total_cells * 100 if total_cells else 0.0

    # Compute each dropna variant once and reuse it for all the prints below.
    rows_kept = df.dropna(axis=0)
    cols_kept = df.dropna(axis=1)

    print('missing_parcent : {:.2f}%'.format(missing_pct))
    print('original shape :',df.shape)
    print('\n==Use dropna axis=0==')
    print('shape :',rows_kept.shape)
    print('Rows in original : %d' % df.shape[0])
    print('Rows with na\'s dropped %d' % rows_kept.shape[0])
    print('Loss Rows :',df.shape[0]-rows_kept.shape[0])
    print('\n==Use dropna axis=1==')
    print('shape :',cols_kept.shape)
    print('Columns in original : %d' % df.shape[1])
    print('Columns with na\'s dropped %d' % cols_kept.shape[1])
    print('Loss Columns :',df.shape[1]-cols_kept.shape[1])

In [None]:
# Missing-value / dropna summary for train_df
df_shape(train_df)

In [None]:
# Missing-value / dropna summary for test_df
df_shape(test_df)

In [None]:
# Missing-value / dropna summary for the combined frame df
df_shape(df)

In [None]:
#List the features that contain missing values together with their missing rate
def get_missing(df):
    """Return ``(column, missing_percent)`` pairs for columns with at least one null.

    Columns are reported in their original order; columns without missing
    values are left out.
    """
    n_rows = len(df)
    report = []
    for col in df.columns.tolist():
        n_null = df[col].isnull().sum()
        if n_null > 0:
            report.append((col, (n_null / n_rows) * 100))
    return report

# One "feature / missing %" table per frame, highest ratio first.
missing_df, missing_train_df, missing_test_df = (
    pd.DataFrame(get_missing(frame), columns=["features", "missing_parcent"])
      .sort_values(by='missing_parcent', ascending=False)
    for frame in (df, train_df, test_df)
)

fig, ax = plt.subplots(1, 3, figsize=(19, 2), dpi=80)
# Panels, left to right: train_df, test_df, combined df.
panels = [
    (ax[2], missing_df, 'Rotio of missing velues of df'),
    (ax[1], missing_test_df, 'Rotio of missing velues of test_df'),
    (ax[0], missing_train_df, 'Rotio of missing velues of train_df'),
]
for axis, table, title in panels:
    sns.barplot(x='missing_parcent', y='features', data=table, ax=axis)
    axis.set_title(title, fontsize=12)

print('==train_df==', missing_train_df)
print('\n==test_df==', missing_test_df)
print('\n==df==', missing_df)

In [None]:
# Baseline design matrix: labelled rows only, numeric columns only.
# Rows without a Survived value are dropped, then the target is split off.
X = df.dropna(subset=["Survived"], axis=0).copy()
y = X.pop('Survived')
# Fill Age by assigning the result back. The previous
# `X.Age.fillna(..., inplace=True)` worked on an intermediate Series, which
# pandas 2.x warns about and which stops updating X under Copy-on-Write.
X = X.select_dtypes(exclude='object').assign(
    Age=lambda frame: frame["Age"].fillna(frame["Age"].median())
)
X.isnull().sum()

In [None]:
#Check the function score
from sklearn.feature_selection import mutual_info_classif

# Integer-typed columns are flagged as discrete for the MI estimator.
discrete_features = (X.dtypes == int)


def make_mi_score(X, y, discrete_features):
    """Return the mutual-information score of every column of X against y.

    The result is a Series named 'MI Score', indexed by column name and
    sorted from highest to lowest score.
    """
    raw_scores = mutual_info_classif(X, y, discrete_features=discrete_features)
    return (
        pd.Series(raw_scores, name='MI Score', index=X.columns)
          .sort_values(ascending=False)
    )


mi_score = make_mi_score(X, y, discrete_features)

def plot_mi_score(scores, ax=None):
    """Draw a horizontal bar chart of mutual-information scores.

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name. They are sorted ascending before
        plotting, so the highest score ends up at the top of the chart.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current pyplot axes, which is what
        the previous pyplot-based implementation always used.

    Returns
    -------
    None
    """
    # The original assigned the sorted values to a misspelled name (`socres`)
    # and sorted the global `mi_score`, so the argument was never sorted.
    scores = scores.sort_values(ascending=True)
    if ax is None:
        ax = plt.gca()
    positions = np.arange(len(scores))
    ax.barh(positions, scores)
    ax.set_yticks(positions)
    ax.set_yticklabels(list(scores.index))
    ax.set_title('Mutual Infomation Scores', fontsize=14)
# Stand-alone MI plot, currently disabled:
# plt.figure(figsize=(10,4),dpi=100)
# plot_mi_score(mi_score)
# Wrap the scores in a DataFrame; the cell's last expression displays it.
mi_score_df = pd.DataFrame(mi_score)
mi_score_df

In [None]:
#rfc feature importance score
from sklearn.ensemble import RandomForestClassifier

# Fit a shallow random forest on the numeric baseline features and rank the
# columns by importance (argsort reversed -> most important first).
rfc = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=1)
rfc.fit(X,y)
fi = rfc.feature_importances_
idx = np.argsort(fi)[::-1]
top_cols,top_importances = X.columns.values[idx],fi[idx]
fig,ax = plt.subplots(1,2,figsize=(14,3),dpi=90)


# plot_mi_score draws through the pyplot state machine, so it lands on the
# current axes - presumably the right-hand panel created just above (verify).
plot_mi_score(mi_score.sort_values(ascending=True))
sns.barplot(x=top_importances,y=top_cols,ax=ax[0])
ax[0].set_title('Rfc Feature Importances',fontsize=14)
# Importances indexed by feature name, joined onto the MI score table.
importance_df= pd.DataFrame(top_importances,top_cols).rename(columns={0:'importance'})
feature_selection_df = mi_score_df.join(importance_df)


# NOTE(review): bare expression in the middle of a cell - its value is discarded.
X.columns.tolist()
# Numeric columns plus the target, restricted to rows that have a Survived value.
dfdf = df[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare','Survived']].dropna(subset=["Survived"],axis=0)
plt.figure(figsize=(10,3),dpi=90)
sns.heatmap(dfdf.corr().round(2),annot=True,
            fmt='g',cmap='Blues')
plt.title('Correlation of features',fontsize=14)


#Variance inflation factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()

# One VIF value per column of X, in column order.
vif["Vif_factor"] = [variance_inflation_factor(X.values,i) for  i in range(X.shape[1])]
# NOTE(review): bare expression in the middle of a cell - nothing is displayed.
vif["Vif_factor"]

# Index the VIF table by feature name so it can be joined below.
vif["features"] = X.columns
vif = vif.set_index('features')
# NOTE(review): bare expression in the middle of a cell - nothing is displayed.
vif

# Final table: MI score, forest importance and VIF side by side.
feature_selection_df = feature_selection_df.join(vif)
feature_selection_df

In [None]:
# Display the joined feature-selection table again.
feature_selection_df

In [None]:
# Missing values per column in the combined frame.
df.isnull().sum()

In [None]:
# Histograms and summary statistics of the two numeric columns to impute.
cols = ["Fare", "Age"]
fig, ax = plt.subplots(1, 2, figsize=(10, 3))

for axis, c in zip(ax, cols):
    axis.hist(df[c], bins=20)
    axis.set_title(f'distribution of {c}')

print('Fare\n', df["Fare"].describe())
print('\nAge\n', df["Age"].describe())

In [None]:
# Median-impute the columns listed in `cols`.
# Assign the filled column back: `df[c].fillna(..., inplace=True)` modifies an
# intermediate Series, which pandas 2.x warns about and which no longer
# updates df once Copy-on-Write is enabled.
for c in cols:
    df[c] = df[c].fillna(df[c].median())
df.isnull().sum()

In [None]:
# Fill missing Embarked values with 'S' (presumably the most frequent port -
# confirm with df.Embarked.mode()). Assigned back rather than calling
# fillna(inplace=True) on the attribute, which is a chained in-place call that
# does not update df under pandas Copy-on-Write.
df["Embarked"] = df["Embarked"].fillna('S')

In [None]:
# log(1 + Fare), computed with the vectorised ufunc instead of a per-row apply.
df["Log_Fare"] = np.log1p(df["Fare"])

fig,ax = plt.subplots(1,2,figsize=(10,3))

# Density of the raw and the log-transformed fare, side by side.
cols = ["Fare","Log_Fare"]
for i,c in enumerate(cols):
    # `fill=` is the current spelling of the deprecated `shade=` argument.
    sns.kdeplot(data=df[c],fill=True,ax=ax[i])

In [None]:
from  xgboost import XGBClassifier 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Hold-out split of the current X / y, stratified on the target.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=2,
)


def score_dataset(X,y,model = XGBClassifier()):
    """Return the mean 5-fold cross-validated accuracy as a formatted string.

    The default model is a single XGBClassifier created when this function is
    defined. NOTE: a later cell redefines `score_dataset` with a different
    signature.
    """
    fold_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_pct = fold_scores.mean() * 100
    return 'accuracy :{:.2f}%'.format(mean_pct)

In [None]:
# Approach 1: cross-validated accuracy using the X / y currently in scope
score_dataset(X,y)

In [None]:
# Missing values remaining in the object-dtype columns.
df.select_dtypes('object').isnull().sum()

In [None]:
# Object-dtype columns of the combined frame.
# A pasted copy of an earlier output used to follow this line as a bare list
# literal; being the last expression, it was displayed instead of the computed
# result. It is kept here only as a comment:
#   ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
df.select_dtypes('object').columns.tolist()

In [None]:
# Non-object columns of the combined frame.
# The bare list literal that used to follow this line was a pasted earlier
# output; it masked the live result and predates columns added since
# (for example Log_Fare). Earlier snapshot, kept for reference:
#   ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
df.select_dtypes(exclude='object').columns.tolist()

In [None]:
#Visualize the relationship between features and the target
cols_1 = ['Sex','Embarked','Pclass','SibSp','Parch']
cols_2 = ['Age','Fare']

# Count plots for the low-cardinality features, split by Survived.
fig, ax = plt.subplots(1, 5, figsize=(25, 4))
for axis, c in zip(ax, cols_1):
    sns.countplot(x=c, hue='Survived', data=df, ax=axis)

# Overlaid histograms for the continuous features, one layer per outcome.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
hist_options = {
    'Age': dict(bins=16),
    'Fare': dict(range=(0, 350), bins=35),
}
for axis, col in zip(ax, cols_2):
    for outcome in (0, 1):
        values = df.loc[df["Survived"] == outcome, col].dropna()
        axis.hist(values, alpha=0.5, label=str(outcome), **hist_options[col])
    axis.legend(loc='upper right', title='Survived')
    axis.grid(ls=':')

ax[0].set_title('Relatioship of Survived & Age',fontsize=14)
ax[1].set_title('Relatioship of Survived & Fare',fontsize=14)

In [None]:
# Try to engineer some more informative features.
# Age is cut into 7 bands; labels=False stores the band number (0..6).
edge_bins = [-np.inf, 5, 10, 25, 30, 52, 65, np.inf]
df["Age_bin"] = pd.cut(df["Age"], bins=edge_bins, labels=False)

fig, ax = plt.subplots(1, 2, figsize=(22, 5), dpi=100)
count_ax, corr_ax = ax

sns.countplot(x="Age_bin", hue='Survived', data=df, ax=count_ax)
count_ax.set_title('Relationship of Survuved & Age_bin', fontsize=14)
count_ax.grid(ls=':')

# Correlation matrix of all non-object columns, including the new one.
numeric_corr = df.select_dtypes(exclude='object').corr().round(3)
sns.heatmap(numeric_corr, annot=True, fmt='g', cmap='Blues', ax=corr_ax)
corr_ax.set_title('Correlation of each features', fontsize=14)

In [None]:
# FamilySize = SibSp + Parch + 1
df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)

fig, ax = plt.subplots(1, 2, figsize=(22, 5), dpi=100)
count_ax, corr_ax = ax

sns.countplot(x="FamilySize", hue='Survived', data=df, ax=count_ax)
count_ax.set_title('Relationship of Survuved & FamilySize', fontsize=14)
count_ax.grid(ls=':')

# Correlation matrix of all non-object columns, including the new one.
numeric_corr = df.select_dtypes(exclude='object').corr().round(3)
sns.heatmap(numeric_corr, annot=True, fmt='g', cmap='Blues', ax=corr_ax)
corr_ax.set_title('Correlation of each features', fontsize=14)

# Author's note: creating effective features is difficult at this stage.

In [None]:
# Cut FamilySize into 4 bands; labels=False stores the band number (0..3).
edge_bins = [-np.inf, 1, 4, 7, np.inf]
df["FamilySize_bin"] = pd.cut(df["FamilySize"], bins=edge_bins, labels=False)

fig, ax = plt.subplots(1, 2, figsize=(22, 5), dpi=100)
count_ax, corr_ax = ax

sns.countplot(x='FamilySize_bin', hue='Survived', data=df, ax=count_ax)
count_ax.set_title('Relatonship of Survived & FamilySize_bin', fontsize=14)
count_ax.legend(loc='upper right', title='Survived')
count_ax.grid(ls=':')

# Correlation matrix of all non-object columns, including the new one.
numeric_corr = df.select_dtypes(exclude='object').corr().round(3)
sns.heatmap(numeric_corr, annot=True, fmt='g', cmap='Blues', ax=corr_ax)
corr_ax.set_title('Correlation of each features', fontsize=14)

In [None]:
# Number of rows sharing each Ticket value.
df.Ticket.value_counts()

In [None]:
# For every row, the number of rows in df that carry the same Ticket value.
rows_per_ticket = df.groupby('Ticket')['PassengerId']
df["Ticket_count"] = rows_per_ticket.transform('count')
df["Ticket_count"].value_counts()

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(22, 5), dpi=100)
count_ax, corr_ax = ax

# Left: outcome counts per Ticket_count value.
sns.countplot(x='Ticket_count', hue='Survived', data=df, ax=count_ax)
count_ax.set_title('Relatonship of Survived & Ticket_count', fontsize=14)
count_ax.legend(loc='upper right', title='Survived')
count_ax.grid(ls=':')

# Right: correlation matrix of all non-object columns.
numeric_corr = df.select_dtypes(exclude='object').corr().round(3)
sns.heatmap(numeric_corr, annot=True, fmt='g', cmap='Blues', ax=corr_ax)
corr_ax.set_title('Correlation of each features', fontsize=14)

In [None]:
# Display limits can be lifted with the two settings below (currently disabled):
# pd.options.display.max_columns=None
# pd.options.display.max_rows=300
# Look at the first 100 Name values.
df.Name.head(100)

In [None]:
#Extract the function to express Sex more concretely
import re

def get_title(name):
    """Return the first run of letters that is directly followed by a period.

    For example 'Mr' in 'Braund, Mr. Owen Harris'. Returns '' when the name
    contains no such token.
    """
    match = re.search(r'([A-Za-z]+)\.', name)
    return match.group(1) if match else ''

# Extract the title token from every Name and show how often each one occurs.
df["Title"] = df["Name"].map(get_title)
df["Title"].value_counts()

In [None]:
# Collapse uncommon titles and spelling variants into a small set of groups.
title_groups = {
    "Rare": ["Rev","Dr","Col","Major","Lady","Sir","Don","Capt","Countess","Jonkheer","Dona"],
    "Miss": ["Ms","Mlle"],
    "Mrs": ["Mme"],
}
title_map = {raw: group for group, raws in title_groups.items() for raw in raws}
# Assign the result back: `df["Title"].replace(..., inplace=True)` modifies an
# intermediate Series, which pandas 2.x warns about and which no longer
# updates df once Copy-on-Write is enabled.
df["Title"] = df["Title"].replace(title_map)

sns.countplot(x='Title',hue='Survived',data=df)
plt.title('Relationship of Survived & Title',fontsize=14)
plt.grid(ls=':')
df.Title.value_counts()

In [None]:
# Drop the columns that are not used as model inputs.
delate_cols = ["PassengerId","Name","Cabin"]
df1 = df.drop(delate_cols,axis=1)

# Integer-encode the text columns on the combined frame, so train and test
# rows share the same code for the same value.
for c in df1.select_dtypes('object').columns:
    df1[c],_ = df1[c].factorize()

# df was built as train rows followed by test rows, so a positional split
# recovers the two parts.
train = df1[:len(train_df)]
test = df1[len(train_df):]

X = train.copy()
y = X.pop('Survived')
X_test = test.copy()
# Popped only to remove the column from X_test; presumably all-NaN, since the
# test rows carry no label (verify).
y_test = X_test.pop('Survived')

from sklearn.preprocessing import StandardScaler
cols = ["Age","Fare"]

scaler = StandardScaler()
X[cols] = scaler.fit_transform(X[cols])
# Reuse the statistics learned on the training rows. Calling fit_transform
# here would re-fit the scaler on the test set and put the two sets on
# different scales.
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


# Candidate models; each is given a fixed seed so repeated runs are comparable.
rfc = RandomForestClassifier(n_estimators=100,max_depth=5,random_state=40)
etc = ExtraTreesClassifier(n_estimators=100,max_depth=5,random_state=41)
# The decision tree was previously unseeded, so its result could vary between runs.
dtc = DecisionTreeClassifier(max_depth=5,random_state=42)
lgr = LogisticRegression(C=10)

# Stratified hold-out split used by the scoring cells below.
X_tr,X_val,y_tr,y_val = train_test_split(X,y,
                                        random_state=18,shuffle=True,stratify=y)
def score_dataset(model,X_tr,X_val,y_tr,y_val):
    """Fit ``model`` on the training split and print its validation accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``(X_tr, y_tr)``.
    X_tr, y_tr : training features and labels.
    X_val, y_val : validation features and labels.

    Returns
    -------
    None
        The accuracy is printed as a percentage.

    NOTE: this replaces the earlier ``score_dataset(X, y, model)`` defined
    higher up in the notebook; the two signatures are not compatible.
    """
    model.fit(X_tr,y_tr)
    pred_val = model.predict(X_val)
    score = accuracy_score(y_val,pred_val)
    # accuracy_score returns a fraction in [0, 1]; scale it so the number
    # matches the '%' sign (the original printed e.g. '0.830%').
    print('accuracy : {:.3f}%' .format(score * 100))

In [None]:
# Score every model on the same hold-out split.
# The '==dtc==' entry previously scored `etc` a second time, so the decision
# tree was never fitted or evaluated here.
for label, model in (('rfc', rfc), ('etc', etc), ('dtc', dtc), ('lgr', lgr)):
    print(f'=={label}==')
    score_dataset(model, X_tr, X_val, y_tr, y_val)

In [None]:
# Predict the test rows with the logistic regression and save them as a submission file.
test_pred = lgr.predict(X_test).astype(int)
sub["Survived"] = list(test_pred)
sub.to_csv('lgc.csv', index=False)

In [None]:
# Read the submission back from where it was written (the current working
# directory). The previous '../working/lgc.csv' only resolved when the notebook
# happened to run from a directory whose sibling is named 'working'.
sub_lgr = pd.read_csv('lgc.csv')
sub_lgr.head()

In [None]:
# Predict the test rows with the extra-trees model and save the submission.
y_pred_etc = etc.predict(X_test)
y_pred_etc = y_pred_etc.astype(int)
sub["Survived"] = y_pred_etc

sub.to_csv('sub_etc.csv',index=False)
# Read back from the path just written, instead of '../working/...', which
# assumed a particular working directory.
sub_etc = pd.read_csv('sub_etc.csv')

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=68)
for fold_id,(train_index,valid_index) in enumerate(skf.split(X,y)):
    # split() yields positional row numbers, so select with .iloc. The
    # previous .loc lookups only worked while the index labels were 0..n-1.
    X_tr = X.iloc[train_index]
    X_val = X.iloc[valid_index]
    y_tr = y.iloc[train_index]
    y_val = y.iloc[valid_index]

    # The positive-class rate should be nearly identical in every fold.
    print(f'\nfold_id:{fold_id}')
    print(f'y_tr y==1 rate:{sum(y_tr)/len(y_tr)}')
    print(f'y_val y==1 rate:{sum(y_val)/len(y_val)}')
# NOTE: after the loop X_tr / X_val / y_tr / y_val hold the LAST fold only.

In [None]:
# Sizes of the train / validation parts currently held in X_tr, X_val, y_tr, y_val.
len(X_tr),len(X_val),len(y_tr),len(y_val)

In [None]:
# Score each model on the X_tr / X_val split currently in scope.
# NOTE(review): if these still come from the fold loop, only its last fold is
# evaluated here, not an average over folds.
for tag, estimator in (('rfc', rfc), ('etc', etc), ('dtc', dtc), ('lgr', lgr)):
    print(f'\n=={tag}_skf==')
    score_dataset(estimator, X_tr, X_val, y_tr, y_val)

In [None]:
# Predict the test rows with the random forest and save the submission.
y_pred = rfc.predict(X_test)
y_pred = y_pred.astype(int)
sub["Survived"] = list(y_pred)
# Use a '.csv' name like the other submission files, and read back from the
# same path that was written. Previously the file had no extension and was
# re-read through '../working/'.
sub.to_csv('sub_rfc_skf.csv',index=False)
sub_rfc_skf = pd.read_csv('sub_rfc_skf.csv')
sub_rfc_skf.head()

In [None]:
# Frequency of each Cabin value.
df.Cabin.value_counts()

In [None]:
#Let's try predict Cabin!
# Work on an independent copy: writing the encoded Sex column into a slice of
# df triggers SettingWithCopyWarning and may not take effect.
cabin_df = df[["Cabin","Pclass","Sex","SibSp","Parch"]].copy()
# Explicit mapping instead of replace(), whose implicit downcasting from
# object to int is deprecated in recent pandas.
cabin_df["Sex"] = cabin_df["Sex"].map({"male": 0, "female": 1})
known_cabin = cabin_df[cabin_df.Cabin.notnull()].values
unknown_cabin = cabin_df[cabin_df.Cabin.isnull()].values

# NOTE: this rebinds the notebook-level X / y / X_test and refits `rfc` with
# Cabin as the target, so the previously fitted survival model is overwritten.
y = known_cabin[:,0]
X = known_cabin[:,1:]
X_test = unknown_cabin[:,1:]

rfc.fit(X,y)

missing_cabin = df.Cabin.isnull()
# Skip prediction when nothing is missing (e.g. the cell is run a second time).
if len(X_test):
    pred = rfc.predict(X_test)
    # The original had a bare `len(...), len(pred)` tuple here that did
    # nothing; make the length check an actual assertion.
    assert missing_cabin.sum() == len(pred)
    df.loc[missing_cabin,'Cabin'] = pred
df.Cabin.value_counts()

In [None]:
def get_area(area):
    """Return the area (deck) letter of a cabin string, or '' if there is none.

    A capital letter directly followed by digits is preferred ('C85' -> 'C',
    'F G73' -> 'G'), exactly as before. When no such pair exists, the first
    capital letter is used, so deck-only values such as 'T' or 'D' no longer
    come back blank. Non-string input (e.g. NaN) yields ''.
    """
    if not isinstance(area, str):
        return ''
    area_search = re.search(r'([A-Z])\d+',area)
    if area_search is None:
        # Fallback for cabins recorded as a bare deck letter.
        area_search = re.search(r'([A-Z])',area)
    if area_search:
        return area_search.group(1)
    return ''

# Area letter extracted from every Cabin value.
df["Area"] = df["Cabin"].map(get_area)

def get_room(room):
    """Return the digits that follow the first capital letter + number pair.

    For example '85' for 'C85'. The result is a string; '' is returned when
    the value contains no such pair.
    """
    match = re.search(r'[A-Z](\d+)', room)
    return match.group(1) if match else ''

# Room number (as a string) extracted from every Cabin value.
df["room"] = df["Cabin"].map(get_room)

In [None]:
# Outcome counts per extracted Area value.
sns.countplot(x='Area',hue='Survived',data=df)

In [None]:
# Number of rows per Area value.
df.Area.value_counts()

In [None]:
# Widen the pandas display limits: no column limit, up to 300 rows.
# NOTE(review): this is a global setting and affects every later cell.
pd.options.display.max_columns=None
pd.options.display.max_rows=300

In [None]:
# Inspect the first 20 rows of the combined frame.
df[:20]

In [None]:
# The Area extraction is imperfect: some rows come out blank
# (get_area returns '' when its pattern does not match).
df.Area.head(100)

In [None]:
# Columns to drop for the second modelling round (Cabin is kept this time).
delate_cols2 = ['PassengerId', 'Name']

In [None]:
# Second modelling frame: df without the columns listed in delate_cols2.
df2 = df.drop(delate_cols2,axis=1)
df2.head()

In [None]:
# Positional split: the first len(train_df) rows go to train, the rest to test.
# Assumes df2 keeps the row order of df (train rows first) - TODO confirm.
train = df2[:len(train_df)]
test = df2[len(train_df):]

train.head(3)

In [None]:
# First rows of the test part.
test.head(3)

In [None]:
X = train.copy()
y = X.pop('Survived')

X_test = test.copy()
# Popped only to remove the column from X_test.
y_test = X_test.pop('Survived')

# Encode each text column with ONE shared code table for train and test.
# Factorizing the two frames separately numbers values by order of first
# appearance in each frame, so the same value could get different codes in
# X and X_test.
for c in X.select_dtypes('object').columns:
    codes,_ = pd.concat([X[c], X_test[c]]).factorize()
    # Training rows come first in the concatenation, so their codes are the
    # same as a train-only factorize would give.
    X[c] = codes[:len(X)]
    X_test[c] = codes[len(X):]

In [None]:
# Check the encoded training features.
X.head()

In [None]:
# Re-attach the target so it shows up in the correlation matrix.
XX = X.assign(Survived=y)

plt.figure(figsize=(18, 6))
corr_matrix = XX.corr().round(3)
sns.heatmap(corr_matrix, annot=True, fmt='g', cmap='Blues')
plt.title('Correlation of each featuers', fontsize=14)

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(14, 10))

# One swarm plot of Fare per categorical column, laid out on the 2x2 grid.
grid_layout = {
    (0, 0): 'Embarked',
    (0, 1): 'Sex',
    (1, 0): 'Title',
    (1, 1): 'Area',
}
for position, category in grid_layout.items():
    sns.swarmplot(x='Fare', y=category, data=df2, ax=ax[position])

ax[0, 0].set_title('Fare & Embarked', fontsize=14)
ax[0, 1].set_title('Fare & Sex', fontsize=14)
ax[1, 0].set_title('Fare & Title', fontsize=14)
ax[1, 1].set_title('Fare & Area', fontsize=14)

In [None]:
# Boolean mask over X's columns: True where the column dtype equals int.
discrete_features = X.dtypes==int

In [None]:
# Mutual-information scores for the engineered feature set.
mi_scores = make_mi_score(X,y,discrete_features)
mi_df = pd.DataFrame(mi_scores)

# Refit the forest on the current X / y and read its feature importances.
rfc.fit(X,y)
fi = rfc.feature_importances_

# Sort by importance (descending) and keep at most the top 19 entries.
idx = np.argsort(fi)[::-1]
top_cols,top_importances = X.columns.values[idx][:19],fi[idx][:19]
# NOTE(review): bare expression in the middle of a cell - nothing is displayed.
top_cols,top_importances

fig,ax = plt.subplots(1,2,figsize=(18,5),dpi=100)
# Drawn through the pyplot state machine, so it lands on the current axes -
# presumably the right-hand panel created just above (verify).
plot_mi_score(mi_scores)

sns.barplot(x=top_importances,y=top_cols,ax=ax[0])
ax[0].set_title('rfc featuer importances',fontsize=14)

# Importances indexed by feature name, joined onto the MI table for display.
fi_df = pd.DataFrame(top_importances,top_cols)
fi_df = fi_df.rename(columns={0:"importance"})
mi_df.join(fi_df)

In [None]:
skf =StratifiedKFold(n_splits=5,shuffle=True,random_state=68)

for fold_id,(train_index,valid_index) in enumerate(skf.split(X,y)):
    # split() yields positional row numbers, so select with .iloc. The
    # previous .loc / [] lookups were label-based and only worked while the
    # index labels were 0..n-1.
    X_tr = X.iloc[train_index]
    X_val = X.iloc[valid_index]
    y_tr = y.iloc[train_index]
    y_val = y.iloc[valid_index]
# NOTE: after the loop X_tr / X_val / y_tr / y_val hold the LAST fold only.

In [None]:
# Fit the extra-trees model on the X_tr / y_tr currently in scope and report
# accuracy (a fraction between 0 and 1) on X_val / y_val.
etc.fit(X_tr,y_tr)
pred_val = etc.predict(X_val)
accuracy_score(y_val,pred_val)

In [None]:
# Predict the test rows with the extra-trees model and save the submission.
y_pred = etc.predict(X_test)
sub["Survived"] = list(y_pred.astype(int))
sub.to_csv('sub_etc_skf.csv',index=False)
# Read back from the path just written, instead of '../working/...', which
# assumed a particular working directory.
sub_etc_skf = pd.read_csv('sub_etc_skf.csv')
sub_etc_skf