In [None]:
# Base 
# -----------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Missing Values 
# -----------------------------------
# !pip install missingno
import missingno as msno

# Models 
# -----------------------------------
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

# Metrics & Evaluation
# -----------------------------------
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, classification_report

# Configuration
# -----------------------------------
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format

- <code>Survived</code> - **Target: survival** (0 = No, 1 = Yes)
- <code>Pclass</code> - **Ticket class** (1 = 1st, 2 = 2nd, 3 = 3rd)
- <code>Sex</code> - **Sex**
- <code>Age</code> - **Age in years**
- <code>SibSp</code> - **# of siblings / spouses aboard the Titanic**
- <code>Parch</code> - **# of parents / children aboard the Titanic**
- <code>Ticket</code> - **Ticket number**
- <code>Fare</code> - **Passenger fare**
- <code>Cabin</code> - **Cabin number**
- <code>Embarked</code> - **Port of Embarkation** (C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
# Load the labelled training set and the unlabelled test set.
tr = pd.read_csv("../input/titanic/train.csv")
ts = pd.read_csv("../input/titanic/test.csv")

# Stack train on top of test so that cleaning and feature engineering are applied
# to both at once. `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
# supported equivalent. `ignore_index=True` gives every row a unique label instead
# of repeating 0..N for the test rows.
df = pd.concat([tr, ts], ignore_index=True)

# Report how many columns of each dtype family the combined frame has.
for i in ["object", "float", "integer", "bool"]:
    dtype_columns = df.select_dtypes(i).columns.tolist()
    print(i.capitalize()+" Variables:", "\n", "# of Variables:",
      len(dtype_columns), "\n",
      dtype_columns, "\n")

df.shape, tr.shape, ts.shape

In [None]:
# Preview the first rows of the combined train + test frame.
df.head()

In [None]:
# Summary statistics with extra tail percentiles (1%..10% and 80%..99%),
# transposed so that each variable is shown as a row.
df.describe([0.01, 0.05, 0.10, 0.80, 0.90, 0.95, 0.99]).T

In [None]:
# Class balance of the target.
# The combined frame also holds the test rows, whose `Survived` is missing, so the
# shares are computed over labelled rows only (dividing by df.shape[0] would
# understate both classes). Two separate print calls avoid the stray `(None, None)`
# tuple the original one-liner displayed as cell output.
survived_counts = df["Survived"].value_counts()
print(survived_counts)
print(survived_counts / survived_counts.sum())

In [None]:
# True when at least one cell anywhere in the frame is missing.
df.isnull().values.any()

In [None]:
# missingno heatmap of the frame's missing-value pattern
# (presumably pairwise nullity correlation between columns; see the missingno docs).
msno.heatmap(df)
plt.show()

In [None]:
def missing_values(data, plot = False, target = "SalePrice"):
    """Print a summary of the columns of ``data`` that contain missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. Columns that are (almost) entirely missing
        (missing ratio > 0.99) are dropped from it IN PLACE.
    plot : bool, default False
        When True, also draw a bar chart of the missing ratio per feature.
    target : str, default "SalePrice"
        Column excluded from the summary (typically the label column).

    Returns
    -------
    None
        The summary table is printed, not returned.
    """
    # Everything is computed from the `data` argument; the original body read the
    # notebook-global `df`, so the function silently ignored what it was given.
    n_missing = data.isnull().sum()
    mst = pd.DataFrame({"Num_Missing": n_missing,
                        "Missing_Ratio": n_missing / data.shape[0]}).sort_values("Num_Missing", ascending = False)
    mst["DataTypes"] = data[mst.index].dtypes.values
    # rename_axis makes the feature column name explicit instead of relying on
    # reset_index producing a column called "index".
    mst = mst[mst.Num_Missing > 0].rename_axis("Feature").reset_index()
    mst = mst[mst.Feature != target]

    print("Number of Variables include Missing Values:", mst.shape[0], "\n")

    fully_missing = mst[mst.Missing_Ratio > 0.99].Feature.tolist()
    if len(fully_missing) > 0:
        print("Full Missing Variables:", fully_missing)
        data.drop(fully_missing, axis = 1, inplace = True)

        print("Full missing variables are deleted!", "\n")

    if plot:
        plt.figure(figsize = (25, 8))
        # x / y are passed by keyword: recent seaborn releases no longer accept
        # them positionally.
        p = sns.barplot(x = mst.Feature, y = mst.Missing_Ratio)
        for rotate in p.get_xticklabels():
            rotate.set_rotation(90)

    print(mst, "\n")
    
# Summarise missing values for the combined frame, with the bar chart enabled and
# the label column `Survived` left out of the summary.
missing_values(df, plot = True, target = "Survived")

In [None]:
def missing_vs_target(dataframe, target):
    """Print the mean of ``target`` split by whether each feature is missing.

    For every non-target column with at least one missing value, a 0/1
    ``<column>_NA_FLAG`` indicator is added to a private copy of ``dataframe``
    (the input is left untouched). For each column whose name contains
    ``_NA_`` a one-column table ``TARGET_MEAN`` grouped by that flag is printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    target : str
        Name of the target column.
    """
    flagged = dataframe.copy()

    # Missing-value count per feature, target excluded.
    na_counts = flagged.drop(target, axis = 1).isnull().sum()
    features_with_na = na_counts[na_counts > 0].index.tolist()

    for feature in features_with_na:
        flagged[feature + '_NA_FLAG'] = flagged[feature].isnull().astype(int)

    flag_columns = flagged.columns[flagged.columns.str.contains("_NA_")]

    for flag in flag_columns:
        target_mean = flagged.groupby(flag)[target].mean()
        print(target_mean.to_frame("TARGET_MEAN"), end="\n\n\n")

        
# Compare the mean of `Survived` between rows where a feature is missing and rows where it is present.
missing_vs_target(df, "Survived")

In [None]:
def num_plot(data, cat_length = 16, remove = ["Id"], hist_bins = 12, figsize = (20,4)):
    """Draw a histogram, a boxplot and a density curve for each numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
    cat_length : int, default 16
        A non-object column is treated as numeric only when it has at least this
        many distinct values (NaN counts as one value).
    remove : iterable of column names, default ["Id"]
        Columns to skip (e.g. identifiers). Not modified.
    hist_bins : int, default 12
        Number of histogram bins.
    figsize : tuple, default (20, 4)
        Size of each three-panel figure.

    Returns
    -------
    None
        One figure per selected column is shown with ``plt.show()``.
    """
    excluded = set(remove)
    # Filtering in a comprehension keeps the columns in frame order; the previous
    # set difference produced the plots in an arbitrary order.
    num_cols = [col for col in data.columns
                if data[col].dtypes != "O"
                and len(data[col].unique()) >= cat_length
                and col not in excluded]

    for col in num_cols:
        fig, axes = plt.subplots(1, 3, figsize = figsize)
        data.hist(str(col), bins = hist_bins, ax=axes[0])
        data.boxplot(str(col),  ax=axes[1], vert=False)
        try:
            # Draw on the third panel explicitly instead of relying on it being
            # matplotlib's "current" axes; NaNs are dropped before estimation.
            sns.kdeplot(np.array(data[str(col)].dropna()), ax=axes[2])
        except Exception as err:
            # Best effort: a failed density estimate must not abort the remaining
            # plots, but it is reported instead of being silently swallowed.
            print("Density plot skipped for", col, ":", err)

        axes[1].set_yticklabels([])
        axes[1].set_yticks([])
        # str() keeps the titles working for non-string column labels.
        axes[0].set_title(str(col) + " | Histogram")
        axes[1].set_title(str(col) + " | Boxplot")
        axes[2].set_title(str(col) + " | Density")
        plt.show()
        
        
# Plot every numeric column with at least 16 distinct values, skipping the PassengerId identifier.
num_plot(df, cat_length = 16, remove = ["PassengerId"], hist_bins = 10, figsize = (20,4))

In [None]:
def cat_eda(data, cat_length, target = "Survived"):
    """Print a per-category summary for every low-cardinality column.

    A column is summarised when it has fewer than ``cat_length`` distinct values
    (NaN counts as one value). For each such column the category counts, their
    share of all rows, and the mean / count / standard deviation of ``target``
    per category are printed, followed by a dtype breakdown of those columns and
    a listing of the columns that are neither low-cardinality nor numeric.

    Parameters
    ----------
    data : pandas.DataFrame
    cat_length : int
        Cardinality threshold separating categorical-like from numeric columns.
    target : str, default "Survived"
        Name of the target column.
    """
    frame = data.copy()

    low_card_cols = []
    for name in frame.columns:
        if len(frame[name].unique()) < cat_length:
            low_card_cols.append(name)

    numeric_cols = []
    for name in data.columns:
        if data[name].dtypes != "O" and len(data[name].unique()) >= cat_length:
            numeric_cols.append(name)

    for name in low_card_cols:
        counts = frame[name].value_counts()
        by_category = frame.groupby(name)[target]

        print(name, ":", len(counts), "Unique Category -", str(frame[name].dtype))
        summary = pd.DataFrame({"COUNT": counts,
                                "RATIO": counts / len(frame),
                                "TARGET_MEAN": by_category.mean(),
                                "TARGET_COUNT": by_category.count(),
                                "TARGET_STD": by_category.std()})
        print(summary, end="\n\n\n")

    low_card_frame = frame[low_card_cols]
    print("# DTYPES -----------------------------")
    print("Object Variables:", low_card_frame.select_dtypes("object").columns.tolist(), "\n")
    print("Integer Variables:", low_card_frame.select_dtypes("integer").columns.tolist(), "\n")
    print("Float Variables:", low_card_frame.select_dtypes("float").columns.tolist(), "\n")

    # Whatever is left is high-cardinality and non-numeric.
    leftovers = frame.drop(low_card_cols + numeric_cols, axis = 1)
    print("# OTHER -----------------------------")
    print("More than " + str(cat_length)+" categories:", leftovers.columns, "\n")
    print(leftovers.nunique())

# Summarise every column with fewer than 10 distinct values against `Survived`.
cat_eda(df, cat_length=10, target = "Survived")


In [None]:
# Rows with no Embarked value, together with any row whose Ticket is "113572".
df[(df.Embarked.isnull()) | (df.Ticket == "113572")]

In [None]:
# Rows with no Fare value.
df[(df.Fare.isnull())]

In [None]:
# Embarked: fill with the most frequent port.
# Mode: S
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# Fare: fill with the median fare of the passenger's ticket class.
# All data median = 14, Pclass = 3 median = 8
df["Fare"] = df["Fare"].fillna(df.groupby("Pclass")["Fare"].transform("median"))


# Title: the word that precedes a "." in the name (e.g. "Mr", "Mrs", "Miss").
# Raw string: "\." in a normal string literal is an invalid escape sequence and
# triggers a SyntaxWarning on recent Python versions. The pattern is unchanged.
df['NEW_TITLE'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# Age: fill with the mean age of passengers sharing the same title.
df["Age"] = df["Age"].fillna(df.groupby("NEW_TITLE")["Age"].transform("mean"))
# Fallback for rows whose title has no known age (or no title at all): the later
# integer age binning cannot handle NaN, so fall back to the overall median.
df["Age"] = df["Age"].fillna(df["Age"].median())

In [None]:
# CABIN FLAG: 1 when a cabin is recorded for the passenger, 0 when it is missing.
# (Computed once; the original cell repeated this assignment further down.)
df["NEW_CABIN_BOOL"] = df["Cabin"].notnull().astype('int')

# Family: 1 when the passenger travels without siblings/spouse/parents/children.
df.loc[((df['SibSp'] + df['Parch']) > 0), "NEW_IS_ALONE"] = 0
df.loc[((df['SibSp'] + df['Parch']) == 0), "NEW_IS_ALONE"] = 1

# NUMERIC TO CATEGORICAL
df.loc[(df['Age'] < 18), 'NEW_AGE_CAT'] = 'young'
df.loc[(df['Age'] >= 18) & (df['Age'] < 56), 'NEW_AGE_CAT'] = 'mature'
df.loc[(df['Age'] >= 56), 'NEW_AGE_CAT'] = 'senior'

# Eight equal-width age bins labelled 1..8.
df["AGECAT2"] = pd.cut(df.Age, 8, labels = np.arange(1,9,1)).astype(int)

# INTERACTIONS

df["FAMILY_SIZE"] = df["SibSp"] + df["Parch"] + 1
df["NEW_AGExPCLASS"] = df["Age"] * df["Pclass"]

# Sex x age band.
# The "mature" masks previously read `((Age > 21) & (Age)) < 50`: the comparison
# with 50 was applied to a boolean, so it was always true and every passenger of
# that sex was labelled mature, overwriting the young label set just before.
# The upper bound is inclusive so that Age == 50 also falls in a band.
df.loc[(df['Sex'] == 'male') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'
df.loc[(df['Sex'] == 'male') & (df['Age'] > 21) & (df['Age'] <= 50), 'NEW_SEX_CAT'] = 'maturemale'
df.loc[(df['Sex'] == 'male') & (df['Age'] > 50), 'NEW_SEX_CAT'] = 'seniormale'
df.loc[(df['Sex'] == 'female') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'
df.loc[(df['Sex'] == 'female') & (df['Age'] > 21) & (df['Age'] <= 50), 'NEW_SEX_CAT'] = 'maturefemale'
df.loc[(df['Sex'] == 'female') & (df['Age'] > 50), 'NEW_SEX_CAT'] = 'seniorfemale'


# CABIN CODE: first capital letter found in the cabin string, "Unknown" when the
# cabin is missing or contains no capital letter. Vectorised replacement for the
# row-by-row regex loop that relied on a bare `except`.
df["CabinCode"] = df["Cabin"].str.extract(r"([A-Z])", expand=False).fillna("Unknown")


# FARE
# Flag (near-)free tickets.
df["Fare_0"] = np.where(df.Fare < 1, 1, 0)

# Ten equal-width fare bins labelled 1..10.
df["FareClass"]= pd.cut(df.Fare, 10, labels = np.arange(1,11,1)).astype(int)

# Pclass / Fare (the +1 avoids division by zero for free tickets)
df["NewClass"] = df.Pclass / (df.Fare + 1)


# DROP: the raw name is no longer needed once the title has been extracted.
df.drop(["Name"], axis = 1, inplace = True)


# Missing cabins become an explicit "Unknown" category before encoding.
df["Cabin"] = df.Cabin.fillna("Unknown")

In [None]:
# LABEL ENCODER
# Replace each listed column with integer codes. The same encoder object is
# re-fitted for every column, so after the loop `le` only holds the mapping of
# the last column ("CabinCode").
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

for i in ["Sex", "Ticket", "Cabin", "CabinCode"]:
    df[i] = le.fit_transform(df[i])
    
# ONE-HOT ENCODER
# Expand the listed columns into indicator columns; drop_first=True omits the
# first level of each to avoid a redundant column.
df = pd.get_dummies(df, columns=["Embarked", "Pclass","NEW_TITLE", "NEW_AGE_CAT", "NEW_SEX_CAT"], drop_first=True)

In [None]:
# Re-run the low-cardinality summary on the engineered and encoded frame.
cat_eda(df, cat_length=10, target = "Survived")

In [None]:
# Train Test Split After Data Manipulation
# Training rows are the ones that carry a label; test rows have a missing
# `Survived`. Splitting on the label avoids hard-coding the training-set size.
# Both frames are built as new objects (not views of `df`), so assigning to them
# below is not a chained assignment on a slice.
train = df[df["Survived"].notna()].drop("PassengerId", axis = 1)
train["Survived"] = train["Survived"].astype("int")

# PassengerId is kept on the test frame because the submission file needs it.
test = df[df["Survived"].isna()].drop("Survived", axis = 1)

X_train = train.drop("Survived", axis = 1)
y_train = train.Survived


# All Models & Train Validation Scores
models = [('LR', LogisticRegression()),
          ("NBAYES", GaussianNB()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ("BAGGING", BaggingClassifier()),
          ('RF', RandomForestClassifier()),
          ('SVM', SVC(gamma='auto')),
          ("ADAB",AdaBoostClassifier()),
          ('GBM', GradientBoostingClassifier()),
          ("XGB", XGBClassifier()),
          ("LGBM", LGBMClassifier())]

# One splitter shared by every model so that all of them are scored on the same
# folds. `random_state` only has an effect together with `shuffle=True`; current
# scikit-learn raises a ValueError when it is given without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=123456)

# evaluate each model in turn
results = []
names = []

print("# Algorithm Comparison")
print("-----------------------------------------")
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    # mean accuracy over the folds, standard deviation in parentheses
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

print("\n")

# boxplot algorithm comparison: one box of fold accuracies per model
fig = plt.figure(figsize=(15, 10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Base gradient boosting classifier used by the hyper-parameter search below.
# The seed is fixed; every other setting is left at the library default.
gbm = GradientBoostingClassifier(
    random_state=1
)
gbm

In [None]:
# Search grid: 13 values of max_depth (3..15) x 9 values of max_leaf_nodes (2..10).
params = {
    #"ccp_alpha":np.arange(0.0, 1.1, 0.1),
    "max_depth":np.arange(3, 16, 1),
    "max_leaf_nodes":np.arange(2, 11, 1)
}


from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Exhaustive search over the grid, scored by accuracy with 10-fold cross-validation,
# using all available cores (n_jobs=-1).
gs = GridSearchCV(gbm, params,  n_jobs=-1, cv = 10, scoring = "accuracy")
# fit() returns the fitted search object itself, so gs_gbm and gs are the same object.
gs_gbm = gs.fit(X_train, y_train)

gs_gbm

In [None]:
# NOTE(review): this report is computed on X_train, the same rows the search was
# fitted on, so it is an in-sample figure rather than an estimate of test performance.
print(classification_report(y_train, gs_gbm.predict(X_train)))

In [None]:
# Build the submission: one predicted label per test passenger.
# The test features are selected with the training column list so that the model
# receives exactly the columns it was fitted on, in the same order.
sub = pd.DataFrame({"PassengerId":test.PassengerId, "Survived":gs_gbm.predict(test[X_train.columns])})
# index=False (the documented boolean) keeps the row index out of the file.
sub.to_csv("submission.csv", index = False)

In [None]:
# Preview the first 20 rows of the submission frame.
sub.head(20)