In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import random

In [168]:
train = pd.read_csv("train.csv", index_col="PassengerId")
train_no_survived = train.drop("Survived", axis=1);

submission_dataset = pd.read_csv("test.csv") 

dataset = train_no_survived.append(submission_dataset.drop("PassengerId", axis=1))

title_list = ["Mr.", "Master.", "Mrs.", "Miss."]
    
    # http://grammarist.com/usage/mr-mrs-ms-and-miss/
def map_names(name):
    for title in title_list:
        if title in name:
            return title
    return "Unknown"
            
dataset["Name"] = dataset["Name"].map(map_names)
    
def slice_and_get_mean_age_by_name(name):
    return dataset.loc[dataset["Name"] == name, "Age"].mean()
    
age_means = {k:slice_and_get_mean_age_by_name(k) for k in ["Mr.", "Master.", "Mrs.", "Miss.", "Unknown"]}


def fill_age(row):
    if np.isnan(row[3]):
        row[3] = age_means[row[1]]
    return row

dataset = dataset.apply(fill_age, axis=1)

dataset["Fare"] = dataset["Fare"].fillna(dataset["Fare"].mean())

cabin_list = list("ABCDEFTG")
def map_cabin(cabin):
    return str(cabin)
    
dataset["Cabin"] = dataset["Cabin"].fillna("")
    
towns = list("CQS")
dataset["Embarked"] = dataset["Embarked"].fillna(random.choice(towns))

categorical_columns = ["Embarked", "Sex", "Cabin", "Name"];

encoders = {col: LabelEncoder().fit(dataset[col]) for col in categorical_columns}

def encode_categorical(data, columns, encoders):
    return pd.DataFrame({col: encoders[col].transform(data[col]) for col in columns},
                        index = data.index)

encoded = encode_categorical(dataset, categorical_columns , encoders)

one_hot_encoder = OneHotEncoder().fit(encoded)


train_data_cleaned = dataset[0:-418]

test_data_cleaned_for_submission = dataset.tail(418)


In [181]:
from sklearn.model_selection import train_test_split
train_X , test_X, train_y, test_y = train_test_split(train_data_cleaned, train["Survived"] , random_state=26)

one_hot_X_train = one_hot_encoder.transform(encode_categorical(train_X[categorical_columns], categorical_columns, encoders))
one_hot_X_test = one_hot_encoder.transform(encode_categorical(test_X[categorical_columns], categorical_columns, encoders))

print("{} {}".format(train_X[numeric_columns].shape, test_X[numeric_columns].shape))
print("{} {}".format(one_hot_X_train.shape, one_hot_X_test.shape))

numeric_columns = ["Pclass", "Age", "Fare", "SibSp", "Parch"]

# train_X["Family"] = train_X["SibSp"] + train_X["Parch"]
# train_X = train_X.drop(["SibSp", "Parch"], axis=1)

# test_X["Family"] = test_X["SibSp"] + test_X["Parch"]
# test_X = test_X.drop(["SibSp", "Parch"], axis=1)




train_X = np.concatenate([train_X[numeric_columns], one_hot_X_train.toarray()], axis=1);
test_X = np.concatenate([test_X[numeric_columns], one_hot_X_test.toarray()], axis=1)

print("{} {}".format(train_X.shape, test_X.shape))



(668, 3) (223, 3)
(668, 197) (223, 197)
(668, 202) (223, 202)


In [197]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_X, train_y)

def print_score(train, test):
    print("Train score is {}".format(train))
    print("Test score is {}".format(test))
    print("")

print_score(model.score(train_X, train_y), model.score(test_X, test_y))
#alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
alphas = [100]
for alpha in alphas:
    
    from sklearn.linear_model import RidgeClassifier
    model2 = RidgeClassifier(alpha)
    model2.fit(train_X, train_y)
    print("using alpha = {}".format(alpha))
    print("Train score is {}".format(model2.score(train_X, train_y)))
    print("Test score is {}".format(model2.score(test_X, test_y)))
    print("")

Train score is 0.8562874251497006
Test score is 0.8340807174887892

using alpha = 100
Train score is 0.8233532934131736
Test score is 0.8161434977578476



In [176]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(train_X, train_y)

print_score(knn_model.score(train_X, train_y), knn_model.score(test_X, test_y))

Train score is 0.8098802395209581
Test score is 0.7040358744394619



In [177]:
from sklearn.ensemble import RandomForestClassifier

random_f_model = RandomForestClassifier(random_state=0)
random_f_model.fit(train_X, train_y)

print_score(random_f_model.score(train_X, train_y), random_f_model.score(test_X, test_y))

Train score is 0.9730538922155688
Test score is 0.7982062780269058



In [194]:
one_hot_test_data = one_hot_encoder.transform(encode_categorical(test_data_cleaned_for_submission[categorical_columns], categorical_columns, encoders))

test_data_processed = np.concatenate([test_data_cleaned_for_submission[numeric_columns], one_hot_test_data.toarray()], axis=1);

#prediction = random_f_model.predict(test_data_processed)
#prediction = model2.predict(test_data_processed)
prediction = model.predict(test_data_processed)

def save(prediction):
    df = pd.DataFrame(data={"Survived": prediction}, index=submission_dataset["PassengerId"])
    df.to_csv("with_data_cleaning.csv")

save(prediction)

In [72]:
# dataset = pd.read_csv("train.csv", index_col="PassengerId")


# print("Initial dataset shape:{}".format(dataset.shape))
# #print(dataset.head())
# def pipe(dataset):
    
#     y = dataset["Survived"]
#     dataset = dataset.drop(["Survived"], axis=1)
#     x = pretify(dataset, False)
#     return (x,y)

# ohe = OneHotEncoder()
# le = LabelEncoder()



# def pretify(dataset, prod=True):
#     title_list = ["Mr.", "Master.", "Mrs.", "Miss."]
    
#     # http://grammarist.com/usage/mr-mrs-ms-and-miss/
#     def map_names(name):
#         for title in title_list:
#             if title in name:
#                 return title
#         return "Unknown"
            
#     dataset["Name"] = dataset["Name"].map(map_names)
    
#     print(dataset["Name"].value_counts())
#     def slice_and_get_mean_age_by_name(name):
#         return dataset.loc[dataset["Name"] == name, "Age"].mean()
    
#     age_means = {k:slice_and_get_mean_age_by_name(k) for k in ["Mr.", "Master.", "Mrs.", "Miss.", "Unknown"]}
    
#     def fill_age(row):
#         if np.isnan(row[3]):
#             row[3] = age_means[row[1]]
#         return row
        
#     dataset = dataset.apply(fill_age, axis=1)
    
#     x = dataset.drop(["Cabin", "Embarked","Sex", "Ticket", "Name"], axis=1)
    

    
#     ix=[i for i in dataset.columns if i in ["Embarked", "Sex", "Cabin", "Name"]]
    
#     x_cat = dataset[ix];
    
#     cabin_list = list("ABCDEFTG")
#     def map_cabin(cabin):
#         return str(cabin)
    
#     x_cat["Cabin"] = x_cat["Cabin"].map(map_cabin)
    
#     towns = list("CQS")
#     x_cat["Embarked"] = x_cat["Embarked"].fillna(random.choice(towns))
    
#     x_cat = x_cat.apply(le.fit_transform)
    
# #     if not prod:
# #         x_cat.apply((lambda col: le.fit(col)))
        
# #     x_cat = x_cat.apply(le.transform)
    
#     if not prod:
#         ohe.fit(x_cat)
    
#     x_cat_encoded = ohe.transform(x_cat)
#     x = np.column_stack((x, x_cat_encoded.toarray()))
    
    
#     return x

# dataset_X, dataset_y = pipe(dataset)
# print(dataset_X.shape)