In [1091]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
sns.set() #make the graphs prettier

In [1092]:
test = pd.read_csv("./input/test.csv")
train = pd.read_csv("./input/train.csv")

data_cleaner = [train, test]

In [1093]:
age_ref = pd.DataFrame(data=[train.groupby('Pclass')['Age'].mean()],columns=train['Pclass'].unique())
age_ref

Unnamed: 0,3,1,2
Age,25.14062,38.233441,29.87763


In [1094]:
def fill_age(pclass,age):
    if pd.isnull(age):
        return float(age_ref[pclass])
    else:
        return age

for data in data_cleaner:
    data['Age'] = train.apply(lambda x: fill_age(x['Pclass'],x['Age']), axis=1)

In [1095]:
def fill_fare(fare):
    if pd.isnull(fare):
        return train['Fare'].mean()
    else:
        return fare
    
def fill_embark(embark):
    if pd.isnull(embark):
        return train['Embarked'].mode().iloc[0]
    else:
        return embark
    
for data in data_cleaner:
    data['Fare'] = train.apply(lambda x: fill_fare(x['Fare']), axis=1)
    data['Embarked'] = train.apply(lambda x: fill_embark(x['Embarked']), axis=1)

In [1096]:
def setCabin(val):
    result = 0
    if pd.isna(val): result = 0
    else: result = 1
    return result

train["HasCabin"] = train["Cabin"].apply(setCabin)
test["HasCabin"] = test["Cabin"].apply(setCabin)

In [1097]:
for data in data_cleaner:
    data.drop(['Cabin'],axis=1,inplace=True)

In [1098]:
title_list = list()
for data in data_cleaner:
    for title in data['Name']:
        title = title.split('.')[0].split(',')[1]
        title_list.append(title)
    
    data['Title'] = title_list
    title_list = list()

In [1099]:
train['Title'] = train['Title'].replace([ ' Don', ' Rev', ' Dr', ' Mme',' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer'], 'Others')
train['Title'].value_counts()

 Mr        517
 Miss      182
 Mrs       125
 Master     40
Others      27
Name: Title, dtype: int64

In [1100]:
test['Title'] = test['Title'].replace([ ' Don', ' Rev', ' Dr', ' Mme',' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer',' Dona'], 'Others')
test['Title'].value_counts()

 Mr        240
 Miss       78
 Mrs        72
 Master     21
Others       7
Name: Title, dtype: int64

In [1101]:
def get_size(df):
    if df['SibSp'] + df['Parch'] + 1 == 1:
        return 'Single'
    if df['SibSp'] + df['Parch'] + 1 > 1:
        return 'Small'
    if df['SibSp'] + df['Parch'] + 1 > 4:
        return 'Big'
    
for data in data_cleaner:
    data['FamilySize'] = data.apply(get_size,axis=1)

for data in data_cleaner:
    data['IsAlone'] = 1 
    data['IsAlone'].loc[data['FamilySize'] != 'Single'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [1102]:
# train_age_yes = train[train["Age"].notna()]
# train_age_no = train[train["Age"].isna()]

In [1103]:
# train_age_yes.corr()

In [1104]:
# x = train_age_yes[["Pclass", "IsAlone"]]
# y = train_age_yes["Age"]

In [1105]:
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import KFold, cross_val_score
# from sklearn.metrics import mean_squared_error, make_scorer
# score = make_scorer(mean_squared_error)
# k_fold = KFold(n_splits=20, shuffle=True, random_state=42)

In [1106]:
# lr = LinearRegression()
# lr.fit(x, y)
# scoring = "mean_squared_error"
# score = cross_val_score(lr, x, y, cv=k_fold, n_jobs=1, scoring=score)
# print("LinearRegression: {0: .4f}".format(round(np.mean(np.sqrt(score)))))

In [1107]:
# y_predict = lr.predict(train_age_no[["Pclass", "IsAlone"]])
# train_age_no["Age"] = y_predict
# train = pd.concat([train_age_yes, train_age_no])

In [1108]:
# test_age_yes = test[test["Age"].notna()]
# test_age_no = test[test["Age"].isna()]

In [1109]:
# x_test = test_age_no[["Pclass", "IsAlone"]]
# y_predict = lr.predict(x_test)
# test_age_no["Age"] = y_predict
# test = pd.concat([test_age_yes, test_age_no])

In [1110]:
train["AgeBand"] = pd.cut(train["Age"], 4)
train[["AgeBand", "Survived"]].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 20.315]",0.458101
1,"(20.315, 40.21]",0.364769
2,"(40.21, 60.105]",0.390625
3,"(60.105, 80.0]",0.227273


In [1111]:
# def setAge(val):
#     if val < 13.823: result = 3
#     elif val < 35.882: result = 2
#     elif val < 57.941: result = 1
#     else: result = 0
#     return result

# train["Age"] = train["Age"].apply(setAge)
# test["Age"] = test["Age"].apply(setAge)
# train = train.drop(columns=["AgeBand"], axis=1)

In [1112]:
def setAge(val):
    if val < 20.315: result = 3
    elif val < 40.21: result = 2
    elif val < 60.105: result = 1
    else: result = 0
    return result

train["Age"] = train["Age"].apply(setAge)
test["Age"] = test["Age"].apply(setAge)
train = train.drop(columns=["AgeBand"], axis=1)

In [1113]:
train["FareBand"] = pd.cut(train["Fare"], 4)
train[["FareBand", "Survived"]].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.512, 128.082]",0.368113
1,"(128.082, 256.165]",0.724138
2,"(256.165, 384.247]",0.666667
3,"(384.247, 512.329]",1.0


In [1114]:
# def setFare(val):
#     result = 0
#     if val < 170.776: result = 0
#     elif val < 341.553: result = 1
#     else: result = 2
#     return result

# train["Fare"] = train["Fare"].apply(setFare)
# test["Fare"] = test["Fare"].apply(setFare)

# train = train.drop(columns=["FareBand"], axis=1)

In [1115]:
def setFare(val):
    result = 0
    if val < 128.082: result = 0
    elif val < 256.165: result = 1
    elif val < 384.247: result = 2
    else: result = 3
    return result

train["Fare"] = train["Fare"].apply(setFare)
test["Fare"] = test["Fare"].apply(setFare)

train = train.drop(columns=["FareBand"], axis=1)

In [1116]:
sex = pd.get_dummies(train['Sex'],drop_first=True)
embark = pd.get_dummies(train['Embarked'],drop_first=True)
title = pd.get_dummies(train['Title'],drop_first=True)
Pclass = pd.get_dummies(train['Pclass'],drop_first=True)
FamilySize = pd.get_dummies(train['FamilySize'],drop_first=True)

sex2 = pd.get_dummies(test['Sex'],drop_first=True)
embark2 = pd.get_dummies(test['Embarked'],drop_first=True)
title2 = pd.get_dummies(test['Title'],drop_first=True)
Pclass2 = pd.get_dummies(test['Pclass'],drop_first=True)
FamilySize2 = pd.get_dummies(test['FamilySize'],drop_first=True)

train.drop(['Sex','Embarked','Name','Ticket','PassengerId','Title','FamilySize'],axis=1,inplace=True)
test.drop(['Sex','Embarked','Name','Ticket','PassengerId','Title','FamilySize'],axis=1,inplace=True)
# train.drop(['Sex','Embarked','Name','Ticket','PassengerId','Title','FamilySize', 'SibSp', "Parch"],axis=1,inplace=True)
# test.drop(['Sex','Embarked','Name','Ticket','PassengerId','Title','FamilySize', 'SibSp', "Parch"],axis=1,inplace=True)


train = pd.concat([sex,embark,train,title,FamilySize],axis=1)
test = pd.concat([sex2,embark2,test,title2,FamilySize2],axis=1)

In [1117]:
train.corr()

Unnamed: 0,male,Q,S,Survived,Pclass,Age,SibSp,Parch,Fare,HasCabin,IsAlone,Miss,Mr,Mrs,Others,Small
male,1.0,-0.074115,0.119224,-0.543351,0.1319,-0.083042,-0.114631,-0.245489,-0.114771,-0.140391,0.303646,-0.686808,0.867334,-0.5476,0.034471,-0.303646
Q,-0.074115,1.0,-0.499421,0.00365,0.221009,0.014369,-0.026354,-0.081228,-0.058637,-0.129572,0.086464,0.171117,-0.078338,-0.089739,-0.007767,-0.086464
S,0.119224,-0.499421,1.0,-0.149683,0.074053,0.003492,0.068734,0.060814,-0.104624,-0.101139,0.029074,-0.13065,0.11287,0.002689,-0.052433,-0.029074
Survived,-0.543351,0.00365,-0.149683,1.0,-0.338481,0.066946,-0.035322,0.081629,0.147466,0.316912,-0.203367,0.327093,-0.549199,0.33904,0.02203,0.203367
Pclass,0.1319,0.221009,0.074053,-0.338481,1.0,0.2922,0.083081,0.018443,-0.29858,-0.725541,0.135207,-0.000576,0.142698,-0.149209,-0.206333,-0.135207
Age,-0.083042,0.014369,0.003492,0.066946,0.2922,1.0,0.200889,0.149257,-0.025158,-0.199013,-0.155776,0.232356,-0.153921,-0.145533,-0.159431,0.155776
SibSp,-0.114631,-0.026354,0.068734,-0.035322,0.083081,0.200889,1.0,0.414838,0.040996,-0.04046,-0.584471,0.087932,-0.250489,0.063407,-0.036364,0.584471
Parch,-0.245489,-0.081228,0.060814,0.081629,0.018443,0.149257,0.414838,1.0,0.141616,0.036987,-0.583398,0.105567,-0.333905,0.225852,-0.06748,0.583398
Fare,-0.114771,-0.058637,-0.104624,0.147466,-0.29858,-0.025158,0.040996,0.141616,1.0,0.259115,-0.094555,0.120934,-0.100536,0.021799,-0.011459,0.094555
HasCabin,-0.140391,-0.129572,-0.101139,0.316912,-0.725541,-0.199013,-0.04046,0.036987,0.259115,1.0,-0.158029,0.035314,-0.137319,0.1183,0.106246,0.158029


In [1118]:
X = train.drop('Survived',axis=1)
y = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [1119]:
scaler = MinMaxScaler()

scaler.fit(X_train)

scaler.transform(X_train)
scaler.transform(X_test)
scaler.transform(test)

array([[1., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 1.],
       [1., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [1120]:
logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)



In [1121]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb

In [1122]:
from sklearn.model_selection import KFold,  cross_val_score
k_fold = KFold(n_splits=20, shuffle=True, random_state=42)

In [1123]:
scoring = 'accuracy'
score = cross_val_score(logistic_model, X, y, cv=k_fold, scoring=scoring)
print(score.mean())

0.8215404040404041




In [1124]:
print(classification_report(y_test,y_pred))
print('\n')
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86        99
           1       0.86      0.75      0.80        80

    accuracy                           0.83       179
   macro avg       0.84      0.82      0.83       179
weighted avg       0.83      0.83      0.83       179



[[89 10]
 [20 60]]


In [1125]:
predictions = logistic_model.predict(test)
pred_list = [int(x) for x in predictions]

test2 = pd.read_csv("./input/test.csv")
output = pd.DataFrame({'PassengerId': test2['PassengerId'], 'Survived': pred_list})
# output.to_csv('XGB 0.83 HasCabin 03.29.csv', index=False)