In [28]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.model_selection import StratifiedKFold


In [29]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

train['Age'] = train['Age'].fillna(train['Age'].median())
train['Embarked'] = train['Embarked'].fillna('S')
train['Cabin'] = train['Cabin'].fillna('none')

test['Age'] = test['Age'].fillna(test['Age'].median())
test['Embarked'] = test['Embarked'].fillna('S')
test['Cabin'] = test['Cabin'].fillna('none')
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

train['Has_Cabin'] = train['Cabin'] != 'none'
test['Has_Cabin'] = test['Cabin'] != 'none'

In [30]:
train['Title'] = train['Name'].str.extract(r',\s*([^\.]+)\.')

In [31]:
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Has_Cabin']

X_train = train[numeric_features + categorical_features]
y_train = train['Survived']

X_test = test[numeric_features + categorical_features]

# Exploratory Data Analysis

In [32]:
X_train[numeric_features].corr()

Unnamed: 0,Age,SibSp,Parch,Fare
Age,1.0,-0.233296,-0.172482,0.096688
SibSp,-0.233296,1.0,0.414838,0.159651
Parch,-0.172482,0.414838,1.0,0.216225
Fare,0.096688,0.159651,0.216225,1.0


In [33]:
total_males = (train['Sex'] == 'male').sum()
total_females = (train['Sex'] == 'female').sum()

total_male_survivors = ((train['Sex'] == 'male') & (train['Survived'] == 1)).sum()
total_female_survivors = ((train['Sex'] == 'female') & (train['Survived'] == 1)).sum()

count = [total_male_survivors, total_female_survivors]
n_obs = [total_males, total_females]

stat, p_value = proportions_ztest(count, n_obs)
print(f"Male Survivor % : {(total_male_survivors / total_males) * 100:.3f}")
print(f"Female Survivor % : {(total_female_survivors / total_females) * 100:.3f}")

print(f"Z-statistic: {stat:.3f}, p-value: {p_value:.3f}")

Male Survivor % : 18.891
Female Survivor % : 74.204
Z-statistic: -16.219, p-value: 0.000


In [34]:
total_has_cabin = (train['Has_Cabin']).sum()
total_no_cabin = (~train['Has_Cabin']).sum()

total_has_cabin_survivors = ((train['Has_Cabin']) & (train['Survived'] == 1)).sum()
total_no_cabin_survivors = ((~train['Has_Cabin']) & (train['Survived'] == 1)).sum()

n_obs = [total_has_cabin, total_no_cabin]
count = [total_has_cabin_survivors, total_no_cabin_survivors]

stat, p_value = proportions_ztest(count, n_obs)
print(f"Cabin Survivor % : {(total_has_cabin_survivors / total_has_cabin) * 100:.3f}")
print(f"No Cabin Survivor % : {(total_no_cabin_survivors / total_no_cabin) * 100:.3f}")

print(f"Z-statistic: {stat:.3f}, p-value: {p_value:.3f}")


Cabin Survivor % : 66.667
No Cabin Survivor % : 29.985
Z-statistic: 9.460, p-value: 0.000


In [35]:
cont_table = pd.crosstab(train['Pclass'], train['Survived'])
print(cont_table)

chi2, p, dof, expected = chi2_contingency(cont_table)
print(f"Chi2 statistic: {chi2:.3f}")
print(f"P-value: {p:.3f}")

Survived    0    1
Pclass            
1          80  136
2          97   87
3         372  119
Chi2 statistic: 102.889
P-value: 0.000


In [36]:
cont_table = pd.crosstab(train['Embarked'], train['Survived'])
print(cont_table)

chi2, p, dof, expected = chi2_contingency(cont_table)
print(f"Chi2 statistic: {chi2:.3f}")
print(f"P-value: {p:.3f}")

Survived    0    1
Embarked          
C          75   93
Q          47   30
S         427  219
Chi2 statistic: 25.964
P-value: 0.000


In [37]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [38]:
model = LinearSVC(max_iter=10000, random_state=42)
model.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [None]:
# Cross Validation
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', LinearSVC(max_iter=10000, tol=1e-5))
])

param_grid = {
    'svc__C' : np.arange(0.00001, 0.01, 0.00001),
    'svc__loss' : ['hinge', 'squared_hinge']
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

grid_search = GridSearchCV(pipeline, param_grid, cv = skf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

best_model = grid_search.best_estimator_


Best parameters: {'svc__C': np.float64(0.00535), 'svc__loss': 'squared_hinge'}
Best cross-validation score: 0.805


In [13]:
X_vif = add_constant(X_train_scaled)
vif_data = pd.DataFrame()
feature_names = ['const'] + list(preprocessor.get_feature_names_out())
vif_data['feature'] = feature_names
vif_data['VIF'] = [variance_inflation_factor(X_vif, i) for i in range(X_vif.shape[1])]

print(vif_data)

                feature        VIF
0                 const  21.635131
1              num__Age   1.224695
2            num__SibSp   1.286829
3            num__Parch   1.339156
4             num__Fare   1.765753
5         cat__Pclass_2   3.457439
6         cat__Pclass_3   5.172412
7         cat__Sex_male   1.138096
8       cat__Embarked_Q   1.494468
9       cat__Embarked_S   1.504161
10  cat__Has_Cabin_True   2.714377


In [14]:
submission = pd.DataFrame({
    "PassengerId" : test['PassengerId'],
    "Survived" : best_model.predict(X_test_scaled)
})

submission.to_csv("data/submission.csv", index = False)