In [None]:
import matplotlib.pyplot as pyplot
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline




In [None]:
# Read the training and test sets from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Summary statistics restricted to the object-dtype columns of the training set.
train_df.describe(include=['O'])

In [None]:
# Mean of Survived for each Pclass value.
train_df[['Pclass', 'Survived']].groupby('Pclass', as_index=False).mean()

In [None]:
# Mean of Survived for each Sex value.
train_df[['Sex', 'Survived']].groupby('Sex', as_index=False).mean()

In [None]:
# Mean of Survived for each SibSp value.
train_df[['SibSp', 'Survived']].groupby('SibSp', as_index=False).mean()

In [None]:
# Mean of Survived for each Parch value.
train_df[['Parch', 'Survived']].groupby('Parch', as_index=False).mean()

In [None]:
# Family_Size = SibSp + Parch + 1 (the +1 presumably counts the passenger themselves).
# Each frame is built from its OWN columns: the test frame was previously computed
# from train_df, which index-aligned training rows onto the test rows and produced
# wrong family sizes for the test set.
for frame in (train_df, test_df):
    frame['Family_Size'] = frame['SibSp'] + frame['Parch'] + 1

In [None]:
# Mean of Survived for each Family_Size value.
train_df[['Family_Size', 'Survived']].groupby('Family_Size', as_index=False).mean()

In [None]:
# Bucket the numeric family size into four labels. Sizes that are not listed
# here map to NaN.
size_groups = {
    'Alone': (1,),
    'Small': (2, 3, 4),
    'Medium': (5, 6),
    'Large': (7, 8, 11),
}
family_map = {size: label for label, sizes in size_groups.items() for size in sizes}

for frame in (train_df, test_df):
    frame['Family_Size_Grouped'] = frame['Family_Size'].map(family_map)

In [None]:
# Mean of Survived for each Family_Size_Grouped label.
train_df[['Family_Size_Grouped', 'Survived']].groupby('Family_Size_Grouped', as_index=False).mean()

In [None]:
# Mean of Survived for each Embarked value.
train_df[['Embarked', 'Survived']].groupby('Embarked', as_index=False).mean()

In [None]:
# Age histograms in 10-wide bins, one panel per Survived value.
sns.displot(
    data=train_df,
    x='Age',
    col='Survived',
    binwidth=10,
    height=5,
)

In [None]:
# Split Age into 8 quantile-based bins. The bin edges are computed separately
# for each frame, so train and test intervals are not guaranteed to match.
for frame in (train_df, test_df):
    frame['Age_Cut'] = pd.qcut(frame['Age'], q=8)

In [None]:
# Mean of Survived for each Age_Cut interval.
train_df[['Age_Cut', 'Survived']].groupby('Age_Cut', as_index=False).mean()

In [None]:
# Pull the title (Dr, the Countess, Don, etc.) out of Name: keep the text
# between the first comma and the following period, then trim whitespace.
for frame in (train_df, test_df):
    after_comma = frame['Name'].str.split(pat=',', expand=True)[1]
    before_period = after_comma.str.split(pat='.', expand=True)[0]
    frame['Title'] = before_period.apply(lambda raw_title: raw_title.strip())

In [None]:
# Planned title groups:
#   military         - Capt, Col, Major
#   noble            - Jonkheer, Countess, Don, Lady, Sir
#   unmarried female - Mlle, Ms, Mme
train_df[['Title', 'Survived']].groupby('Title', as_index=False).mean()

In [None]:
# Collapse rare titles into broader groups. Titles that are not keys of this
# mapping are left unchanged by replace().
# The mapping is defined once and applied to both frames so train and test
# cannot drift apart; the group label was previously misspelled 'Miliary'.
# NOTE(review): Mlle/Ms/Mme are folded into 'Noble' here, although the notes in
# the previous cell list them as a separate "unmarried female" group - confirm
# which grouping is intended.
title_groups = {
    'Capt': 'Military',
    'Col': 'Military',
    'Major': 'Military',
    'Jonkheer': 'Noble',
    'the Countess': 'Noble',
    'Don': 'Noble',
    'Lady': 'Noble',
    'Sir': 'Noble',
    'Mlle': 'Noble',
    'Ms': 'Noble',
    'Mme': 'Noble',
}

train_df['Title'] = train_df['Title'].replace(title_groups)
test_df['Title'] = test_df['Title'].replace(title_groups)

In [None]:
# Row count and mean of Survived for each (grouped) Title.
title_survival = train_df.groupby('Title', as_index=False)['Survived']
title_survival.agg(['count', 'mean'])


In [None]:
# (rows, columns) of the training frame after the feature engineering above.
train_df.shape

In [None]:
# (rows, columns) of the test frame after the feature engineering above.
test_df.shape

In [None]:
# Fill missing values in the float64 columns (Age, Fare) with that frame's own
# column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form operates
# on an intermediate object, raises a FutureWarning in pandas 2.x and no longer
# updates the DataFrame once Copy-on-Write is enabled.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [None]:
# Stand-alone preprocessing objects (no model is trained in this cell).
# `ode` was previously bound to the OrdinalEncoder class itself because the
# call parentheses were missing; it is now an encoder instance like the others.
ohe = OneHotEncoder(sparse_output=False)
ode = OrdinalEncoder()
SI = SimpleImputer(strategy='most_frequent')

In [None]:

# Columns routed to the ordinal encoder and to the one-hot encoder respectively.
ode_cols, ohe_cols = ['Family_Size_Grouped'], ['Sex', 'Embarked']

In [None]:
# Features are every training column except the target; the target is Survived.
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])
# The test features keep every test column except the Age_Cut helper.
X_test = test_df.drop(columns=['Age_Cut'])

In [None]:
# Hold out 20% of the training data for validation, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [None]:
# Ordinal branch: fill missing labels with the most frequent one, then encode
# the labels as integers.
# The category order is given explicitly because OrdinalEncoder otherwise sorts
# labels alphabetically (Alone, Large, Medium, Small), which scrambles the
# family-size ordering. `categories` holds one inner list per encoded column,
# so this matches the single column in ode_cols. Labels outside the list are
# encoded as -1.
ordinal_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(
        categories=[['Alone', 'Small', 'Medium', 'Large']],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])

In [None]:
# One-hot branch: fill missing labels with the most frequent one, then expand
# into dense indicator columns. Categories unseen during fit are ignored.
ohe_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
ohe_pipeline = Pipeline(ohe_steps)

In [None]:
# Assemble the model's feature matrix; any column not listed here is dropped.
col_trans = ColumnTransformer(
    transformers=[
        # Age goes through the imputer only. It used to be listed under
        # 'passthrough' as well, which handed the models a second, un-imputed
        # copy of the same column.
        ('impute', SI, ['Age']),
        ('ord.pipeline', ordinal_pipeline, ode_cols),
        ('ohe.pipeline', ohe_pipeline, ohe_cols),
        ('passthrough', 'passthrough', ['Pclass', 'Fare']),
    ],
    remainder='drop',
    n_jobs=-1,
)

In [None]:
# Seeded so the forest, and therefore the grid-search scores, are reproducible
# from run to run; 21 is the same seed used for the train/validation split.
rfc = RandomForestClassifier(random_state=21)

In [None]:
# Random-forest search space: 3 * 3 * 5 * 3 * 2 = 270 parameter combinations.
param_grid = dict(
    n_estimators=[100, 150, 200],
    min_samples_split=[5, 10, 15],
    max_depth=[8, 9, 10, 15, 20],
    min_samples_leaf=[1, 2, 3],
    criterion=['gini', 'entropy'],
)

In [None]:
# Exhaustive search over param_grid for the random forest, scored with
# 5-fold stratified cross-validation.
CV_rfc = GridSearchCV(
    rfc,
    param_grid,
    cv=StratifiedKFold(5),
)

In [None]:
# Chain preprocessing with the random-forest grid search and fit on the
# training split.
rfc_steps = (col_trans, CV_rfc)
pipefinalrfc = make_pipeline(*rfc_steps)
pipefinalrfc.fit(X_train, y=y_train)

In [None]:
# Best random-forest parameters and their mean cross-validated score.
print(CV_rfc.best_params_, CV_rfc.best_score_, sep='\n')

In [None]:
# Seeded so tie-breaking between equally good splits is repeatable and the
# grid-search scores do not change between runs; 21 is the same seed used for
# the train/validation split.
dtc = DecisionTreeClassifier(random_state=21)

In [None]:
# Decision-tree search space: 3 * 3 * 3 * 2 = 54 parameter combinations.
param_grid = dict(
    min_samples_split=[5, 10, 15],
    max_depth=[10, 20, 30],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

In [None]:
# Exhaustive search over param_grid for the decision tree, scored with
# 5-fold stratified cross-validation.
CV_dtc = GridSearchCV(
    dtc,
    param_grid,
    cv=StratifiedKFold(5),
)

In [None]:
# Chain preprocessing with the decision-tree grid search and fit on the
# training split.
dtc_steps = (col_trans, CV_dtc)
pipefinaldtc = make_pipeline(*dtc_steps)
pipefinaldtc.fit(X_train, y=y_train)

In [None]:
# Best decision-tree parameters and their mean cross-validated score.
print(CV_dtc.best_params_, CV_dtc.best_score_, sep='\n')

In [None]:
# k-nearest-neighbours classifier with default settings; its hyper-parameters
# are chosen by the grid search defined in the following cells.
knn = KNeighborsClassifier()

In [None]:
# k-NN search space. The neighbour count is tuned through `n_neighbors`; the
# grid previously used `min_samples_split`, a tree parameter that
# KNeighborsClassifier does not accept, so the grid search failed with an
# invalid-parameter error.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
}

In [None]:
# Exhaustive search over param_grid for k-NN, scored with 5-fold stratified
# cross-validation.
CV_knn = GridSearchCV(
    knn,
    param_grid,
    cv=StratifiedKFold(5),
)

In [None]:
# Chain preprocessing with the k-NN grid search and fit on the training split.
knn_steps = (col_trans, CV_knn)
pipefinalknn = make_pipeline(*knn_steps)
pipefinalknn.fit(X_train, y=y_train)

In [None]:
# Best k-NN parameters and their mean cross-validated score.
print(CV_knn.best_params_, CV_knn.best_score_, sep='\n')

In [None]:
# Support-vector classifier with default settings; C and the kernel are chosen
# by the grid search defined in the following cells.
svc = SVC()

In [None]:
# SVC search space. C runs over powers of ten from 100 down to 0.001. The list
# previously contained 0.001 twice (so 0.01 was never tried and one candidate
# was fitted twice) and 100.10 where the sequence calls for 100.
param_grid = {
    'C': [100.0, 10.0, 1.0, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

In [None]:
# Exhaustive search over param_grid for the SVC, scored with 5-fold stratified
# cross-validation.
CV_svc = GridSearchCV(
    svc,
    param_grid,
    cv=StratifiedKFold(5),
)

In [None]:
# Chain preprocessing with the SVC grid search and fit on the training split.
svc_steps = (col_trans, CV_svc)
pipefinalsvc = make_pipeline(*svc_steps)
pipefinalsvc.fit(X_train, y=y_train)

In [None]:
# Best SVC parameters and their mean cross-validated score.
print(CV_svc.best_params_, CV_svc.best_score_, sep='\n')

In [None]:
# Logistic regression with default settings; C is chosen by the grid search
# defined in the following cells.
# NOTE(review): the features are not scaled, so the default iteration limit may
# trigger a ConvergenceWarning - confirm and raise max_iter if it does.
lr = LogisticRegression()

In [None]:
# Logistic-regression search space. C runs over powers of ten from 100 down to
# 0.001. The list previously contained 0.001 twice (so 0.01 was never tried and
# one candidate was fitted twice) and 100.10 where the sequence calls for 100.
param_grid = {
    'C': [100.0, 10.0, 1.0, 0.1, 0.01, 0.001],
}

In [None]:
# Exhaustive search over param_grid for logistic regression, scored with
# 5-fold stratified cross-validation.
CV_lr = GridSearchCV(
    lr,
    param_grid,
    cv=StratifiedKFold(5),
)

In [None]:
# Chain preprocessing with the logistic-regression grid search and fit on the
# training split.
lr_steps = (col_trans, CV_lr)
pipefinallr = make_pipeline(*lr_steps)
pipefinallr.fit(X_train, y=y_train)

In [None]:
# Best logistic-regression parameters and their mean cross-validated score.
print(CV_lr.best_params_, CV_lr.best_score_, sep='\n')

In [None]:
# Gaussian naive Bayes with default settings; var_smoothing is chosen by the
# grid search defined in the following cells.
gnb =  GaussianNB()

In [None]:
# Gaussian-NB search space: three var_smoothing candidates, one decade apart.
param_grid = dict(var_smoothing=[1e-8, 1e-9, 1e-10])

In [None]:
# Exhaustive search over param_grid for Gaussian naive Bayes, scored with
# 5-fold stratified cross-validation.
CV_gnb = GridSearchCV(
    gnb,
    param_grid,
    cv=StratifiedKFold(5),
)

In [None]:
# Chain preprocessing with the naive-Bayes grid search and fit on the training
# split.
gnb_steps = (col_trans, CV_gnb)
pipefinalgnb = make_pipeline(*gnb_steps)
pipefinalgnb.fit(X_train, y=y_train)

In [None]:
# Best naive-Bayes parameters and their mean cross-validated score.
print(CV_gnb.best_params_, CV_gnb.best_score_, sep='\n')