## import modules etc

In [1]:
import multiprocessing
import pandas as pd
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn import neighbors

---
## functions

In [2]:
def select_model_hyperparams(X, y):
    """Grid-search k-nearest-neighbours hyperparameters with repeated K-fold CV.

    The search covers ``n_neighbors``, ``weights`` and the Minkowski power
    ``p``; candidates are scored by accuracy on a 5-split K-fold that is
    repeated 20 times with a fixed random state.

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : target values passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    Whatever ``GridSearchCV.fit(X, y)`` returns.
    """
    # Use 7/8 of the reported CPU count (rounded) as the number of parallel jobs.
    worker_count = int(round(0.875 * multiprocessing.cpu_count()))
    print("using {} CPUs".format(worker_count))

    knn_grid = {
        "n_neighbors": [1, 2, 4, 6, 8, 10, 12, 14, 16],
        "weights": ["uniform", "distance"],
        "p": [1, 2],
    }
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=20, random_state=2)

    search = GridSearchCV(
        neighbors.KNeighborsClassifier(),
        param_grid=knn_grid,
        scoring="accuracy",
        cv=cv_scheme,
        n_jobs=worker_count,
        verbose=1,
    )
    return search.fit(X, y)

## prepare input (X) data for train & test


In [3]:
def prepare_Xdata(data):
    """Build the model input matrix from a raw passenger frame.

    Selects the feature columns, one-hot encodes the non-numeric ones with
    ``pd.get_dummies`` and fills missing numeric values with the median of
    the column they belong to.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed in ``feature_names``.
        It is not modified.

    Returns
    -------
    tuple
        ``(X, new_feature_names)`` where ``X`` is the encoded and imputed
        DataFrame and ``new_feature_names`` is its column index.
    """
    feature_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

    X = pd.get_dummies(data[feature_names])
    new_feature_names = X.columns

    # Impute every numeric column with its *own* median. Filling the whole
    # frame with the Age median would write an age into any other column
    # that happens to have a gap (e.g. a missing Fare).
    numeric_columns = X.select_dtypes(include="number").columns
    column_medians = X[numeric_columns].median()
    X = X.fillna(column_medians)

    return (X, new_feature_names)

---
## main

In [4]:
# Load the train and test CSVs from the Kaggle input directory.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Quick sanity check: (rows, columns) of each frame.
print(train_data.shape)
print(test_data.shape)

(891, 12)
(418, 11)


In [5]:
X_train, feature_names = prepare_Xdata(train_data)
X_test, _ = prepare_Xdata(test_data)
# Train and test are one-hot encoded independently, so their dummy columns are
# not guaranteed to match. Force the test matrix onto the training columns
# (same set, same order); a dummy column absent from the test data becomes 0.
X_test = X_test.reindex(columns=feature_names, fill_value=0)

In [6]:
# Run the cross-validated grid search on the training features and labels.
clf = select_model_hyperparams(X_train, train_data["Survived"])

using 4 CPUs
Fitting 100 folds for each of 36 candidates, totalling 3600 fits


In [7]:
# Report the best cross-validated score (rounded to 4 decimals) and the
# hyperparameter values that produced it.
print("Best average validation score: {}".format(clf.best_score_.round(4)))
for pname, pvalue in clf.best_params_.items():
    print("  Param: {} = {}".format(pname, pvalue))

Best average validation score: 0.7628
  Param: n_neighbors = 8
  Param: p = 1
  Param: weights = distance


In [8]:
# Predict labels for the prepared test matrix with the fitted search object.
predictions = clf.predict(X_test)

In [9]:
# Pair each test PassengerId with its predicted Survived value.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
# Preview the first rows.
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [10]:
# Check row count, null counts and dtypes before writing the file.
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [11]:
# Write the predictions to CSV; index=False keeps the row index out of the file.
output.to_csv('submission_02_sc_knn.csv', index=False)