In [1]:
import multiprocessing
import pandas as pd
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn import neighbors
from sklearn import preprocessing

In [2]:
def select_model_hyperparams(X, y):
    """Grid-search k-nearest-neighbours hyperparameters with repeated k-fold CV.

    Every combination of ``n_neighbors``, ``weights`` and ``p`` listed below is
    scored by accuracy over 5-fold cross-validation repeated 20 times
    (fixed ``random_state=2``).

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : target labels passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    The value returned by ``GridSearchCV.fit(X, y)``.
    """
    # Use 87.5% of the reported CPU count (rounded) as parallel workers.
    worker_count = int(round(0.875 * multiprocessing.cpu_count()))
    print("using {} CPUs".format(worker_count))

    param_grid = {
        "n_neighbors": [1, 2, 4, 6, 8, 10, 12, 14, 16],
        "weights": ["uniform", "distance"],
        "p": [1, 2],
    }
    cv_scheme = RepeatedKFold(n_splits=5, n_repeats=20, random_state=2)

    search = GridSearchCV(
        neighbors.KNeighborsClassifier(),
        param_grid=param_grid,
        scoring="accuracy",
        cv=cv_scheme,
        n_jobs=worker_count,
        verbose=1,
    )
    return search.fit(X, y)

In [3]:
def prepare_Xdata(data):
    """Build the model feature matrix from a raw passenger DataFrame.

    Selects the fixed set of feature columns, one-hot encodes the
    string/categorical ones with ``pd.get_dummies``, and fills missing
    numeric values with the median of the column they belong to.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns "Pclass", "Sex", "Age",
        "SibSp", "Parch", "Fare" and "Embarked". It is not modified.

    Returns
    -------
    tuple
        ``(X, new_feature_names)`` where ``X`` is the encoded, imputed
        feature frame and ``new_feature_names`` is ``X.columns``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``data``.
    """
    feature_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

    X = pd.get_dummies(data[feature_names])
    new_feature_names = X.columns

    # Impute each numeric column with its OWN median. Filling every gap with
    # the Age median (as a single scalar) would put an age value into other
    # columns such as Fare whenever they contain missing entries.
    column_medians = X.select_dtypes(include="number").median()
    X = X.fillna(column_medians)

    return (X, new_feature_names)

In [4]:
# Read the Kaggle Titanic training and test tables, then print each
# table's (rows, columns) shape.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
for frame in (train_data, test_data):
    print(frame.shape)

(891, 12)
(418, 11)


In [5]:
# Build the feature matrices for both splits with the same preparation step.
X_train, feature_names = prepare_Xdata(train_data)
X_test, _ = prepare_Xdata(test_data)

# One-hot encoding is done per split, so a category missing from one split
# would yield different columns. Force the test matrix onto the training
# column layout (absent dummy columns become 0, extra ones are dropped).
X_test = X_test.reindex(columns=feature_names, fill_value=0)

---
## Scaling 2: standardize features with StandardScaler

In [6]:
# Fit the standardizer on the training features only, then apply the same
# transform to both splits, keeping the training column labels.
scaler = preprocessing.StandardScaler().fit(X_train)

scaled_columns = X_train.columns
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=scaled_columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=scaled_columns)

In [7]:
# Sanity check: (rows, columns) of the scaled training matrix.
X_train_scaled.shape

(891, 10)

In [8]:
# Sanity check: (rows, columns) of the scaled test matrix; the column count
# should match the scaled training matrix.
X_test_scaled.shape

(418, 10)

In [9]:
# Run the KNN hyperparameter search on the standardized training features.
clf = select_model_hyperparams(X_train_scaled, train_data["Survived"])

using 4 CPUs
Fitting 100 folds for each of 36 candidates, totalling 3600 fits


In [10]:
# Report the best mean cross-validation score and the winning parameters.
best_score = clf.best_score_.round(4)
print("Best average validation score: {}".format(best_score))
for pname in clf.best_params_:
    print("  Param: {} = {}".format(pname, clf.best_params_[pname]))

Best average validation score: 0.8207
  Param: n_neighbors = 14
  Param: p = 2
  Param: weights = uniform


In [11]:
# Predict on the standardized test features: the grid search was fitted on
# X_train_scaled, so the model must see test data on the same scale.
# (Passing the raw X_test would feed unscaled values to a distance-based model.)
predictions = clf.predict(X_test_scaled)

In [12]:
# Assemble the submission table (passenger id + predicted label) and preview it.
submission_columns = {'PassengerId': test_data.PassengerId, 'Survived': predictions}
output = pd.DataFrame(submission_columns)
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [13]:
# Summarize the submission frame: column dtypes and non-null counts.
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [14]:
# Write the submission CSV; index=False keeps the DataFrame index out of the file.
output.to_csv('submission_03_sc_knn_scaling2.csv', index=False)