In [1]:
import multiprocessing
import pandas as pd
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn import neighbors
from sklearn import preprocessing

In [2]:
def select_model_hyperparams(X, y):
    n_cpus = int(round(multiprocessing.cpu_count() * 0.875))
    
    print("using {} CPUs".format(n_cpus))
    
    neighbors_params = {"n_neighbors": [1, 2, 4, 6, 8, 10, 12, 14, 16],
                        "weights": ["uniform", "distance"],
                        "p": [1, 2]
                       }

    clf = GridSearchCV(neighbors.KNeighborsClassifier(),
                      param_grid = neighbors_params,
                      scoring    = "accuracy",
                      cv         = RepeatedKFold(n_splits=5, n_repeats=20, random_state=2),
                      n_jobs     = n_cpus,
                      verbose    = 1)

    return clf.fit(X, y)

In [3]:
def prepare_Xdata(data):
    feature_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
    X = data[feature_names]
    
    X = pd.get_dummies(X)
    new_feature_names = X.columns
    
    X.fillna(value=X.Age.median(), axis=1, inplace=True)
    
    return (X, new_feature_names)

In [4]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
print(train_data.shape)
print(test_data.shape)

(891, 12)
(418, 11)


In [5]:
X_train, feature_names = prepare_Xdata(train_data)
X_test, _ = prepare_Xdata(test_data)

In [6]:
X_train.shape

(891, 10)

In [7]:
X_test.shape

(418, 10)

In [8]:
X_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,0,1,0,1,0
1,3,47.0,1,0,7.0,1,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,1,0
3,3,27.0,0,0,8.6625,0,1,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,0,1


---
## scaling

In [9]:
all_data = pd.concat([X_train, X_test], ignore_index=True)

In [10]:
all_data.shape

(1309, 10)

In [11]:
all_data.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.925,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,1
5,3,28.0,0,0,8.4583,0,1,0,1,0
6,1,54.0,0,0,51.8625,0,1,0,0,1
7,3,2.0,3,1,21.075,0,1,0,0,1
8,3,27.0,0,2,11.1333,1,0,0,0,1
9,2,14.0,1,0,30.0708,1,0,1,0,0


In [12]:
scaler = preprocessing.StandardScaler()
all_data_scaled = pd.DataFrame(scaler.fit_transform(all_data), columns=all_data.columns)
all_data_scaled.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.841916,-0.576088,0.481288,-0.445,-0.503499,-0.743497,0.743497,-0.50977,-0.32204,0.657394
1,-1.546098,0.663229,0.481288,-0.445,0.734591,1.344995,-1.344995,1.96167,-0.32204,-1.521159
2,0.841916,-0.266258,-0.479087,-0.445,-0.490448,1.344995,-1.344995,-0.50977,-0.32204,0.657394
3,-1.546098,0.430857,0.481288,-0.445,0.383015,1.344995,-1.344995,-0.50977,-0.32204,0.657394
4,0.841916,0.430857,-0.479087,-0.445,-0.488031,-0.743497,0.743497,-0.50977,-0.32204,0.657394
5,0.841916,-0.111344,-0.479087,-0.445,-0.480136,-0.743497,0.743497,-0.50977,3.105202,-1.521159
6,-1.546098,1.902546,-0.479087,-0.445,0.359088,-0.743497,0.743497,-0.50977,-0.32204,0.657394
7,0.841916,-2.125233,2.402037,0.710763,-0.236191,-0.743497,0.743497,-0.50977,-0.32204,0.657394
8,0.841916,-0.188801,-0.479087,1.866526,-0.428415,1.344995,-1.344995,-0.50977,-0.32204,0.657394
9,-0.352091,-1.195746,0.481288,-0.445,-0.062257,1.344995,-1.344995,1.96167,-0.32204,-1.521159


In [13]:
X_train.shape[0]

891

In [14]:
X_train_scaled = all_data_scaled.loc[0:X_train.shape[0]-1,:]
X_test_scaled = all_data_scaled.loc[X_train.shape[0]::,:]

In [15]:
X_train_scaled.shape

(891, 10)

In [16]:
X_train_scaled.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.841916,-0.576088,0.481288,-0.445,-0.503499,-0.743497,0.743497,-0.50977,-0.32204,0.657394
1,-1.546098,0.663229,0.481288,-0.445,0.734591,1.344995,-1.344995,1.96167,-0.32204,-1.521159
2,0.841916,-0.266258,-0.479087,-0.445,-0.490448,1.344995,-1.344995,-0.50977,-0.32204,0.657394
3,-1.546098,0.430857,0.481288,-0.445,0.383015,1.344995,-1.344995,-0.50977,-0.32204,0.657394
4,0.841916,0.430857,-0.479087,-0.445,-0.488031,-0.743497,0.743497,-0.50977,-0.32204,0.657394


In [17]:
X_test_scaled.shape

(418, 10)

In [18]:
X_test_scaled.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
891,0.841916,0.392129,-0.479087,-0.445,-0.4923,-0.743497,0.743497,-0.50977,3.105202,-1.521159
892,0.841916,1.360345,0.481288,-0.445,-0.508333,1.344995,-1.344995,-0.50977,-0.32204,0.657394
893,-0.352091,2.522204,-0.479087,-0.445,-0.45637,-0.743497,0.743497,-0.50977,3.105202,-1.521159
894,0.841916,-0.188801,-0.479087,-0.445,-0.476188,-0.743497,0.743497,-0.50977,-0.32204,0.657394
895,0.841916,-0.576088,0.481288,0.710763,-0.406098,1.344995,-1.344995,-0.50977,-0.32204,0.657394


In [19]:
clf = select_model_hyperparams(X_train_scaled, train_data.Survived);

using 4 CPUs
Fitting 100 folds for each of 36 candidates, totalling 3600 fits


In [20]:
print("Best average validation score: {}".format(clf.best_score_.round(3)))
for pname, pvalue in clf.best_params_.items():
    print("  Param: {} = {}".format(pname, pvalue))

Best average validation score: 0.821
  Param: n_neighbors = 14
  Param: p = 1
  Param: weights = uniform


In [21]:
predictions = clf.predict(X_test)

In [22]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [23]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


In [24]:
output.to_csv('submission_03_sc_knn_scaling1.csv', index=False)