In [0]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder


from google.colab import files

### import and check dataset

In [2]:
# i wil upload the dataset via kaggle api so i need to upload my kaggle json first

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nrobxam","key":"406986f9eadde101e9cf2f5f917940c3"}'}

In [0]:
# installing the kaggle api

!pip install -q kaggle

In [0]:
# creating the root folder for kaggle and importing my json there

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [7]:
# downloading the dataset to the virt environment

!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 23.0MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 28.8MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 3.42MB/s]


In [8]:
!ls

gender_submission.csv  kaggle.json  sample_data  test.csv  train.csv


In [101]:
# making a dataframe for the train sample

df_train = pd.read_csv('train.csv')
print(df_train.shape)
df_train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## preparing the data for the model

In [102]:
# dealing with missing values

print(df_train.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [103]:
# filling missing Age values with the mean 
df_train.Age = df_train.Age.mean()

# filling missing Embarked values with the majority
df_train.Embarked[df_train.Embarked.isnull()] = df_train.groupby('Embarked').count()['PassengerId'][df_train.groupby('Embarked').count()['PassengerId'] == df_train.groupby('Embarked').count()['PassengerId'].max()].index[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [0]:
# dropping useless features

df_train = df_train.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [105]:
df_train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [0]:
# initiating label encoder for sklearn
label = LabelEncoder()
dicts = {}

# initiating the labels for sex column
label.fit(df_train.Sex.drop_duplicates())
dicts['Sex'] = list(label.classes_)
# coding the sex column values
df_train.Sex = label.transform(df_train.Sex)

# initiating the labels for Embarked column
label.fit(df_train.Embarked.drop_duplicates())
dicts['Embarked'] = list(label.classes_)
# coding the sex column values
df_train.Embarked = label.transform(df_train.Embarked)

In [0]:
# initializing the target variable and dropping it from features dataframe
y = df_train.Survived
X = df_train.drop(['Survived'], axis=1)

In [108]:
print(x_train.columns)
print(x_train.shape)

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')
(891, 7)


### searching for the optimal parameters on the train sample

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

In [121]:
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

(668, 7) (223, 7) (668,) (223,)


In [122]:
# initializing the dictionaries for the parameters that will be passed to GridSearchCV method

## for the number of k neighbors
k = list(range(1, 60, 2))

## for the weights
weights_options = ['uniform', 'distance']

## for the algorithms applied 
algos = ['ball_tree', 'kd_tree', 'brute']

## leaf size (since i've initiated BallTree and KDTree algorithms)
leaves = list(np.arange(10, 110, 10))

## for the metrics
metric_options = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

## for the parameters of the metrics
#metric_params=metric_param_options

# initializing the grid

params_grid = dict(n_neighbors=k, weights=weights_options, algorithm=algos, leaf_size=leaves, metric=metric_options, )

# initializing the grid search with 10 cross_validation splits

model_titanic = KNeighborsClassifier() 

grid = GridSearchCV(model_titanic, params_grid, cv=10, scoring='accuracy')

# training the model
grid.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                         'metric': ['euclidean', 'manhattan', 'chebyshev',
                                    'minkowski'],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
                                         23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
                                         43, 45, 47, 49, 51, 53, 55, 57, 59],
                         'weights': ['uniform'

In [123]:
print(f'best parameters: {grid.best_params_},\nbest accuracy score: {grid.best_score_},\nbest estimator: {grid.best_estimator_}')

best parameters: {'algorithm': 'ball_tree', 'leaf_size': 20, 'metric': 'manhattan', 'n_neighbors': 31, 'weights': 'distance'},
best accuracy score: 0.7709580838323353,
best estimator: KNeighborsClassifier(algorithm='ball_tree', leaf_size=20, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=31, p=2,
                     weights='distance')


### preparing the test sample

In [129]:
df_test = pd.read_csv('test.csv')
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [130]:
# preparing the test sample

df_test = pd.read_csv('test.csv')
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [131]:
# checking the null values
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [132]:
# proccesing the test features

X_test = df_test

X_test.Age[X_test.Age.isnull()] = X_test.Age.mean()
X_test.Embarked[X_test.Embarked.isnull()] = X_test.groupby('Embarked').count()['PassengerId'][X_test.groupby('Embarked').count()['PassengerId'] == X_test.groupby('Embarked').count()['PassengerId'].max()].index[0]

# got a single missing value in column Fare
X_test.Fare[X_test.Fare.isnull()] = X_test.Fare.median()

result = pd.DataFrame(df_test.PassengerId)
X_test = df_test.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

label = LabelEncoder()
dicts = {}

label.fit(X_test.Sex.drop_duplicates())
dicts['Sex'] = list(label.classes_)

label.fit(X_test.Sex.drop_duplicates())
dicts['Sex'] = list(label.classes_)
X_test.Sex = label.transform(X_test.Sex)


label.fit(X_test.Embarked.drop_duplicates())
dicts['Embarked'] = list(label.classes_)
X_test.Embarked = label.transform(X_test.Embarked)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [134]:
# duplicating the df_test to save PassengerId columns

print(df_test.columns)
print(X_test.columns)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')


### predict & submit

In [135]:
# launching prediction based on best grid parameters

predictions = grid.predict(X_test)
predictions

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [137]:
# makeing the submission dataframe

submit = pd.DataFrame(list(zip(df_test.PassengerId, predictions)), columns = ['PassengerId', 'Survived'])
submit.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [0]:
# saving the submission dataframe to csv
submit.to_csv('submission.csv', sep=',', index=False)

In [139]:
!ls

gender_submission.csv  sample_data     test.csv
kaggle.json	       submission.csv  train.csv


In [141]:
!kaggle competitions submit titanic -f submission.csv -m "190813_1"

100% 2.77k/2.77k [00:00<00:00, 10.5kB/s]
Successfully submitted to Titanic: Machine Learning from Disaster

my honorable place at kaggle : 10525/11365 ,  top 93% ))
