# Titanic DataSet

Пояснения по некоторым полям:
- PassengerId — идентификатор пассажира
- Survival — поле в котором указано спасся человек (1) или нет (0)
- Pclass — содержит социально-экономический статус:
-- высокий
-- средний
-- низкий
- Name — имя пассажира
- Sex — пол пассажира
- Age — возраст
- SibSp — содержит информацию о количестве родственников 2-го порядка (муж, жена, братья, сетры)
- Parch — содержит информацию о количестве родственников на борту 1-го порядка (мать, отец, дети)
- Ticket — номер билета
- Fare — цена билета
- Cabin — каюта
- Embarked — порт посадки
-- C — Cherbourg
-- Q — Queenstown
-- S — Southampton

In [2]:
import pandas as pd
data = pd.read_csv('titanic_train.csv')

In [3]:
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
data.PassengerId[data.Cabin.notnull()].count()

204

In [5]:
data.PassengerId[data.Age.notnull()].count()

714

### Заполнение пропусков

In [6]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
#data.Age = data.Age.median()
data.Age = data.Age.mean()

In [8]:
data["EmbarkedF"] = data["Embarked"].fillna("U")

In [9]:
data[data.Embarked.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,EmbarkedF
61,62,1,1,"Icard, Miss. Amelie",female,29.699118,0,0,113572,80.0,B28,,U
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,29.699118,0,0,113572,80.0,B28,,U


In [12]:
MaxPassEmbarked = data.groupby('Embarked').count()['PassengerId']

In [13]:
MaxPassEmbarked

Embarked
C    168
Q     77
S    644
Name: PassengerId, dtype: int64

In [14]:
data.Embarked[data.Embarked.isnull()] = MaxPassEmbarked[MaxPassEmbarked == MaxPassEmbarked.max()].index[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
data.PassengerId[data.Fare.isnull()]

Series([], Name: PassengerId, dtype: int64)

In [14]:
data = data.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)

In [15]:
data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,28.0,1,0,7.25,S
1,1,1,female,28.0,1,0,71.2833,C
2,1,3,female,28.0,0,0,7.925,S
3,1,1,female,28.0,1,0,53.1,S
4,0,3,male,28.0,0,0,8.05,S
5,0,3,male,28.0,0,0,8.4583,Q
6,0,1,male,28.0,0,0,51.8625,S
7,0,3,male,28.0,3,1,21.075,S
8,1,3,female,28.0,0,2,11.1333,S
9,1,2,female,28.0,1,0,30.0708,C


In [28]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
dicts = {}

label.fit(data.Sex.drop_duplicates()) #задаем список значений для кодирования
dicts['Sex'] = list(label.classes_)
data.Sex = label.transform(data.Sex) #заменяем значения из списка кодами закодированных элементов 

label.fit(data.Embarked.drop_duplicates())
dicts['Embarked'] = list(label.classes_)
data.Embarked = label.transform(data.Embarked)

ytrain = data.Survived
Xtrain = data.drop(['Survived'],axis=1)

In [29]:
Xtrain.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,28.0,1,0,7.25,2
1,1,0,28.0,1,0,71.2833,0
2,3,0,28.0,0,0,7.925,2
3,1,0,28.0,1,0,53.1,2
4,3,1,28.0,0,0,8.05,2


### Проделываем все тоже самое с данными test

In [18]:
test = pd.read_csv('titanic_test.csv')
test.Age[test.Age.isnull()] = test.Age.mean()
test.Fare[test.Fare.isnull()] = test.Fare.median() #заполняем пустые значения средней ценой билета
MaxPassEmbarked = test.groupby('Embarked').count()['PassengerId']
test.Embarked[test.Embarked.isnull()] = MaxPassEmbarked[MaxPassEmbarked == MaxPassEmbarked.max()].index[0]
result = pd.DataFrame(test.PassengerId)
test = test.drop(['Name','Ticket','Cabin','PassengerId'],axis=1)

label.fit(dicts['Sex'])
test.Sex = label.transform(test.Sex)

label.fit(dicts['Embarked'])
test.Embarked = label.transform(test.Embarked)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [19]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [21]:
target = data.Survived
train = data.drop(['Survived'], axis=1) #из исходных данных убираем Id пассажира и флаг спасся он или нет
kfold = 5 #количество подвыборок для валидации

In [25]:
Xtrain, ytrain, Xtest, ytest = train_test_split(train, target, test_size=0.25)

In [26]:
model_knc = KNeighborsClassifier(n_neighbors = 18) #в параметре передаем кол-во соседей

In [30]:
scores = cross_val_score(model_knc, Xtrain, ytrain, cv = kfold)
print(scores.mean())

0.7250925335849836
