# preprocess data


In [2]:
import pandas as pd

dt = pd.read_csv('train.csv')

dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## 1. sex dummies

In [3]:
sex_dummies = pd.get_dummies(dt.Sex, prefix='Sex')

dt = pd.concat([dt, sex_dummies], axis=1)

dt.info()

dt.to_csv('train_1_sex_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Sex_female     891 non-null uint8
Sex_male       891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(2)
memory usage: 85.3+ KB


## 2. cabin dummies

In [6]:
dt.Cabin.loc[dt.Cabin.notnull()] = 'notnull'

dt.Cabin.loc[dt.Cabin.isnull()] = 'null'

cabin_dummies = pd.get_dummies(dt.Cabin, prefix='Cabin')

dt = pd.concat([dt, cabin_dummies], axis=1)

dt.info()

dt.to_csv('train_2_cabin_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(4)
memory usage: 87.1+ KB


## 3. pclass dummies

In [8]:
pclass_dummies = pd.get_dummies(dt.Pclass, prefix='Pclass')


dt = pd.concat([dt, pclass_dummies], axis=1)

dt.info()

dt.to_csv('train_3_pclass_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(7)
memory usage: 89.7+ KB


## 4. embarked dummies

In [10]:
embarked_dummies = pd.get_dummies(dt.Embarked, prefix='Embarked')

dt = pd.concat([dt, embarked_dummies], axis=1)

dt.info()

dt.to_csv('train_4_embarked_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 n

## 5. add mean age

In [14]:
age = dt.Age
age.loc[dt.Age.isnull()] = dt.Age.mean()

dt.Age = age

dt.info()

dt.to_csv('train_5_add_mean_age.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 n

## 6. scale age and fare

In [20]:
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()

age_scale_param = scaler.fit(dt.Age)

dt['Age_scale'] = scaler.fit_transform(dt.Age, age_scale_param)

fare_scale_param = scaler.fit(dt.Fare)

dt['Fare_scale'] = scaler.fit_transform(dt.Fare, fare_scale_param)

dt.info()

dt.to_csv('train_6_age_scale_fare_scale.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 27 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 n



In [24]:
x_train = dt.filter(regex='SibSp|Parch|Sex_.*|Cabin_.*|Pclass_.*|Embarked_.*|Age_.*|Fare_.*')


ValueError: cannot reindex from a duplicate axis