# preprocess data


In [1]:
import pandas as pd

df = pd.read_csv('train.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## 1. sex dummies

In [2]:
sex_dummies = pd.get_dummies(df.Sex, prefix='Sex')

df = pd.concat([df, sex_dummies], axis=1)

df.info()

df.to_csv('train_1_sex_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
Sex_female     891 non-null uint8
Sex_male       891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(2)
memory usage: 85.3+ KB


## 2. cabin dummies

In [3]:
df.Cabin.loc[df.Cabin.notnull()] = 'notnull'

df.Cabin.loc[df.Cabin.isnull()] = 'null'

cabin_dummies = pd.get_dummies(df.Cabin, prefix='Cabin')

df = pd.concat([df, cabin_dummies], axis=1)

df.info()

df.to_csv('train_2_cabin_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(4)
memory usage: 87.1+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## 3. pclass dummies

In [4]:
pclass_dummies = pd.get_dummies(df.Pclass, prefix='Pclass')


df = pd.concat([df, pclass_dummies], axis=1)

df.info()

df.to_csv('train_3_pclass_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(7)
memory usage: 89.7+ KB


## 4. embarked dummies

In [5]:
embarked_dummies = pd.get_dummies(df.Embarked, prefix='Embarked')

df = pd.concat([df, embarked_dummies], axis=1)

df.info()

df.to_csv('train_4_embarked_dummies.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              714 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(10)
memory usage: 92.3+ KB


## 5. add mean age and mean fare

In [6]:
df.Age.loc[df.Age.isnull()] = df.Age.mean()
df.Fare.loc[df.Fare.isnull()] = df.Fare.mean()
    
df.info()

df.to_csv('train_5_add_mean_age.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
dtypes: float64(2), int64(5), object(5), uint8(10)
memory usage: 92.3+ KB


## 6. scale age and fare

In [7]:
import sklearn.preprocessing as preprocessing

scaler = preprocessing.StandardScaler()

age_scale_param = scaler.fit(df.Age)

df['Age_scale'] = scaler.fit_transform(df.Age, age_scale_param)

fare_scale_param = scaler.fit(df.Fare)

df['Fare_scale'] = scaler.fit_transform(df.Fare, fare_scale_param)

df.info()

df.to_csv('train_6_age_scale_fare_scale.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
PassengerId      891 non-null int64
Survived         891 non-null int64
Pclass           891 non-null int64
Name             891 non-null object
Sex              891 non-null object
Age              891 non-null float64
SibSp            891 non-null int64
Parch            891 non-null int64
Ticket           891 non-null object
Fare             891 non-null float64
Cabin            891 non-null object
Embarked         889 non-null object
Sex_female       891 non-null uint8
Sex_male         891 non-null uint8
Cabin_notnull    891 non-null uint8
Cabin_null       891 non-null uint8
Pclass_1         891 non-null uint8
Pclass_2         891 non-null uint8
Pclass_3         891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
Age_scale        891 non-null float64
Fare_scale       891 non-null float64
dtypes: float64(4)



## get train data after preprocess

In [11]:
import preprocess_data as prepro
x_train, y_train, scale_param =  prepro.get_train_data('train.csv')



## Logistic Regression model

In [13]:
from sklearn import linear_model

lr = linear_model.LogisticRegression(tol=1e-6)

lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-06,
          verbose=0, warm_start=False)

## predict in test dataset

In [15]:
x_test = prepro.get_test_data('test.csv', scale_param)

prediction = lr.predict(x_test)

test_df = pd.read_csv('test.csv')

result = pd.DataFrame({'PassengerID':test_df.PassengerId.as_matrix(), 'Survived':prediction})

result.to_csv('lr_prediction.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
SibSp            418 non-null int64
Parch            418 non-null int64
Sex_female       418 non-null uint8
Sex_male         418 non-null uint8
Cabin_notnull    418 non-null uint8
Cabin_null       418 non-null uint8
Pclass_1         418 non-null uint8
Pclass_2         418 non-null uint8
Pclass_3         418 non-null uint8
Embarked_C       418 non-null uint8
Embarked_Q       418 non-null uint8
Embarked_S       418 non-null uint8
Age_scale        418 non-null float64
Fare_scale       418 non-null float64
dtypes: float64(2), int64(2), uint8(10)
memory usage: 17.2 KB




In [None]:
first submission: 0.75598