In [37]:
import os
import numpy
import pandas
import matplotlib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics.classification import classification_report, accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

cols = numpy.array(['PassengerId', 'Survived' 'Pclass', 'Name', 'Sex', 'Age',
                    'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])

# numeric columns
num_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [28]:
df_train = pandas.read_csv('./data/train.csv')
df_test = pandas.read_csv('./data/test.csv')

print(df_train.head(10))


   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   
5            6       3                                   Moran, Mr. James   
6            7       1                            McCarthy, Mr. Timothy J   
7            8       3                     Palsson, Master. Gosta Leonard   
8            9       3  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)   
9           10       2                Nasser, Mrs. Nicholas (Adele Achem)   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  f

In [20]:
X_train, y_train = df_train.loc[:, df_train.columns != 'Survived'], df_train.loc[:, df_train.columns == 'Survived']

X_test, y_test = df_test.loc[:, df_test.columns != 'Survived'], df_test.loc[:, df_test.columns == 'Survived']


  Cabin Embarked                                               Name     Sex  \
0   NaN        S                            Braund, Mr. Owen Harris    male   
1   C85        C  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2   NaN        S                             Heikkinen, Miss. Laina  female   
3  C123        S       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4   NaN        S                           Allen, Mr. William Henry    male   
5   NaN        Q                                   Moran, Mr. James    male   
6   E46        S                            McCarthy, Mr. Timothy J    male   
7   NaN        S                     Palsson, Master. Gosta Leonard    male   
8   NaN        S  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
9   NaN        C                Nasser, Mrs. Nicholas (Adele Achem)  female   

             Ticket  
0         A/5 21171  
1          PC 17599  
2  STON/O2. 3101282  
3            113803  
4            373450 

In [39]:
# numeric attributs
x_num_train = X_train[X_train.columns.difference(num_cols)]
x_num_test = X_test[X_test.columns.difference(num_cols)]

print(x_num_train.head(10))

  Cabin Embarked                                               Name     Sex  \
0   NaN        S                            Braund, Mr. Owen Harris    male   
1   C85        C  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2   NaN        S                             Heikkinen, Miss. Laina  female   
3  C123        S       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4   NaN        S                           Allen, Mr. William Henry    male   
5   NaN        Q                                   Moran, Mr. James    male   
6   E46        S                            McCarthy, Mr. Timothy J    male   
7   NaN        S                     Palsson, Master. Gosta Leonard    male   
8   NaN        S  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
9   NaN        C                Nasser, Mrs. Nicholas (Adele Achem)  female   

             Ticket  
0         A/5 21171  
1          PC 17599  
2  STON/O2. 3101282  
3            113803  
4            373450 

In [None]:

# scale to <0,1>
x_num_train = MinMaxScaler().fit(x_num_train)
x_num_test = MinMaxScaler().fit(x_num_test)
# max_train = numpy.amax(x_num_train, 0)
# max_test = numpy.amax(x_num_test, 0)        # not really needed
# 
# x_num_train = x_num_train / max_train
# x_num_test = x_num_test / max_train        # scale test by max_train

x_num_train.fillna(x_num_train.mean(), inplace=True)
x_num_test.fillna(x_num_test.mean(), inplace=True)

# labels or target attribute
y_train = y_train.astype(int)
y_test = y_test.astype(int)


print(x_num_train[:10])


  Cabin Embarked                                               Name     Sex  \
0   NaN        S                            Braund, Mr. Owen Harris    male   
1   C85        C  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2   NaN        S                             Heikkinen, Miss. Laina  female   
3  C123        S       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4   NaN        S                           Allen, Mr. William Henry    male   
5   NaN        Q                                   Moran, Mr. James    male   
6   E46        S                            McCarthy, Mr. Timothy J    male   
7   NaN        S                     Palsson, Master. Gosta Leonard    male   
8   NaN        S  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
9   NaN        C                Nasser, Mrs. Nicholas (Adele Achem)  female   

             Ticket  
0         A/5 21171  
1          PC 17599  
2  STON/O2. 3101282  
3            113803  
4            373450 

In [None]:

# categorical attributes
cat_train = X_train.drop(num_cols, axis=1)
cat_test = X_test.drop(num_cols, axis=1)

cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)

x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()


  Cabin Embarked                                               Name     Sex  \
0   NaN        S                            Braund, Mr. Owen Harris    male   
1   C85        C  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2   NaN        S                             Heikkinen, Miss. Laina  female   
3  C123        S       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4   NaN        S                           Allen, Mr. William Henry    male   
5   NaN        Q                                   Moran, Mr. James    male   
6   E46        S                            McCarthy, Mr. Timothy J    male   
7   NaN        S                     Palsson, Master. Gosta Leonard    male   
8   NaN        S  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
9   NaN        C                Nasser, Mrs. Nicholas (Adele Achem)  female   

             Ticket  
0         A/5 21171  
1          PC 17599  
2  STON/O2. 3101282  
3            113803  
4            373450 

In [15]:

# # vectorize (encode as one hot)
# vectorizer = DictVectorizer(sparse=False)
# vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
# vec_x_cat_test = vectorizer.transform(x_cat_test)
# 
# # build the feature vector
# x_train = numpy.hstack((x_num_train, vec_x_cat_train))
# x_test = numpy.hstack((x_num_test, vec_x_cat_test))

x_train = x_num_train
x_test = x_num_test

  Cabin Embarked                                               Name     Sex  \
0   NaN        S                            Braund, Mr. Owen Harris    male   
1   C85        C  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2   NaN        S                             Heikkinen, Miss. Laina  female   
3  C123        S       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4   NaN        S                           Allen, Mr. William Henry    male   
5   NaN        Q                                   Moran, Mr. James    male   
6   E46        S                            McCarthy, Mr. Timothy J    male   
7   NaN        S                     Palsson, Master. Gosta Leonard    male   
8   NaN        S  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female   
9   NaN        C                Nasser, Mrs. Nicholas (Adele Achem)  female   

             Ticket  
0         A/5 21171  
1          PC 17599  
2  STON/O2. 3101282  
3            113803  
4            373450 

In [16]:
model = LogisticRegression()
model.fit(x_train, y_train)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').