Acquiring the training and testing data.

In [1]:
# data importing
import pandas as pd
import numpy as np

train_dataset = pd.read_csv("train.csv")
test_dataset = pd.read_csv("test.csv")
passenger_id = test_dataset.iloc[:, 0].values

Analyzing the data

In [2]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
train_dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
train_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We have categorial as well as continious features.
Categorial: Survived, Pclass, Sex, Embarked
Continious: Age, Sibsp, Parch
Also, age count is 714 which represents we are having some missing values as (714 < 891)
Same for Embarked(889 < 891) and Cabin (204 < 891)

Cabin Data can be dropped as the available data is insufficient
Emabarked reprents the boarding port and it will not have the impact on survival as ticket system was there so people will be distributed randomly between class and cabins.

So we have to deal with the missing values of the age feature.

In [5]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer_test = SimpleImputer(missing_values=np.nan, strategy="mean")

In [6]:
train_dataset["Age"] = imputer.fit_transform(train_dataset[["Age"]]).ravel()
test_dataset["Age"] = imputer_test.fit_transform(test_dataset[["Age"]]).ravel()

In [7]:
train_dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Dropping features that are not participating in prediction

In [8]:
features_drop = ['PassengerId','Name', 'Ticket','Cabin', 'Fare', 'Parch', 'SibSp', 'Embarked']
train_dataset.drop(features_drop, axis = 1, inplace = True)
test_dataset.drop(features_drop, axis = 1, inplace = True)

In [9]:
test_dataset.describe()

Unnamed: 0,Pclass,Age
count,418.0,418.0
mean,2.26555,30.27259
std,0.841838,12.634534
min,1.0,0.17
25%,1.0,23.0
50%,3.0,30.27259
75%,3.0,35.75
max,3.0,76.0


In [10]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
dtypes: float64(1), int64(2), object(1)
memory usage: 27.9+ KB


In [11]:
X = train_dataset.iloc[:, 1:].values
Y = train_dataset.iloc[:, 0].values
X_test = test_dataset.iloc[:,:].values

In [12]:
# Encoding the Categorial Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])

labelEncoder_X2 = LabelEncoder()
X[:, 1] = labelEncoder_X2.fit_transform(X[:, 1])

labelEncoder_X_test = LabelEncoder()
X_test[:, 0] = labelEncoder_X_test.fit_transform(X_test[:, 0])

labelEncoder_X_test2 = LabelEncoder()
X_test[:, 1] = labelEncoder_X_test2.fit_transform(X_test[:, 1])


In [13]:
onehotencoder = OneHotEncoder(categorical_features = [0,1])
X = onehotencoder.fit_transform(X).toarray()

onehotencoder_test = OneHotEncoder(categorical_features = [0,1])
X_test = onehotencoder_test.fit_transform(X_test).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [17]:
print(X_test)

[[ 0.          0.          1.          0.          1.         34.5       ]
 [ 0.          0.          1.          1.          0.         47.        ]
 [ 0.          1.          0.          0.          1.         62.        ]
 ...
 [ 0.          0.          1.          0.          1.         38.5       ]
 [ 0.          0.          1.          0.          1.         30.27259036]
 [ 0.          0.          1.          0.          1.         30.27259036]]


Support Vector Machine Model

In [18]:
# Fitting classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = "linear", random_state = 0)
classifier.fit(X, Y)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [19]:
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [20]:
submission = pd.DataFrame({ 'PassengerId': passenger_id,
                            'Survived': y_pred })
submission.to_csv("submission.csv", index=False)

In [21]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
