In [114]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [3]:
imputer = Imputer(missing_values='NaN',strategy='mean')

In [4]:
gender = pd.read_csv('gender_submission.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Test Data preprocessing

In [5]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
test.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [7]:
test.drop('Name', axis=1, inplace = True)
test.drop('Ticket', axis=1, inplace=True)
test.drop('Cabin',axis=1,inplace=True)
test.drop('PassengerId',axis=1,inplace=True)
test['Parch'].unique()

array([0, 1, 3, 2, 4, 6, 5, 9], dtype=int64)

In [8]:
test.tail()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
413,3,male,,0,0,8.05,S
414,1,female,39.0,0,0,108.9,C
415,3,male,38.5,0,0,7.25,S
416,3,male,,0,0,8.05,S
417,3,male,,1,1,22.3583,C


In [9]:
test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [10]:
test['Age'] = imputer.fit_transform(pd.DataFrame(test['Age']))
test['Fare'] = imputer.fit_transform(pd.DataFrame(test['Fare']))

In [11]:
test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [12]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()
test['Sex']=le.fit_transform(test['Sex'])

In [15]:
test['Fare'].max()

512.3292

In [16]:
test= pd.get_dummies(test, columns=['Embarked']) #since Country is the categorical data. If it is numeric we donot need to create dummies

In [17]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


# Train Data Preprocessing

In [18]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [19]:
train['Age'] = imputer.fit_transform(pd.DataFrame(train['Age']))

In [20]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [21]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [22]:
train.drop('Cabin', axis=1, inplace=True)
train.drop('Name', axis=1, inplace = True)
train.drop('Ticket', axis=1, inplace=True)
train.drop('PassengerId', axis=1, inplace=True)

In [23]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [24]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [25]:
freq_port = train.Embarked.dropna().mode()[0]
freq_port

'S'

In [26]:
train['Embarked'] = train['Embarked'].fillna(freq_port)

In [27]:
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [28]:
le = LabelEncoder()
train['Sex']=le.fit_transform(train['Sex'])

In [29]:
train= pd.get_dummies(train, columns=['Embarked']) #since Country is the categorical data. If it is numeric we donot need to create dummies

In [30]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [31]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

# Train Test Split

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
train.iloc[:,1:].head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,22.0,1,0,7.25,0,0,1
1,1,0,38.0,1,0,71.2833,1,0,0
2,3,0,26.0,0,0,7.925,0,0,1
3,1,0,35.0,1,0,53.1,0,0,1
4,3,1,35.0,0,0,8.05,0,0,1


In [58]:
X_train,X_test,y_train,y_test = train_test_split(train.iloc[:,1:],train.iloc[:,0],test_size=0.25,random_state=0)

In [59]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


# StandardScaler

In [60]:
from sklearn.preprocessing import StandardScaler

In [61]:
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)
sc_y = StandardScaler()
# Y_train = sc_y.fit_transform(pd.DataFrame(Y_train))

# Predictions

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
# X_train = train.iloc[:,1:]
# y_train = train.iloc[:,:1]

In [64]:
clf_lr = LogisticRegression()
clf_lr.fit(X_train,y_train)
y_pred = clf_lr.predict(X_test)

In [65]:
# X_test.head()

In [66]:
# y_test = gender['Survived']
y_test.shape

(223,)

In [67]:
y_pred.shape

(223,)

In [68]:
y_train.shape

(668,)

In [69]:
pd.DataFrame(list(zip(y_train, y_test)),columns=['Actual Output', 'Predicted Output']).head(10)

Unnamed: 0,Actual Output,Predicted Output
0,0,0
1,1,0
2,0,0
3,0,1
4,1,1
5,1,1
6,0,1
7,0,1
8,1,1
9,1,1


In [70]:
from sklearn.metrics import confusion_matrix

In [71]:
confusion_matrix(y_test,y_pred)

array([[116,  23],
       [ 23,  61]], dtype=int64)

# Cross Validataion

In [72]:
from sklearn.model_selection import cross_val_score

In [73]:
acc=cross_val_score(estimator=clf_lr,X=X_train,y=y_train,cv=10)

In [74]:
acc.mean()

0.7979873360470375

In [96]:
from sklearn.neighbors import KNeighborsClassifier

In [97]:
KNe = KNeighborsClassifier(n_neighbors=5)
KNe.fit(X_train,y_train)
y_pred = KNe.predict(X_test)
# pd.DataFrame(data=list(zip(y_test,y_pred)),columns=['y','y-hat'])
confusion_matrix(y_test,y_pred)

array([[117,  22],
       [ 23,  61]], dtype=int64)

In [98]:
acc = cross_val_score(estimator=KNe, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.8024423337856174

In [99]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [100]:
KNe.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = KNe.predict(test)
pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [80]:
from sklearn.svm import SVC

In [81]:
clf_SVC = SVC(kernel='linear')

In [82]:
clf_SVC.fit(X_train,y_train)
y_pred = clf_SVC.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[115,  24],
       [ 25,  59]], dtype=int64)

In [83]:
acc = cross_val_score(estimator=clf_SVC, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.7889868837630031

In [84]:
from sklearn.svm import SVC

In [85]:
clf_SVC = SVC() #default rbf
clf_SVC.fit(X_train,y_train)
y_pred = clf_SVC.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[121,  18],
       [ 25,  59]], dtype=int64)

# With Test Data

In [101]:
clf_SVC = SVC() #default rbf
clf_SVC.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = clf_SVC.predict(test)
y_pred.shape

(418,)

In [102]:
acc = cross_val_score(estimator=clf_SVC, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.8264360018091361

In [88]:
# pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [89]:
# clf_SVC = SVC(kernel='poly')
# clf_SVC.fit(X_train,y_train)
# y_pred = clf_SVC.predict(X_test)
# confusion_matrix(y_test,y_pred)

In [90]:
from sklearn.tree import DecisionTreeClassifier

In [91]:
DTre = DecisionTreeClassifier()
DTre.fit(X_train,y_train)
y_pred = DTre.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[114,  25],
       [ 29,  55]], dtype=int64)

In [65]:
acc = cross_val_score(estimator=DTre, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.7705197132616487

In [66]:
clf_SVC = SVC(kernel='sigmoid')
clf_SVC.fit(X_train,y_train)
y_pred = clf_SVC.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[142,  26],
       [ 36,  64]], dtype=int64)

In [67]:
acc = cross_val_score(estimator=clf_SVC, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.7206965245775729

In [103]:
from sklearn.ensemble import RandomForestClassifier

In [104]:
RFo = RandomForestClassifier()
RFo.fit(X_train,y_train)
y_pred = RFo.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[126,  13],
       [ 26,  58]], dtype=int64)

In [105]:
acc = cross_val_score(estimator=RFo, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.7949570330167346

In [106]:
RFo.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = RFo.predict(test)

In [107]:
pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [71]:
from sklearn.naive_bayes import GaussianNB

In [72]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[128,  40],
       [ 22,  78]], dtype=int64)

In [73]:
acc = cross_val_score(estimator=gnb, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.760917498719918

In [120]:
from sklearn.naive_bayes import GaussianNB #gaussian because it has gaussian curve

In [121]:
gnb = GaussianNB()
gnb.fit(X_train,y_train)
y_pred = gnb.predict(X_test)
confusion_matrix(y_test,y_pred)

array([[105,  34],
       [ 18,  66]], dtype=int64)

In [122]:
acc = cross_val_score(estimator=gnb, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

0.7830167345092718

In [123]:
# X = y_pred
# Y = gender['Survived']
# accuracy_score(X,Y)

ValueError: Found input variables with inconsistent numbers of samples: [223, 418]

In [142]:
from xgboost import XGBClassifier

In [143]:
clf_xg = XGBClassifier()
clf_xg.fit(X_train,y_train)
y_pred = clf_xg.predict(X_test)
confusion_matrix(y_test,y_pred)

  if diff:


array([[126,  13],
       [ 21,  63]], dtype=int64)

In [145]:
acc = cross_val_score(estimator=clf_xg, X=X_train, y=y_train, cv=10) #cv is no. of fold
acc.mean()

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.8218905472636816

In [146]:
X_test.shape

(223, 9)

In [147]:
y_test.shape

(223,)

# Test Data

In [79]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [None]:
# sc_x = StandardScaler()
# X_test = sc_x.fit_transform(test)
# X_test

In [None]:
# sc_x = StandardScaler()
# X_train = sc_x.fit_transform(X_train)
# X_test = sc_x.transform(test)
# sc_y = StandardScaler()
# y_test = sc_y.fit_transform(pd.DataFrame(gender['Survived']))

In [80]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [81]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [82]:
test.count()

Pclass        418
Sex           418
Age           418
SibSp         418
Parch         418
Fare          418
Embarked_C    418
Embarked_Q    418
Embarked_S    418
dtype: int64

In [83]:
gender.count()

PassengerId    418
Survived       418
dtype: int64

In [116]:
clf_SVC = SVC() #default rbf
clf_SVC.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = clf_SVC.predict(test)
# pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [None]:
X = y_pred
Y = gender['Survived']
accuracy_score(X,Y)

In [124]:
DTre.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = DTre.predict(test)
# pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [128]:
X = y_pred
Y = gender['Survived']
accuracy_score(X,Y)

0.8086124401913876

In [126]:
RFo.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = RFo.predict(test)
# pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [127]:
X = y_pred
Y = gender['Survived']
accuracy_score(X,Y)

0.8086124401913876

In [135]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [132]:
clf_lr.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = clf_lr.predict(test)
pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

In [130]:
X = y_pred
Y = gender['Survived']
accuracy_score(X,Y)

0.9497607655502392

In [131]:
pd.DataFrame(list(zip(X,Y)),columns=['Predicted', 'Actual'])

Unnamed: 0,Predicted,Actual
0,0,0
1,0,1
2,0,0
3,0,0
4,1,1
5,0,0
6,1,1
7,0,0
8,1,1
9,0,0


In [153]:
from xgboost import XGBClassifier

In [154]:
clf_xg = XGBClassifier()
clf_xg.fit(train.drop('Survived',axis=1), train['Survived'])
y_pred = clf_xg.predict(test)

pd.DataFrame(list(zip(gender['PassengerId'], y_pred)),columns=['PassengerId', 'Survived']).to_csv('submission.csv',index=False)

  if diff:


In [155]:
X = y_pred
Y = gender['Survived']
accuracy_score(X,Y)

0.8899521531100478

In [148]:
X.shape

(418,)

In [149]:
Y.shape

(418,)

In [152]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [151]:
acc = cross_val_score(estimator=clf_xg, X=test, y=Y, cv=10) #cv is no. of fold
acc.mean()

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


1.0