In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Load the Kaggle Titanic train and test CSV files into DataFrames.
raw_data = pd.read_csv('/kaggle/input/titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

# DataFrame.copy() is deep by default, so data1 can be modified without touching raw_data.
data1 = raw_data.copy()
# Both working frames bundled in one list (not referenced again in the visible cells).
data_cleaner = [data1, test_data]

In [None]:
# Preview the first rows of the working copy of the training data.
data1.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Count missing values per column in the training copy.
data1.isnull().sum()

In [None]:
# Count missing values per column in the test data.
test_data.isnull().sum()

In [None]:
# Build the feature frame X and the target y used for training and evaluation.
# .copy() makes X an independent frame rather than a slice of data1, so the
# column assignments performed on X in later cells modify X itself and do not
# trigger SettingWithCopyWarning.
X = data1[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].copy()
y = data1['Survived']

In [None]:
# Impute missing ages with the median of the observed ages.
age_median = X['Age'].median()
X['Age'] = X['Age'].fillna(age_median)
# Show the first ten rows after the fill.
X.head(10)

In [None]:
# Impute missing 'Embarked' values with the most frequent value (the mode).
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on X['Embarked'] is a chained in-place operation, which
# pandas warns about and which leaves X unchanged when Copy-on-Write is enabled.
most_common_port = X['Embarked'].mode()[0]
X['Embarked'] = X['Embarked'].fillna(most_common_port)
# Confirm how many missing values remain per column.
X.isnull().sum()

In [None]:
# Engineer two family features:
#   FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger)
#   IsAlone    = 1 by default, 0 for every row whose FamilySize exceeds 1
X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
X['IsAlone'] = 1
# Use one .loc call with both the row mask and the column label. The chained
# form X['IsAlone'].loc[mask] = 0 writes to a temporary Series and is silently
# lost under pandas Copy-on-Write.
X.loc[X['FamilySize'] > 1, 'IsAlone'] = 0

In [None]:
#performing one hot encoding
# No `columns` argument is passed, so pd.get_dummies picks the columns to encode by dtype
# (presumably 'Sex' and 'Embarked' here -- confirm against the data) and leaves the rest as-is.
X = pd.get_dummies(X)
# Display the encoded feature frame.
X

In [None]:
# Display the target Series (the 'Survived' column of data1).
y

In [None]:
# Display the working copy of the training data.
data1

In [None]:
#splitting data for test and train
# No test_size/train_size is given, so scikit-learn's default split proportion applies;
# random_state=0 fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

In [None]:
# Scatter matrix of the training features, with points coloured by the target,
# to look for visible patterns.
from matplotlib import cm
from pandas.plotting import scatter_matrix
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap remains the supported way to look up a colormap by name.
cmap = plt.get_cmap('gnuplot')
scatter = scatter_matrix(X_train, c=y_train, marker='o', s=40, hist_kwds={'bins':15}, figsize=(18,18), cmap=cmap)

In [None]:
# 3-D scatter of Pclass, Age and Fare for the training rows, coloured by the target.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection = '3d')
ax.scatter(
    X_train['Pclass'],
    X_train['Age'],
    X_train['Fare'],
    c = y_train,
    marker = 'o',
    s=100,
)
# Label all three axes in a single call.
ax.set(xlabel='Pclass', ylabel='Age', zlabel='Fare')
plt.show()

In [None]:
# Baseline: logistic regression fitted on the unscaled training split and
# scored on both the training and the held-out split.
from sklearn.linear_model import LogisticRegression

clf_half = LogisticRegression(max_iter=400)
clf_half.fit(X_train, y_train)
print('LOGISTIC REGRESSION on split data')
clf_half_train_acc = clf_half.score(X_train, y_train)
print('Split data : Score on training set : {:.2f}'.format(clf_half_train_acc))
clf_half_test_acc = clf_half.score(X_test, y_test)
print('Split data : Score on testing  set : {:.2f}'.format(clf_half_test_acc))

In [None]:
# Logistic regression refitted on every labelled row; with no held-out rows
# left, only the training score is reported.
clf_full = LogisticRegression(max_iter=400)
clf_full.fit(X, y)
print('LOGISTIC REGRESSION on full data')
clf_full_train_acc = clf_full.score(X, y)
print('Full data : Score on training set : {:.2f}'.format(clf_full_train_acc))

In [None]:
# Rescale the features with MinMaxScaler, then repeat the split-data logistic regression.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf_half_scaled = LogisticRegression()
clf_half_scaled.fit(X_train_scaled, y_train)
print('LOGISTIC REGRESSION normalized on split data')
clf_half_scaled_train_acc = clf_half_scaled.score(X_train_scaled, y_train)
print('Split data : Score on training set : {:.2f}'.format(clf_half_scaled_train_acc))
clf_half_scaled_test_acc = clf_half_scaled.score(X_test_scaled, y_test)
print('Split data : Score on testing  set : {:.2f}'.format(clf_half_scaled_test_acc))

In [None]:
# Logistic regression on the full training set after MinMax scaling.
# A separate scaler is fitted on all of X so it can be reused for the final test set.
scaler_full = MinMaxScaler()
scaler_full.fit(X)
X_scaled = scaler_full.transform(X)

clf_full_scaled = LogisticRegression()
clf_full_scaled.fit(X_scaled, y)
print('LOGISTIC REGRESSION normalized on full data')
clf_full_scaled_train_acc = clf_full_scaled.score(X_scaled, y)
print('Full data : Score on training set : {:.2f}'.format(clf_full_scaled_train_acc))

In [None]:
# RBF-kernel support vector classifier (C=40) on the MinMax-scaled split data.
from sklearn.svm import SVC

svm_classifier = SVC(kernel='rbf', C=40)
svm_classifier.fit(X_train_scaled, y_train)
print('SVM using RBF on split data')
svm_rbf_train_acc = svm_classifier.score(X_train_scaled, y_train)
print('Split data : Score on training set : {:.2f}'.format(svm_rbf_train_acc))
svm_rbf_test_acc = svm_classifier.score(X_test_scaled, y_test)
print('Split data : Score on testing  set : {:.2f}'.format(svm_rbf_test_acc))

In [None]:
# RBF-kernel support vector classifier (C=50) fitted on the full scaled training set.
svm_classifier_full = SVC(kernel='rbf', C=50)
svm_classifier_full.fit(X_scaled, y)
print('SVM using RBF on full data')
svm_rbf_full_train_acc = svm_classifier_full.score(X_scaled, y)
print('Full data : Score on training set : {:.2f}'.format(svm_rbf_full_train_acc))

In [None]:
# Linear-kernel support vector classifier (C=50) on the MinMax-scaled split data.
svm_classifier_linear = SVC(kernel='linear', C=50)
svm_classifier_linear.fit(X_train_scaled, y_train)
print('SVM linear on split data')
svm_linear_train_acc = svm_classifier_linear.score(X_train_scaled, y_train)
print('Split data : Score on training set : {:.2f}'.format(svm_linear_train_acc))
svm_linear_test_acc = svm_classifier_linear.score(X_test_scaled, y_test)
print('Split data : Score on testing  set : {:.2f}'.format(svm_linear_test_acc))

In [None]:
# Linear-kernel support vector classifier (C=50) fitted on the full scaled
# training set; only the training score is reported.
svm_classifier_full_linear = SVC(kernel='linear',C=50).fit(X_scaled,y)
# The header previously read "SVM using RBF", which mislabelled this
# linear-kernel model in the output.
print('SVM linear on full data')
print('Full data : Score on training set : {:.2f}'.format(svm_classifier_full_linear.score(X_scaled, y)))

In [None]:
#copy of actual test data
# deep=True gives testdata its own data, so changes made to it do not affect test_data.
testdata = test_data.copy(deep=True)
# Display the copied frame.
testdata

In [None]:
# Prepare the Kaggle test set with the same steps used for the training features X:
# select columns, impute Age (median) and Embarked (mode), add FamilySize / IsAlone,
# then one-hot encode. Fare is not imputed here; a later cell fills it.
# .copy() makes X_testdata independent of testdata so the assignments below
# modify it directly instead of a slice.
X_testdata = testdata[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].copy()
X_testdata['Age'] = X_testdata['Age'].fillna(X_testdata['Age'].median())
# Assign the filled column back; a chained fillna(inplace=True) on the column
# does not update the frame when pandas Copy-on-Write is enabled.
X_testdata['Embarked'] = X_testdata['Embarked'].fillna(X_testdata['Embarked'].mode()[0])
X_testdata['FamilySize'] = X_testdata['SibSp'] + X_testdata['Parch'] + 1
X_testdata['IsAlone'] = 1
# Single .loc call (row mask + column label) instead of chained assignment,
# which would write to a temporary Series.
X_testdata.loc[X_testdata['FamilySize'] > 1, 'IsAlone'] = 0
X_testdata = pd.get_dummies(X_testdata)
# Align columns (names and order) with the encoded training features so the
# fitted scalers and models receive the layout they were trained on; a dummy
# column absent from the test set is added as all zeros.
X_testdata = X_testdata.reindex(columns=X.columns, fill_value=0)

In [None]:
# Count remaining missing values per column in the prepared test features.
X_testdata.isnull().sum()

In [None]:
# Fare is not imputed by the test-set preprocessing cell; fill any gaps with the column mean.
test_fare_mean = X_testdata['Fare'].mean()
X_testdata['Fare'] = X_testdata['Fare'].fillna(test_fare_mean)

In [None]:
# Re-count missing values per column in the test features after the Fare fill.
X_testdata.isnull().sum()

In [None]:
# Count missing values per column in the training features (presumably for comparison with the test features).
X.isnull().sum()

In [None]:
# Write a submission file from the logistic regression trained on the full, scaled training set.
# clf_full_scaled was fitted on X_scaled (the output of scaler_full), so the
# test features must pass through that same fitted scaler before predict().
# Feeding it the unscaled X_testdata puts Age/Fare on a different scale from
# the one the model was trained on.
predictions_logreg = clf_full_scaled.predict(scaler_full.transform(X_testdata))
output_logreg = pd.DataFrame({'PassengerId': testdata.PassengerId, 'Survived': predictions_logreg})
output_logreg.to_csv('my_submission_logistic_regression.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Write a submission file from the RBF SVM trained on the full, scaled training set.
# svm_classifier_full was fitted on X_scaled (the output of scaler_full), so
# the test features are transformed with the same fitted scaler before
# predict(); the unscaled frame is on a different scale from the training data.
predictions_svm = svm_classifier_full.predict(scaler_full.transform(X_testdata))
output_svm = pd.DataFrame({'PassengerId': testdata.PassengerId, 'Survived': predictions_svm})
output_svm.to_csv('my_submission_svm_1.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Write a submission file from the RBF SVM trained on the scaled training split.
# svm_classifier was fitted on X_train_scaled, produced by `scaler` (fitted on
# X_train only), so that scaler -- not scaler_full -- is applied to the test
# features before predict().
predictions_svm_1 = svm_classifier.predict(scaler.transform(X_testdata))
output_svm_1 = pd.DataFrame({'PassengerId': testdata.PassengerId, 'Survived': predictions_svm_1})
output_svm_1.to_csv('my_submission_svm_2.csv', index=False)
print("Your submission was successfully saved!")