# Import libraries

In [1]:
import numpy as np 
import pandas as pd

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train=pd.read_csv("/kaggle/input/titanic/train.csv")
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Handling NA values

In [3]:
# check missing values

df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df=df_train.dropna()
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.nunique()

PassengerId    183
Survived         2
Pclass           3
Name           183
Sex              2
Age             63
SibSp            4
Parch            4
Ticket         127
Fare            93
Cabin          133
Embarked         3
dtype: int64

# Creating a new DF with relevant columns

In [6]:
# "survived" is our label 
# passenger_id and name are unique and won't help deduce patterns, so ignore those
# Add the rest to a new dataframe "df"

df=df.drop(["Name","PassengerId","Cabin","Ticket"], axis=1)

In [7]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,1,female,38.0,1,0,71.2833,C
3,1,1,female,35.0,1,0,53.1,S
6,0,1,male,54.0,0,0,51.8625,S
10,1,3,female,4.0,1,1,16.7,S
11,1,1,female,58.0,0,0,26.55,S


In [8]:
# count of unique values in each column
print(df.nunique())

Survived     2
Pclass       3
Sex          2
Age         63
SibSp        4
Parch        4
Fare        93
Embarked     3
dtype: int64


# Data Manipulation

In [9]:
# label encode all string categorical columns (embarked and sex)
# embarked : S,Q,C,na ->(0,1,2,3) respectively and sex: M,F (0,1)

embark=[]
for i in df["Embarked"]:
    if i=="S":
        embark.append(0)
    elif i=="Q":
        embark.append(1)
    elif i=="C":
        embark.append(2)
    else:
        embark.append(3)
sex=[]
for i in df["Sex"]:
    if i=='M':
        sex.append(0)
    else:
        sex.append(1)
        
df=df.drop(["Sex"], axis=1)
df=df.drop(["Embarked"], axis=1)

df["Sex"]=sex
df["Embarked"]=embark

In [10]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
1,1,1,38.0,1,0,71.2833,1,2
3,1,1,35.0,1,0,53.1,1,0
6,0,1,54.0,0,0,51.8625,1,0
10,1,3,4.0,1,1,16.7,1,0
11,1,1,58.0,0,0,26.55,1,0


In [11]:
# Normalizing the values in the dataframe
from sklearn import preprocessing

df_temp=df
scaler = preprocessing.MinMaxScaler()
names = df_temp.columns
#names.remove("Loan_ID")
d = scaler.fit_transform(df_temp)
df = pd.DataFrame(d, columns = names)

In [12]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,1.0,0.0,0.468892,0.333333,0.0,0.139136,0.0,1.0
1,1.0,0.0,0.430956,0.333333,0.0,0.103644,0.0,0.0
2,0.0,0.0,0.671219,0.0,0.0,0.101229,0.0,0.0
3,1.0,1.0,0.038948,0.333333,0.25,0.032596,0.0,0.0
4,1.0,0.0,0.721801,0.0,0.0,0.051822,0.0,0.0


# Model implementation

In [13]:
# Split the dataframe into data and labels
X_train = df.drop('Survived', axis=1) # data
y_train = df.Survived # labels

# Logistic Regression

In [14]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, metrics


log_model=LogisticRegression().fit(X_train, y_train)
acc_log = round(log_model.score(X_train, y_train) * 100, 2)

print("Accuracy: %s" % acc_log)

Accuracy: 69.95


# Decision Tree

In [15]:
# Decision Tree Classifier
from sklearn import tree

dt_model=tree.DecisionTreeClassifier().fit(X_train, y_train)
acc_dt=round(dt_model.score(X_train, y_train)*100, 2)

print("Accuracy: %s" % acc_dt)

Accuracy: 100.0


# SVM

In [16]:
# Using SVM
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svc=SVC(kernel='rbf', C=100.0) # change values of C=1,100,1000, kernel=linear and C=1.0,100,1000
svm_model=svc.fit(X_train,y_train)
acc_svm=round(svm_model.score(X_train, y_train)*100, 2)

print('Accuracy: %s' % acc_svm)

Accuracy: 79.23


In [17]:
# RBF Kernel is popular because of its similarity to K-Nearest Neighborhood Algorithm. 
# It has the advantages of K-NN and overcomes the space complexity problem as RBF Kernel Support Vector Machines 
# just needs to store the support vectors during training and not the entire dataset

# Prediction and Submission

In [18]:
test_df=pd.read_csv("/kaggle/input/titanic/test.csv")
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [19]:
df_model=pd.DataFrame()
df_model=test_df.drop(["Name","PassengerId","Cabin","Ticket"], axis=1)

In [20]:
embark=[]
for i in test_df["Embarked"]:
    if i=="S":
        embark.append(0)
    elif i=="Q":
        embark.append(1)
    elif i=="C":
        embark.append(2)
    else:
        embark.append(3)
sex=[]
for i in test_df["Sex"]:
    if i=='M':
        sex.append(0)
    else:
        sex.append(1)
        
df_model=df_model.drop(["Sex"], axis=1)
df_model=df_model.drop(["Embarked"], axis=1)

df_model["Sex"]=sex
df_model["Embarked"]=embark

In [21]:
df_temp=df_model
scaler = preprocessing.MinMaxScaler()
names = df_temp.columns
#names.remove("Loan_ID")
d = scaler.fit_transform(df_temp)
df_model = pd.DataFrame(d, columns = names)

In [22]:
df_model.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,1.0,0.452723,0.0,0.0,0.015282,0.0,0.5
1,1.0,0.617566,0.125,0.0,0.013663,0.0,0.0
2,0.5,0.815377,0.0,0.0,0.018909,0.0,0.5
3,1.0,0.353818,0.0,0.0,0.016908,0.0,0.0
4,1.0,0.287881,0.125,0.111111,0.023984,0.0,0.0


In [23]:
df_model.isnull().sum()

Pclass       0
Age         86
SibSp        0
Parch        0
Fare         1
Sex          0
Embarked     0
dtype: int64

In [24]:
#replace null value in Fare as 0
df_model["Fare"].fillna(0.5, inplace = True)
df_model["Age"].fillna(0.5, inplace = True)

In [25]:
predictions = dt_model.predict(df_model)
predictions= [int(x) for x in predictions]

In [26]:
len(predictions)

418

In [27]:
sub_csv=pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
sub_csv.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [28]:
submission=pd.DataFrame()
submission["PassengerId"]=test_df["PassengerId"]
submission["Survived"]=predictions
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [29]:
submission.to_csv("submission.csv", index=False)