# Data gathering/Acquisition

In [1]:
#Reading Datasets
#download the dataset here, train.csv and test.csv https://www.kaggle.com/competitions/titanic/data
import pandas as pd
traindata=pd.read_csv("train.csv")
traindata.head()
traindata.shape

(891, 12)

In [2]:
#checking for null data
traindata.isnull().sum().sort_values(ascending=False)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64

# Preparation of Data 

In [3]:
#removing some parts to dataset 
def process(data):
    #removing columns to minimize the parameters to check
    cleandata = data.drop(columns=["PassengerId","Name","Ticket","Cabin"],axis=1)
    
    return(cleandata)

traindata=process(traindata)

In [4]:
traindata.isnull().sum().sort_values(ascending=False)

Age         177
Embarked      2
Survived      0
Pclass        0
Sex           0
SibSp         0
Parch         0
Fare          0
dtype: int64

In [5]:
#filling up the null data with mean value of the age
col=["Age"]

for cell in col:
    traindata[cell].fillna(traindata[cell].mean(),inplace=True)
    

In [6]:
#check for changes in the null 
traindata.isnull().sum().sort_values(ascending=False)

Embarked    2
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [7]:
#removing 2 null values in Embarked column
traindata.Embarked.fillna("U", inplace=True)

In [8]:
traindata.isnull().sum().sort_values(ascending=False)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [9]:
# converting male = 1, female = 0 , C=0 , Q= 1 , S= 2, U=3
from sklearn import preprocessing
le= preprocessing.LabelEncoder()

cols=["Sex","Embarked"]

for col in cols:
    traindata[col]=le.fit_transform(traindata[col])
    print(le.classes_)
    
traindata.head()

['female' 'male']
['C' 'Q' 'S' 'U']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


#  Model Trial, Training, Testing

In [10]:
#splitting data into train and test 
from sklearn.model_selection import train_test_split
y= traindata["Survived"]
x= traindata.drop("Survived", axis=1)
x_train,x_test , y_train, y_test = train_test_split (x,y,
                                                     test_size= 0.2, #20% test size, 80% train size
                                                     random_state=150)

Support Vector Machine Model (default Parameters)

In [11]:
from sklearn import svm

#Create a svm Classifier
svmclf = svm.SVC() #no parameters, set to default

#Train the model using the training sets
svmclf.fit(x_train, y_train)

#Predict the response for test dataset
svm_pred = svmclf.predict(x_test)

from sklearn import metrics
# Model Accuracy Checking
print("SVM Accuracy:",metrics.accuracy_score(y_test, svm_pred)*100)

SVM Accuracy: 70.94972067039106


Logistic Regression Model

In [12]:
from sklearn.linear_model import LogisticRegression

#logistic regression model with 200 iteration 
lrclf= LogisticRegression(max_iter=200).fit(x_train,y_train) 
lr_pred= lrclf.predict(x_test)

print(" Logistics Regression Accuracy:",metrics.accuracy_score(y_test, lr_pred)*100) 

 Logistics Regression Accuracy: 85.47486033519553


Sequential Model 

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()

model.add(Dense(64,activation = 'relu', input_dim=7))
model.add(Dense(32,activation = 'relu' ))
model.add(Dense(16,activation = 'relu' ))
model.add(Dense(16,activation = 'relu' ))
model.add(Dense(1,activation = 'sigmoid' ))

In [14]:
#compiling the layers of the model with optimizer, loss, metrics 
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [15]:
#model fitting
model.fit(x_train, y_train, epochs=100, batch_size = 128)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1f1fda09450>

In [16]:
#model evaluation 
_, accuracy=model.evaluate(x_train,y_train)

print('Accuracy: %.2f' %(accuracy*100))
seqaccu=accuracy*100
model.summary()

Accuracy: 78.65
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                512       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 16)                272       
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 3,409
Trainable params: 3,409
Non-trainable params: 0
_________________________________________________________________


Perception Model

In [17]:
from sklearn.linear_model import Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
perpred = perceptron.predict(x_test)
peraccu=metrics.accuracy_score(y_test, perpred,)
print(peraccu*100)

70.94972067039106


# Hypertuning the best model


Super Vector Machine Parameters hypertuning

In [18]:
#import gridsearch
from sklearn.model_selection import GridSearchCV
#setting parameters to test for fitting
svmparam_grid = {'C':[0.1, 1, 10, 100, 1000],'gamma':[1,0.1,0.01,0.001],'kernel':['rbf']}
gridsvm=GridSearchCV(svmclf,svmparam_grid, refit = True, verbose=3)
gridsvm.fit(x_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.615 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.615 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.620 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.620 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.613 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.615 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.615 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.620 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.620 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.613 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.615 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [19]:
#display best performing parameters
print("best parameters: ",gridsvm.best_params_)
print("best estimators: ",gridsvm.best_estimator_)

best parameters:  {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
best estimators:  SVC(C=100, gamma=0.001)


In [20]:
tunedsvmclf = svm.SVC(C=100, gamma= 0.001, kernel='rbf') #with hypertuned parameters 

#Train the model using the training sets
tunedsvmclf.fit(x_train, y_train)

#Predict the response for test dataset
tunedsvm_pred = tunedsvmclf.predict(x_test)

from sklearn import metrics
# Model Accuracy Checking with new parameters 
print("With tuned parameters SVM Accuracy:",metrics.accuracy_score(y_test, tunedsvm_pred)*100)

tunedsvmaccu=(metrics.accuracy_score(y_test, tunedsvm_pred) * 100)

With tuned parameters SVM Accuracy: 87.70949720670392


Logistic Regression Hypertuning

In [21]:
#hyper tuning for logistic regression
import numpy as np
lrparam_grid = [ 
    {'penalty' : ['l1', 'l2'],
     'C' :np.logspace(-4, 4, 20),
     'solver' : ['liblinear','sag','saga'],
     'max_iter' : [100, 1000,2500, 5000]
    }
]
gridlr = GridSearchCV(LogisticRegression(), param_grid = lrparam_grid, cv = 3, verbose=True, n_jobs=-1)
gridlr.fit(x_train,y_train)

Fitting 3 folds for each of 480 candidates, totalling 1440 fits


240 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\hp\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver sag sup

In [22]:
#displaying best parameters
print("best parameters: ",gridlr.best_params_)
print("best estimators: ",gridlr.best_estimator_)

best parameters:  {'C': 0.615848211066026, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
best estimators:  LogisticRegression(C=0.615848211066026, penalty='l1', solver='liblinear')


In [23]:
#logistic regression model with best parameters in hypertuning
tunedlrclf= LogisticRegression(C= 0.615848211066026, max_iter= 100,penalty= 'l1', solver= 'liblinear').fit(x_train,y_train)

#using the model with new parameters for prediction
tunedlr_pred= tunedlrclf.predict(x_test)

# Model Accuracy Checking with new parameters 
print(" Logistics Regression Accuracy:",metrics.accuracy_score(y_test, tunedlr_pred)*100) 

tunedlraccu=metrics.accuracy_score(y_test, tunedlr_pred)*100

 Logistics Regression Accuracy: 86.03351955307262


# Summary of Results

ranking the four model with tuned parameters(except sequential and perception) used we have:

In [24]:
print("Support Vector Machine: ",tunedsvmaccu)
print("   Logistic Regression: ",tunedlraccu)
print("      sequential Model: ",seqaccu)
print("      Perception Model: ",peraccu*100 )

Support Vector Machine:  87.70949720670392
   Logistic Regression:  86.03351955307262
      sequential Model:  78.65168452262878
      Perception Model:  70.94972067039106


# Testing the Models using test Data

In [25]:
#Preprocessing test data 
testdata=pd.read_csv("test.csv")
yid=testdata["PassengerId"]  #storing passenger id to a variable for later use
print(yid)
testdata.head()

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [26]:
#removing the other columns based on our preprocessing in the training data
testdata=process(testdata)
testdata.head()
testdata.isnull().sum().sort_values(ascending=False)

Age         86
Fare         1
Pclass       0
Sex          0
SibSp        0
Parch        0
Embarked     0
dtype: int64

In [27]:
#Filling null values the average of the data where it belongs
col=["Age","Fare"]
for cell in col:
    testdata[cell].fillna(testdata[cell].mean(),inplace=True)

#converting text to numeric value same on the last preprocessing
cols=["Sex","Embarked"]

for col in cols:
    testdata[col]=le.fit_transform(testdata[col])
    print(le.classes_)
    
traindata.head()    
    
testdata.isnull().sum().sort_values(ascending=False) #Checking null values


['female' 'male']
['C' 'Q' 'S']


Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [28]:
#prediction of the three models used
svmprediction = tunedsvmclf.predict(testdata)
lrprediction  = tunedlrclf.predict(testdata)
seqprediction =  model.predict(testdata)
perprediction = perceptron.predict(testdata)

#rounding off the values of the sequential model prediction
seqprediction = np.array(seqprediction)
seqprediction = seqprediction.flatten()
seqprediction[seqprediction<0.5] = 0
seqprediction[seqprediction>0] = 1
seqprediction = seqprediction.astype(int)





# Saving prediction of the Models

In [29]:
#saving svm model prediction
df= pd.DataFrame({"PassengerId":yid.values,
                "Survived":svmprediction})

df.to_csv("latestsvmsubmission.csv",index=False)

In [30]:
#saving logistic regression prediction
df= pd.DataFrame({"PassengerId":yid.values,
                "Survived":lrprediction})

df.to_csv("latestlrsubmission.csv",index=False)

In [31]:
#saving sequential model prediction
df= pd.DataFrame({"PassengerId":yid.values,
                "Survived":seqprediction})

df.to_csv("latestseqsubmission.csv",index=False)

In [32]:
#saving sequential model prediction
df= pd.DataFrame({"PassengerId":yid.values,
                "Survived":perprediction})

df.to_csv("latestpersubmission.csv",index=False)

# Comparisons,Analysis, Conclusions

In [33]:
#creating table
models = pd.DataFrame({'Model':["SVM","Perceptron","Sequential","Logistic Regression"],
                       'Score': [tunedsvmaccu,peraccu*100,seqaccu,tunedlraccu],
                        'Kaggle Score:':[0.73205,0.65550,0.74401,0.76555],
                      "hypertuned?": ["Yes","No","No","Yes"]}
                     )
                    
models.sort_values(by = 'Score', ascending = False, ignore_index = True)

Unnamed: 0,Model,Score,Kaggle Score:,hypertuned?
0,SVM,87.709497,0.73205,Yes
1,Logistic Regression,86.03352,0.76555,Yes
2,Sequential,78.651685,0.74401,No
3,Perceptron,70.949721,0.6555,No


 Model Analysis and Comparisons
 
 As we can see in the summary of the result, the best accuracy goes to the Support Vector Machine, followed by logistic regression, sequential, and Perception model.
 
 Eventhough we know that the accuracy of the SVM shows a higher percentage, the most realible model here is the logistic regression. As I try to used the results of my prediction to check it in the Kaggle competition, Logistic Regression Model got the highest accuracy rate. 


Another observation is that the hypertuned model performs better since different parameters were tried just to find the highest accuracy possible.

Conclusions

Between machine learning(SVM,Logistic Regression) and nueral networks(sequential, Perceptron), I observed that Machine Learning models performs well compared to nueral networks in this type of problem. I actually think that the reason why the nueral networks have poor performance is because I lack familiarity to the models and how to use them.




P.S: I really love the programming exam and trying to engineered data into much readable values for the machine to analyzed. This exam surely made me realize the surface of the data science field and I am thrilled to learn as much as I can.

P.P.S: I'll try to create and customized this data even after the exam ends. I'll try to be on the Top 1% of the kaggle competition hehehehehe 


In [34]:
#Stephen Felix 
#love the exam but not fun because of the time constraints sana all full term
#Kaggle Novice 
