In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')


In [3]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
test.drop(columns=['Cabin'],inplace=True)
train.drop(columns=['Cabin'],inplace=True)

In [6]:
train.fillna(train['Age'].mean(),inplace=True)

In [7]:
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [8]:
test.fillna(test['Age'].mean(),inplace=True)

In [9]:
test.fillna(test['Fare'].mean(),inplace=True)

In [10]:
test['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [11]:
#Replace embarked with 'S' as it is more
train['Embarked'].fillna('S',inplace=True)

In [12]:
train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [13]:
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [14]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [15]:
#After EDA we can say that Name,Ticket and passengerId are not required
passengerId=test['PassengerId']
#dropping Name,Ticket and passengerId from test and train
train.drop(columns=['Name','PassengerId','Ticket'],inplace=True)
test.drop(columns=['Name','PassengerId','Ticket'],inplace=True)

In [16]:
train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C


In [17]:
test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.50000,0,0,7.8292,Q
1,3,female,47.00000,1,0,7.0000,S
2,2,male,62.00000,0,0,9.6875,Q
3,3,male,27.00000,0,0,8.6625,S
4,3,female,22.00000,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,30.27259,0,0,8.0500,S
414,1,female,39.00000,0,0,108.9000,C
415,3,male,38.50000,0,0,7.2500,S
416,3,male,30.27259,0,0,8.0500,S


In [18]:
#Handling categorical variable
#As the algorithm understands only numeric value so for that we have to perform encoding techniques on categorical variable
train=pd.get_dummies(train,columns=['Pclass','Sex','Embarked'],drop_first=True)
test=pd.get_dummies(test,columns=['Pclass','Sex','Embarked'],drop_first=True)

In [19]:
train.drop(columns=['Embarked_C'],inplace=True)

In [20]:
test

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,34.50000,0,0,7.8292,0,1,1,1,0
1,47.00000,1,0,7.0000,0,1,0,0,1
2,62.00000,0,0,9.6875,1,0,1,1,0
3,27.00000,0,0,8.6625,0,1,1,0,1
4,22.00000,1,1,12.2875,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
413,30.27259,0,0,8.0500,0,1,1,0,1
414,39.00000,0,0,108.9000,0,0,0,0,0
415,38.50000,0,0,7.2500,0,1,1,0,1
416,30.27259,0,0,8.0500,0,1,1,0,1


In [21]:
X=train.drop(columns=['Survived'],axis=1)

In [22]:
X

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,22.000000,1,0,7.2500,0,1,1,0,1
1,38.000000,1,0,71.2833,0,0,0,0,0
2,26.000000,0,0,7.9250,0,1,0,0,1
3,35.000000,1,0,53.1000,0,0,0,0,1
4,35.000000,0,0,8.0500,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
886,27.000000,0,0,13.0000,1,0,1,0,1
887,19.000000,0,0,30.0000,0,0,0,0,1
888,29.699118,1,2,23.4500,0,1,0,0,1
889,26.000000,0,0,30.0000,0,0,1,0,0


In [23]:
y=train['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [24]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [25]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
model= DecisionTreeClassifier()

In [28]:
model.fit(X_train,y_train)

In [29]:
y_pred=model.predict(X_test)

In [30]:
accuracy_score(y_pred,y_test)

0.8044692737430168

In [31]:
test

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,34.50000,0,0,7.8292,0,1,1,1,0
1,47.00000,1,0,7.0000,0,1,0,0,1
2,62.00000,0,0,9.6875,1,0,1,1,0
3,27.00000,0,0,8.6625,0,1,1,0,1
4,22.00000,1,1,12.2875,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
413,30.27259,0,0,8.0500,0,1,1,0,1
414,39.00000,0,0,108.9000,0,0,0,0,0
415,38.50000,0,0,7.2500,0,1,1,0,1
416,30.27259,0,0,8.0500,0,1,1,0,1


In [32]:
Xf=test.iloc[:,:].values

In [33]:
Xf=scaler.transform(Xf)



In [34]:
y_final=model.predict(Xf)

In [35]:
y_final.shape

(418,)

In [36]:
final_pred=pd.DataFrame()

In [37]:
final_pred

In [38]:
final_pred['PassengerId']=passengerId
final_pred['Survived']=y_final

In [39]:
final_pred

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [40]:
final_pred.to_csv('Submission.csv',index=False)

In [41]:
Xf

array([[0.4282483 , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.58532295, 0.125     , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.77381252, 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.47851219, 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.3751268 , 0.        , 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.3751268 , 0.125     , 0.2       , ..., 1.        , 0.        ,
        0.        ]])

In [42]:
model_new=LogisticRegression()

In [43]:
model_new.fit(X_train,y_train)

In [44]:
y_pred_new=model_new.predict(X_test)

In [45]:
accuracy_score(y_pred_new,y_test)

0.8603351955307262

In [46]:
y_final_new=model_new.predict(Xf)

In [47]:
y_final_new.shape

(418,)

In [48]:
final_pred_new=pd.DataFrame()

In [49]:
final_pred_new['PassengerId']=passengerId
final_pred_new['Survived']=y_final_new

In [50]:
final_pred_new

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [51]:
final_pred_new.to_csv("NewSubmission.csv",index=False)

In [52]:
from xgboost import XGBClassifier

my_model = XGBClassifier()
my_model.fit(X_train, y_train)

In [53]:
prediction=my_model.predict(X_test)

In [54]:
from sklearn import metrics

In [55]:
print(metrics.classification_report(prediction, y_test))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87       116
           1       0.75      0.81      0.78        63

    accuracy                           0.84       179
   macro avg       0.82      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



In [56]:
accuracy_score(prediction, y_test)

0.8379888268156425