In [48]:
import pandas as pd
import matplotlib.pyplot as plt

In [49]:
c_train = pd.read_csv('train.csv')
c_test = pd.read_csv('test.csv')

In [50]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1.0,0,7.25,S
1,1,1,female,38.0,1.0,0,71.2833,C
2,1,3,female,26.0,0.0,0,7.925,S
3,1,1,female,35.0,1.0,0,53.1,S
4,0,3,male,35.0,0.0,0,8.05,S


In [51]:
train = c_train.drop('Cabin', axis=1)
test = c_test.drop('Cabin', axis=1)

In [52]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [53]:
# Using KNN Imputer to fill the missing values

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)

In [54]:
train[['Age','SibSp']] = imputer.fit_transform(train[['Age','SibSp']])
test[['Age','SibSp']] = imputer.fit_transform(test[['Age','SibSp']])

In [55]:
train["Embarked"] = train["Embarked"].fillna('U')
test["Embarked"] = test["Embarked"].fillna('U')

In [56]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0.0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0.0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0.0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,30.0,1.0,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0.0,0,111369,30.0000,C


# Feature Engineering

In [57]:
train = train.drop(['Name', 'Ticket', 'PassengerId'],axis=1)
test = test.drop(['Name', 'Ticket', 'PassengerId'],axis=1)

In [58]:
train_sex = pd.get_dummies(train["Sex"],dtype=float)
test_sex = pd.get_dummies(test["Sex"],dtype=float)

In [59]:
train_emb = pd.get_dummies(train["Embarked"], prefix="Embarked",dtype=float)
test_emb = pd.get_dummies(test["Embarked"], prefix="Embarked",dtype=float)

In [60]:
main_train = pd.concat([train, train_emb,train_sex], axis=1)
main_test = pd.concat([test, test_emb,test_sex], axis=1)

In [61]:
main_train = main_train.drop(['Sex','Embarked'],axis= 1)
main_test = main_test.drop(['Sex','Embarked'],axis= 1)

In [62]:
x = main_train.drop('Survived',axis=1)
y = main_train['Survived']

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
# Splitting The Data Set
x_train, x_valid, y_train, y_valid= train_test_split(x, y, test_size=0.2)

In [65]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [66]:
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# Using different Algorithms to check for the Best Score

In [67]:
pipe.fit(x_train, y_train)

In [68]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier, AdaBoostClassifier

In [69]:
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)

In [70]:
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

In [71]:
svm_model = SVC()
svm_model.fit(x_train, y_train)

In [72]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)

In [73]:
gb_model = GradientBoostingClassifier()
gb_model.fit(x_train, y_train)

In [74]:
ab_model = AdaBoostClassifier()
ab_model.fit(x_train, y_train)

In [75]:
from sklearn.metrics import f1_score

In [76]:
accuracy = pipe.score(x_valid, y_valid)
lr_predicted_labels = pipe.predict(x_valid)

print("Accuracy Logistic Regression: %.2f" % accuracy)
print("F1 Score: %.2f" % f1_score(y_valid, lr_predicted_labels))

Accuracy Logistic Regression: 0.80
F1 Score: 0.74


In [77]:
print("Accuracy DecisionTreeClassifier: %.2f" % dt_model.score(x_valid, y_valid))
dt_predicted_labels = dt_model.predict(x_valid)
print("F1 Score: %.2f" % f1_score(y_valid, dt_predicted_labels))

Accuracy DecisionTreeClassifier: 0.79
F1 Score: 0.75


In [78]:
print("Accuracy Gaussian NB: %.2f" % nb_model.score(x_valid, y_valid))
nb_predicted_labels = nb_model.predict(x_valid)
print("F1 Score: %.2f" % f1_score(y_valid, nb_predicted_labels))

Accuracy Gaussian NB: 0.78
F1 Score: 0.72


In [79]:
print("Accuracy SVM: %.2f" % svm_model.score(x_valid, y_valid))
svm_predicted_labels = svm_model.predict(x_valid)
print("F1 Score: %.2f" % f1_score(y_valid, svm_predicted_labels))

Accuracy SVM: 0.66
F1 Score: 0.42


In [80]:
print("Random Forest Classifier: %.2f" % rf_model.score(x_valid, y_valid))
rf_predicted_labels = rf_model.predict(x_valid)
print("F1 Score: %.2f" % f1_score(y_valid, rf_predicted_labels))

Random Forest Classifier: 0.78
F1 Score: 0.73


In [81]:
print("Gradient Boosting Classifier: %.2f" % gb_model.score(x_valid, y_valid))
gb_predicted_labels = gb_model.predict(x_valid)
print("F1 Score: %.2f" % f1_score(y_valid, gb_predicted_labels))

Gradient Boosting Classifier: 0.84
F1 Score: 0.79


In [82]:
print(" AdaBoost Classifier: %.2f" % ab_model.score(x_valid, y_valid))
ab_predicted_labels = ab_model.predict(x_valid)
print("F1 Score: %.2f" % f1_score(y_valid, ab_predicted_labels))

 AdaBoost Classifier: 0.77
F1 Score: 0.71


In [83]:
main_test['Embarked_U'] = 0.0
salary_col = main_test.pop('Embarked_U')
main_test.insert(8, 'Embarked_U', salary_col)

In [84]:
main_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,female,male
0,3,34.50,0.0,0,7.8292,0.0,1.0,0.0,0.0,0.0,1.0
1,3,47.00,1.0,0,7.0000,0.0,0.0,1.0,0.0,1.0,0.0
2,2,62.00,0.0,0,9.6875,0.0,1.0,0.0,0.0,0.0,1.0
3,3,27.00,0.0,0,8.6625,0.0,0.0,1.0,0.0,0.0,1.0
4,3,22.00,1.0,1,12.2875,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
413,3,48.25,0.0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
414,1,39.00,0.0,0,108.9000,1.0,0.0,0.0,0.0,1.0,0.0
415,3,38.50,0.0,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
416,3,48.25,0.0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0


In [85]:
# Dropping The NaN value causing error while fitting the test data
main_test = main_test.dropna()

In [86]:
main_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,female,male
0,3,34.50,0.0,0,7.8292,0.0,1.0,0.0,0.0,0.0,1.0
1,3,47.00,1.0,0,7.0000,0.0,0.0,1.0,0.0,1.0,0.0
2,2,62.00,0.0,0,9.6875,0.0,1.0,0.0,0.0,0.0,1.0
3,3,27.00,0.0,0,8.6625,0.0,0.0,1.0,0.0,0.0,1.0
4,3,22.00,1.0,1,12.2875,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
413,3,48.25,0.0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
414,1,39.00,0.0,0,108.9000,1.0,0.0,0.0,0.0,1.0,0.0
415,3,38.50,0.0,0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
416,3,48.25,0.0,0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0


In [87]:
Survived = gb_model.predict(main_test)

In [91]:
output = pd.concat([c_test['PassengerId'], pd.Series(Survived)], axis=1)

In [93]:
output.to_csv('output.csv')