**Titanic: Machine Learning from Disaster**

In [1]:
import pandas as pd

***READING THE TRAINING AND TESTING DATA FROM CSV FILES***

In [2]:
# Load the labelled training split and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
train_csv_path = r'C:\Users\spars\OneDrive\Desktop\titanic\train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Load the unlabelled test split (no 'Survived' column) and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
test_csv_path = r'C:\Users\spars\OneDrive\Desktop\titanic\test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


***CHECKING FOR NULL VALUES IN THE DATA***

In [4]:
# Print, for each split, how many missing values every column contains.
for split_name, split_frame in (("train", train_data), ("test", test_data)):
    print(f"{split_name} data with null value:\n", split_frame.isnull().sum())

train data with null value:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
test data with null value:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


***FILLING THE NULL VALUES AND CONVERTING THE CATEGORICAL DATA***

In [5]:
# Integer codes used for the two categorical columns (same codes as before).
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'Q': 0, 'S': 1, 'C': 2}


def _encode(column, codes):
    """Map the labels in `column` to integers using `codes`.

    Values that are not keys of `codes` are passed through unchanged, so
    re-running the cell on already-encoded data is harmless. The final
    `astype(int)` fails loudly if an unexpected label or a NaN is left over.
    """
    return column.map(lambda value: codes.get(value, value)).astype(int)


def _impute_and_encode(frame, age_fill, embarked_fill, fare_fill):
    """Return a copy of `frame` with nulls filled and categoricals encoded.

    Parameters
    ----------
    frame : DataFrame with 'Age', 'Embarked', 'Fare' and 'Sex' columns.
    age_fill, embarked_fill, fare_fill : values substituted for missing
        entries in the corresponding column.

    Returns
    -------
    A new DataFrame; `frame` itself is not modified.
    """
    cleaned = frame.copy()
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # that form operates on a temporary selection, is deprecated, and stops
    # updating the parent frame under pandas Copy-on-Write.
    cleaned['Age'] = cleaned['Age'].fillna(age_fill)
    cleaned['Embarked'] = cleaned['Embarked'].fillna(embarked_fill)
    cleaned['Fare'] = cleaned['Fare'].fillna(fare_fill)
    cleaned['Sex'] = _encode(cleaned['Sex'], SEX_CODES)
    cleaned['Embarked'] = _encode(cleaned['Embarked'], EMBARKED_CODES)
    return cleaned


# Imputation values come from the training split only and are reused for the
# test split, so both splits are transformed identically (previously test Age
# used the test mean while train Age used the train median).
age_fill = train_data['Age'].median()
embarked_fill = train_data['Embarked'].mode()[0]
fare_fill = train_data['Fare'].median()

train_data = _impute_and_encode(train_data, age_fill, embarked_fill, fare_fill)
test_data = _impute_and_encode(test_data, age_fill, embarked_fill, fare_fill)

***FINDING THE CORRELATION BETWEEN THE INDEPENDENT AND DEPENDENT VARIABLE(S)***

In [7]:
# Correlate the numeric columns only. The frame still holds free-text columns
# (Name, Ticket, Cabin), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently dropping it.
cor = train_data.select_dtypes(include='number').corr()
# Rank every numeric column by its correlation with the target.
cor['Survived'].sort_values(ascending=False)

Survived       1.000000
Sex            0.543351
Fare           0.257307
Embarked       0.125953
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.064910
Pclass        -0.338481
Name: Survived, dtype: float64

***FEATURE SELECTION***

In [8]:
# Columns fed to every model below; 'Survived' is the prediction target.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
y = train_data['Survived']
X = pd.get_dummies(train_data[features])
# The two splits are dummy-encoded independently, so force the test matrix
# onto the training columns (same set, same order). An indicator column that
# is absent from the test split is filled with 0.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

***LOGISTIC REGRESSION***

In [9]:
from sklearn.linear_model import LogisticRegression

# The features are not scaled anywhere in this notebook, so the solver can hit
# the default cap of 100 iterations before converging. Raise the cap; this has
# no effect when convergence happens earlier.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X, y)
log_pred = log_reg.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': log_pred})
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\logistic.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [10]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the logistic-regression model on the training
# data; the mean of the fold scores is reported.
scores_log = cross_val_score(estimator=log_reg, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_log.mean())

Cross-Validation Accuracy Score 0.7913188060379073


***NAIVE BAYES***

In [11]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training matrix and predict the test split.
gnb = GaussianNB().fit(X, y)
gnb_pred = gnb.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = test_data[['PassengerId']].assign(Survived=gnb_pred)
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\naive_baiyes.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [12]:
# 10-fold cross-validation of the naive Bayes model; report the mean fold score.
scores_gnb = cross_val_score(estimator=gnb, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_gnb.mean())

Cross-Validation Accuracy Score 0.7812308478038815


***K-NEAREST NEIGHBORS (KNN)***

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 7-nearest-neighbours classifier and predict the test split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X, y)
knn_pred = knn.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = test_data[['PassengerId']].assign(Survived=knn_pred)
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\knn.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [14]:
# 10-fold cross-validation of the KNN model; report the mean fold score.
scores_knn = cross_val_score(estimator=knn, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_knn.mean())

Cross-Validation Accuracy Score 0.7139893315174214


***DECISION TREE***

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree, the saved predictions and the
# cross-validation score are the same on every run. 7 is the seed already
# used by the bagging cell in this notebook.
dst = DecisionTreeClassifier(random_state=7)
dst.fit(X, y)
dst_pred = dst.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': dst_pred})
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\decsion_tree.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [16]:
# 10-fold cross-validation of the decision tree; report the mean fold score.
scores_dst = cross_val_score(estimator=dst, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_dst.mean())

Cross-Validation Accuracy Score 0.7991973101804563


***RANDOM FOREST***

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest, the saved predictions and the cross-validation
# score are the same on every run. 7 is the seed already used by the bagging
# cell in this notebook.
rmf = RandomForestClassifier(n_estimators=100, random_state=7)
rmf.fit(X, y)
rmf_pred = rmf.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': rmf_pred})
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\random_forest.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [18]:
# 10-fold cross-validation of the random forest; report the mean fold score.
scores_rmf = cross_val_score(estimator=rmf, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_rmf.mean())

Cross-Validation Accuracy Score 0.8182734649869481


***SVM***

In [19]:
from sklearn import svm

# Fit a linear-kernel support vector classifier and predict the test split.
sv = svm.SVC(kernel='linear').fit(X, y)
sv_pred = sv.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = test_data[['PassengerId']].assign(Survived=sv_pred)
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\svm.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [20]:
# 10-fold cross-validation of the linear SVM; report the mean fold score.
scores_svm = cross_val_score(estimator=sv, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_svm.mean())

Cross-Validation Accuracy Score 0.7866981613891727


***ENSEMBLE MODEL USING DECISION TREE & BAGGING CLASSIFIER***

In [21]:
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier

# `random_state` only has an effect when shuffling is enabled, and current
# scikit-learn raises ValueError if it is set while shuffle=False.
# NOTE(review): `kfold` is not used by any cell visible here (cross_val_score
# is called with cv=10); it is kept in case a later cell uses it.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
num_trees = 100

# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` and the old name was later removed.
# The positional form works on both older and current releases.
model = BaggingClassifier(dst, n_estimators=num_trees, random_state=7)
model.fit(X, y)
dstbc_pred = model.predict(X_test)

# Write the predictions as PassengerId / Survived columns.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': dstbc_pred})
output.to_csv(r'C:\Users\spars\OneDrive\Desktop\titanic\dstbc.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [22]:
# 10-fold cross-validation of the bagged decision trees; report the mean fold score.
scores_dstbc = cross_val_score(estimator=model, X=X, y=y, cv=10)
print('Cross-Validation Accuracy Score', scores_dstbc.mean())

Cross-Validation Accuracy Score 0.8183109181704686
