Importing Necessary Libraries.

In [None]:
#data analysis libraries 
import numpy as np
import pandas as pd

#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

Now read and explore the data using the pandas function `read_csv()`, followed by `head()`, `describe()` and `info()`.

In [None]:
# Load the Kaggle Titanic test and training splits from the competition input folder.
test = pd.read_csv("../input/titanic/test.csv")
train = pd.read_csv("../input/titanic/train.csv")

# Preview the first rows of the training data (displayed as the cell output).
train.head()

In [None]:
# Summary statistics for every column of the training data (numeric and
# non-numeric alike). The `count` row reports non-null entries, so a column
# whose count is below the row total contains missing values.
train.describe(include="all")

In [None]:
# Print column dtypes and non-null counts for the train and test frames,
# with a dashed line separating the two reports.
train.info()
print("----------------------------")
test.info()

**Selecting features and handling missing values:** Among the features, 19.8% of the values are missing in the Age feature, 77.1% are missing in the Cabin feature, and 0.22% are missing in the Embarked feature. We will drop the Cabin feature, because it would be hard (and likely incorrect) to fill in that much missing data. In the Embarked feature we will fill the missing values with the most frequent value of that feature. We will also drop the PassengerId, Name and Ticket features, treating them as irrelevant features.

In [None]:
# Cabin is mostly missing, so remove the column from both splits rather than impute it.
train = train.drop(columns=["Cabin"])
test = test.drop(columns=["Cabin"])



In [None]:
# Re-inspect both frames after the Cabin drop: Cabin should no longer be listed.
train.info()
print("----------------------------")
test.info()

In [None]:
# Columns the write-up treats as irrelevant for predicting survival;
# remove them from both splits in one pass each.
unused_columns = ["Name", "Ticket", "PassengerId"]
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# Re-inspect both frames: Name, Ticket and PassengerId should no longer be listed.
train.info()
print("----------------------------")
test.info()

Now we will fill the missing values of the Embarked feature with its most frequent value, which is "S", and we will fill the missing Age values with random integers drawn from within one standard deviation of the mean of the Age feature.

In [None]:
# Replace the missing Embarked entries in the training split with "S".
train.loc[train["Embarked"].isna(), "Embarked"] = "S"

In [None]:
# Re-inspect both frames: Embarked in the training frame should now have no missing entries.
train.info()
print("----------------------------")
test.info()

In [None]:
# Impute missing ages with random whole numbers drawn from
# [mean - std, mean + std) of the observed ages, separately for each split.

# Dedicated, seeded generator so the imputed ages are identical on every run
# without touching NumPy's global random state.
age_rng = np.random.default_rng(42)

# mean, std and number of missing Age values in the training split
average_age_train = train["Age"].mean()
std_age_train = train["Age"].std()
count_nan_train = train["Age"].isnull().sum()

# mean, std and number of missing Age values in the test split
average_age_test = test["Age"].mean()
std_age_test = test["Age"].std()
count_nan_test = test["Age"].isnull().sum()

# Integer bounds are made explicit (truncated to whole years); the upper bound is exclusive.
rand_1 = age_rng.integers(
    int(average_age_train - std_age_train),
    int(average_age_train + std_age_train),
    size=count_nan_train,
)
rand_2 = age_rng.integers(
    int(average_age_test - std_age_test),
    int(average_age_test + std_age_test),
    size=count_nan_test,
)

print(average_age_train)

# Write through .loc so the frames themselves are updated. The chained form
# `frame["Age"][mask] = ...` can assign into a temporary copy and leave the
# NaNs in place, which would make the integer cast below fail.
train.loc[train["Age"].isna(), "Age"] = rand_1
test.loc[test["Age"].isna(), "Age"] = rand_2

# convert from float to int
train["Age"] = train["Age"].astype(int)
test["Age"] = test["Age"].astype(int)


In [None]:
# Re-inspect both frames: Age should now have no missing entries and an integer dtype.
train.info()
print("----------------------------")
test.info()

In [None]:
# Family
#
# Collapse Parch and SibSp into a single binary flag: 1 if the passenger had
# any family member aboard (Parch + SibSp > 0), 0 otherwise. The flag is used
# to see whether travelling with family is related to survival.
#
# The flag is computed in one vectorised expression per frame. The earlier
# chained form (`frame['Family'].loc[mask] = 1`) can assign into a temporary
# copy and leave the raw counts untouched.
# Parch and SibSp themselves are left in place.
train["Family"] = (train["Parch"] + train["SibSp"]).gt(0).astype("int64")
test["Family"] = (test["Parch"] + test["SibSp"]).gt(0).astype("int64")


In [None]:
# Re-inspect both frames: the new Family column should now be listed in each.
train.info()
print("----------------------------")
test.info()

**Data Visualization:**
Analysis of the features using data visualization.

In [None]:
# Bar plot of survival by sex.
sns.barplot(data=train, x="Sex", y="Survived")

# Share of survivors (Survived == 1) among female passengers, as a percentage.
female_outcomes = train.loc[train["Sex"] == "female", "Survived"]
print("Percentage of females who survived:", female_outcomes.value_counts(normalize=True)[1] * 100)

# Same figure for male passengers.
male_outcomes = train.loc[train["Sex"] == "male", "Survived"]
print("Percentage of males who survived:", male_outcomes.value_counts(normalize=True)[1] * 100)

In [None]:
# Bar plot of survival for each port of embarkation.
sns.barplot(data=train, y="Survived", x="Embarked")

In [None]:
# Draw a histogram of the Age column of the training data, using 70 bins.
train['Age'].hist(bins=70)

In [None]:
# Bar plot of survival for passengers with (1) and without (0) family aboard.
sns.barplot(data=train, y="Survived", x="Family")

**Managing the Categorical data**

In [None]:
# Encode the categorical Sex and Embarked columns as integers.
#
# Each encoder is fitted on the training split only and then reused to
# transform the test split, so a given category always maps to the same
# integer in both frames. Fitting a separate encoder per split only yields
# matching codes when both splits happen to contain the same set of
# categories. A category that appears only in the test split now raises an
# error in `transform` instead of being silently mis-coded.
labelencoder_train_sex = LabelEncoder()
train['Sex'] = labelencoder_train_sex.fit_transform(train['Sex'])
test['Sex'] = labelencoder_train_sex.transform(test['Sex'])

labelencoder_train_embarked = LabelEncoder()
train['Embarked'] = labelencoder_train_embarked.fit_transform(train['Embarked'])
test['Embarked'] = labelencoder_train_embarked.transform(test['Embarked'])

# Former per-split encoder names, kept as aliases so existing references still resolve.
labelencoder_test_sex = labelencoder_train_sex
labelencoder_test_embarked = labelencoder_train_embarked




In [None]:
# Re-inspect both frames: Sex and Embarked should now be integer-coded columns.
train.info()
print("----------------------------")
test.info()

In [None]:
# Fill missing Fare values in the test split with the test-split mean fare.
# `fillna` returns the filled column, which is assigned back explicitly. The
# chained form `test["Fare"][mask] = ...` can write into a temporary copy
# and leave the NaN in place, which would make the integer cast below fail.
average_Fare_test = test["Fare"].mean()
test["Fare"] = test["Fare"].fillna(average_Fare_test)

# convert Fare from float to int (fractional part is truncated)
train['Fare'] = train['Fare'].astype(int)
test['Fare'] = test['Fare'].astype(int)


In [None]:
# Final inspection before modelling: Fare should now be integer-typed with no missing entries.
train.info()
print("----------------------------")
test.info()

**Classifier Comparison**

In [None]:
# Separate the target column from the training features; the test frame
# (which has no Survived column to remove) is used as the feature matrix as-is.
Y_train = train["Survived"]
X_train = train.drop(columns=["Survived"])
X_test = test

In [None]:
# Logistic Regression: fit on the training data, then predict the test set.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Score on the data the model was fitted on (shown as the cell output).
logreg.score(X_train, Y_train)

In [None]:
# Support Vector Machines
#
# The statements start at column 0, like the heading comment. With the
# comment unindented and the code indented by one space, the cell fails
# with "IndentationError: unexpected indent".

svc = SVC()

svc.fit(X_train, Y_train)

Y_pred = svc.predict(X_test)

# Score on the data the model was fitted on (shown as the cell output).
svc.score(X_train, Y_train)

In [None]:
# Random Forests
#
# random_state is fixed so the fitted forest, its predictions and the score
# below are the same on every run; without it each run builds a different forest.

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# Score on the data the model was fitted on (shown as the cell output).
random_forest.score(X_train, Y_train)

In [None]:
 # K-Nearest Neighbours with 3 neighbours: fit on the training data, predict the test set.
 knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
 Y_pred = knn.predict(X_test)

 # Score on the data the model was fitted on (shown as the cell output).
 knn.score(X_train, Y_train)

In [None]:
# Gaussian Naive Bayes
#
# The statements start at column 0, like the heading comment. With the
# comment unindented and the code indented by one space, the cell fails
# with "IndentationError: unexpected indent".

gaussian = GaussianNB()

gaussian.fit(X_train, Y_train)

Y_pred = gaussian.predict(X_test)

# Score on the data the model was fitted on (shown as the cell output).
gaussian.score(X_train, Y_train)

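| **Classifier** | **Training accuracy** |
|---|---|
| Logistic Regression | 80.24% |
| SVM | 90.12% |
| Random Forest | 96.96% |
| K Nearest Neighbour | 84.06% |
| Gaussian Naïve Bayes | 79.01% |

Note: each figure comes from `score(X_train, Y_train)`, i.e. it is measured on the same data the model was fitted on. These numbers therefore overstate how well the models generalise to unseen passengers, particularly for flexible models such as Random Forest.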


In [None]:
# Random Forests (final model)
#
# Y_pred has been overwritten by the classifiers fitted after the first
# random-forest cell, so the forest is fitted again here to make Y_pred hold
# its predictions for the submission. random_state is fixed so the submitted
# predictions are the same on every run.

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

# Score on the data the model was fitted on (shown as the cell output).
random_forest.score(X_train, Y_train)



In [None]:
# Build the submission file.
#
# A Kaggle Titanic submission needs a PassengerId column next to Survived, but
# PassengerId was dropped from `test` before modelling. The ids are therefore
# re-read from the original test CSV. Only columns were dropped from `test`
# (no rows removed or reordered), so the ids line up with Y_pred row by row.
passenger_ids = pd.read_csv("../input/titanic/test.csv", usecols=["PassengerId"])["PassengerId"]

submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Survived": Y_pred,
})
submission.to_csv('titanic.csv', index=False)