In [346]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

train_df = pd.read_csv('train.csv') #Read training datadet in dataframe
test_df = pd.read_csv('test.csv')   #Read testing dataset in dataframe

train_df.info()                    #Get the infomation about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [347]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [348]:
#Drop the columns which are not useful for analysis from both training and testing dataframes
train_df = train_df.drop(['PassengerId','Name','Ticket','Embarked','Cabin'], axis=1)
test_df = test_df.drop(['Name','Ticket','Embarked','Cabin'], axis=1)

###### Question: If you have excluded any attributes from the analysis, provide an explanation why you believe they can be excluded.
Answer: From the training dataset PassengerId, name, ticket and embarked attributes are dropped because there is no relation with 'Survived' attribute. Attribute 'Cabin' was dropped because it has many null values, out of 891 it has only 204 non-null values. 

In [349]:
#Family
#Engineer the siblings (SibSp) and parents(Parch) column to check if having family impacts the survival rate or not.
#Creating a new column named HasFamily having value of 1 if YES and 0 if NO
pd.options.mode.chained_assignment = None
train_df['HasFamily'] =  train_df["Parch"] + train_df["SibSp"]
test_df['HasFamily'] =  test_df["Parch"] + test_df["SibSp"]

train_df['HasFamily'].loc[train_df['HasFamily'] > 0] = 1
train_df['HasFamily'].loc[train_df['HasFamily'] == 0] = 0

test_df['HasFamily'].loc[test_df['HasFamily'] > 0] = 1
test_df['HasFamily'].loc[test_df['HasFamily'] == 0] = 0

#Drop the siblings (SibSp) and parents(Parch) column
train_df = train_df.drop(['SibSp','Parch'], axis=1)
test_df = test_df.drop(['SibSp','Parch'], axis=1)

#Calculating the average of survived those who has family on board or not
#Group by on attribute 'HasFamily' and taking the mean on 'Survived'  
Avg_Family_Survived = train_df.groupby('HasFamily', as_index=False)['Survived'].mean()
Avg_Family_Survived

Unnamed: 0,HasFamily,Survived
0,0,0.303538
1,1,0.50565


###### Result Analysis: Person with family is more likely to survive than alone because of the higher average survival rate.

In [350]:
#Gender
#changing the value to numeric, having value of 1 for female and 0 for male
train_df.loc[train_df['Sex']=='male','Sex'] = 1
train_df.loc[train_df['Sex']=='female','Sex'] = 0
train_df.loc[train_df['Age'] < 15,'Sex'] = 2

test_df.loc[test_df['Sex']=='male','Sex'] = 1
test_df.loc[test_df['Sex']=='female','Sex'] = 0
test_df.loc[test_df['Age'] < 15,'Sex'] = 2

#Calculating the average of survived males and females 
#Group by on attribute 'Sex' and taking the mean on 'Survived'
Avg_Sex_Survived = train_df.groupby('Sex', as_index=False)['Survived'].mean()
Avg_Sex_Survived

Unnamed: 0,Sex,Survived
0,0,0.76
1,1,0.163569
2,2,0.576923


###### Result Analysis: Females and children are more likely to survive than males because of the higher average survival rate.

In [351]:
#Class
#Calculating the average of survived based on the class they were traveling 
#Group by on attribute 'Pclass' and taking the mean on 'Survived'
Avg_Class_Survived = train_df.groupby('Pclass', as_index=False)['Survived'].mean()
Avg_Class_Survived

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


###### Result Analysis: Person travelling in class-1 is more likely to survive than person in class 2 and 3 because of the higher average survival rate.

In [352]:
#Age
#Engineer the age column and creating bins for analysing the train dataset.
#Creating a new column named 'Age_category'

train_df["Age"].fillna(train_df["Age"].median(), inplace=True) #All the null values in age is replaced by median

bins = [0, 15, 30, 45, 60, 85]

train_df['Age_categories'] = pd.cut(train_age['Age'], bins)

#Calculating the average of survived based on the age_categories 
#Group by on attribute 'Age_category' and taking the mean on 'Survived'
Avg_Age_Group_Survived = train_df.groupby('Age_categories', as_index=False)['Survived'].mean()
Avg_Age_Group_Survived

Unnamed: 0,Age_categories,Survived
0,"(0, 15]",0.590361
1,"(15, 30]",0.358896
2,"(30, 45]",0.425743
3,"(45, 60]",0.407407
4,"(60, 85]",0.227273


###### Result Analysis: Children and teenagers less or equal to 15 year of age are more likely to survive than other age groups.

In [353]:
#Fare
#Engineer the fare column and creating bins for analysing the train dataset.
#Creating a new column named 'Fare_category' having value of 1 if YES and 0 if NO
test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)

#convert from float to int
train_df['Fare'] = train_df['Fare'].astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)

bins = [0, 30, 60, 100, 130, 165]

train_df['Fare_categories'] = pd.cut(train_age['Fare'], bins)

#Calculating the average of survived based on the age_categories 
#Group by on attribute 'Fare_category' and taking the mean on 'Survived'
Avg_Fare_Group_Survived = train_df.groupby('Fare_categories', as_index=False)['Survived'].mean()
Avg_Fare_Group_Survived

Unnamed: 0,Fare_categories,Survived
0,"(0, 30]",0.331349
1,"(30, 60]",0.479167
2,"(60, 100]",0.677966
3,"(100, 130]",0.714286
4,"(130, 165]",0.75


###### Result Analysis: Person paid more for tickets are more likely to survive.

In [354]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,HasFamily,Age_categories,Fare_categories
0,0,3,1,22.0,7,1,"(15, 30]","(0, 30]"
1,1,1,0,38.0,71,1,"(30, 45]","(60, 100]"
2,1,3,0,26.0,7,0,"(15, 30]","(0, 30]"
3,1,1,0,35.0,53,1,"(30, 45]","(30, 60]"
4,0,3,1,35.0,8,0,"(30, 45]","(0, 30]"


In [355]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,HasFamily
0,892,3,1,34.5,7,0
1,893,3,0,47.0,7,1
2,894,2,1,62.0,9,0
3,895,3,1,27.0,8,0
4,896,3,0,22.0,12,1


In [356]:
train_df = train_df.dropna()  #Drop the rows with null values in train dataset
test_df = test_df.dropna()    #Drop the rows with null values in test dataset
train_df = train_df.drop(['Fare_categories'], axis=1)
train_df = train_df.drop(['Age_categories'], axis=1)


X_train = train_df.drop('Survived', axis=1)
Y_train = train_df['Survived']
X_test  = test_df.drop("PassengerId", axis=1).copy()


In [357]:
#KNN
#Using inbuild KNN classifier from sklearn
knn = KNeighborsClassifier(n_neighbors = 3) #Call the KNN classifier where number of neighbors are 3
knn.fit(X_train, Y_train)                   #Fit the model using X_train as training data and Y_train as target values
Y_pred = knn.predict(X_test)                #Predict the class labels for the provided test data
knn.score(X_train, Y_train)                 #Returns the mean accuracy on the given test data and labels.

0.82148040638606679

### Additional explaination on analysis

##### Question 1: What categories of passengers were most likely to survive the Titanic disaster?
Answer 1: According to the analysis done on different attributes. It can be observed that: 

(i) Females and children are more likely to survive than males.

(ii) Person with family is more likely to survive than alone because of the higher average survival rate. 

(iii) Person travelling in class-1 is more likely to survive than person in class 2 and 3 because of the higher average survival rate.

(iv) Children and teenagers less or equal to 15 year of age are more likely to survive than other age groups.

(v) Person paid more for tickets are more likely to survive.


##### Question 2: What other attributes did you use for the analysis. Explain how you used them.Provide a complete list of all attributes used.
Answer 2: Other attribute I used are 'Fare', 'Parch', SibSp and 'Pclass'. 

(i) Fare: Divide the values of the attribute in bins and created new attribute named 'Fare_categories'. Group by on attribute 'Fare_category' and taking the mean on 'Survived'. 
(ii) Pclass: Group by on attribute 'Pclass' and taking the mean on 'Survived'. 
(iii)Parch and SibSp: Removed these columns after creating a new column named HasFamily having value of 1 if YES and 0 if NO 


Complete List of atttribute used for analysis: 'Fare', 'Age', 'Parch','Sex', 'SibSp' and 'Pclass'




###### Question 3: Did you engineer any attributes? If yes, explain the rationale and how the new attributes were used in the analysis?

Answer 3: Yes, I engineered attribute 'Parch' and 'SibSp'. First adding the values of both attributes, then if it is greater than 0 that mean the person has family else not. Finally, creating a new attribute named 'Has_Family' with value as 1 if YES and 0 if NO and deleting the attribute 'Parch' and 'SibSp'.

###### Question 4: If you have excluded any attributes from the analysis, provide an explanation why you believe they can be excluded.

Answer 4: Explained above, below In [288]

###### Question 5: How did you treat missing values? Provide a detailed explanation in the comments.
Answer 5: All the null values in age is replaced by median. Also, in the test dataset, all the null values in fare is replaced by median.