## Import of Libraries

In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

## Reading the Data

Pandas function Usage example  
#### df = pd.read_csv(csv_file_path, sep=';', header=0, index_col='ID', usecols=['ID', 'Name', 'Age'])  
sep: Specifies the delimiter used in the CSV file. The default is a comma (,).  
header: Specifies which row to use as the column names. If set to None, the file is treated as having no header row and the columns are numbered 0, 1, 2, ...  
index_col: Specifies which column to use as the row labels (index).  
usecols: Specifies which columns to read from the file.  


In [3]:
# Load the training split, using PassengerId as the row index.
data1 = pd.read_csv("train.csv", index_col="PassengerId")

# List the column names available in the training frame.
print("Data Statistics:", data1.columns.tolist())

# Load the test split with the same index column.
data_test = pd.read_csv("test.csv", index_col="PassengerId")

Data Statistics: ['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


## Removing the irrelevant features 

In [4]:
# Columns removed from both splits. "Name", "Ticket", "Fare", "Cabin" and
# "Embarked" were judged irrelevant; "SibSp", "Parch" and "Pclass" are dropped
# as well.
columns_to_drop = [
    "Name", "Ticket", "Fare", "Cabin", "Embarked",
    "SibSp", "Parch", "Pclass",
]

# Training split: reduce the frame, then separate the target from the features.
# NOTE: data1 is overwritten here, so re-running this cell on its own raises a
# KeyError (the columns are already gone); re-run the loading cell first.
data1 = data1.drop(columns=columns_to_drop)
data_y = data1["Survived"]
data_x = data1.drop(columns=["Survived"])

# Test split: same column selection; the labels used for scoring are read from
# gender_submission.csv, indexed by PassengerId.
data_test_x = data_test.drop(columns=columns_to_drop)
data_test_y = pd.read_csv("gender_submission.csv", index_col="PassengerId")


## Filling the missing values
Here we fill the missing values with either the mode or the mean of the column.  
If the values are categorical then we use the mode.  
For the numerical values we use the mean.  

In [5]:
# Impute missing values, one group of columns at a time.

# Categorical columns: fill with the most frequent value (mode).
# DataFrame.mode() returns a frame indexed 0..k-1, and fillna() aligns a
# DataFrame argument on the row index. Because the rows here are indexed by
# PassengerId, passing the mode frame directly would leave missing values
# unfilled. Taking its first row yields a Series keyed by column name, which
# fillna() applies as one fill value per column.
columns_to_handle = ["Sex"]
data_x[columns_to_handle] = data_x[columns_to_handle].fillna(
    data_x[columns_to_handle].mode().iloc[0]
)
data_test_x[columns_to_handle] = data_test_x[columns_to_handle].fillna(
    data_test_x[columns_to_handle].mode().iloc[0]
)

# Numerical columns: fill with the column mean (DataFrame.mean() already
# returns a Series keyed by column name).
# NOTE(review): each split is imputed with its own statistics; consider using
# the training-set mean for the test split so both are preprocessed alike.
columns_to_handle = ["Age"]
data_x[columns_to_handle] = data_x[columns_to_handle].fillna(
    data_x[columns_to_handle].mean()
)
data_test_x[columns_to_handle] = data_test_x[columns_to_handle].fillna(
    data_test_x[columns_to_handle].mean()
)

# Verify that neither frame contains NaN values any more.
count = data_x.isna().sum()
count_test = data_test_x.isna().sum()
assert count.sum() == 0, "training features still contain missing values"
assert count_test.sum() == 0, "test features still contain missing values"
print(count_test)



Sex    0
Age    0
dtype: int64


## Problem Statement 
We need to predict the survival of the passengers, hence this is a classification problem.
We will use the following supervised learning classification methods:  
1. Decision Trees
2. Logistic Regression
3. Random Forest

### 1. Decision Trees
For decision trees we do not need to perform any scaling on the data.  
However, we do need to encode the categorical variables.

In [6]:
# Encode the categorical "Sex" column as integers. The encoder is fitted on the
# training split only and then reused (transform, not fit_transform) for the
# test split, so both splits share the same category -> integer mapping and an
# unseen test category raises instead of being silently re-numbered.
label_encoder = LabelEncoder()
data_x['Sex'] = label_encoder.fit_transform(data_x['Sex'])
data_test_x['Sex'] = label_encoder.transform(data_test_x['Sex'])

# Fit the tree with a fixed seed so the result is reproducible.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(data_x, data_y)
predictions = clf.predict(data_test_x)

# Mean accuracy and confusion matrix against data_test_y.
# NOTE(review): data_test_y is read from gender_submission.csv, which appears
# to be a sample submission rather than ground-truth labels - confirm. If so,
# these numbers measure agreement with that file, not real accuracy.
s = clf.score(data_test_x, data_test_y)
print(" Accuracy score:",s)
cm = metrics.confusion_matrix(data_test_y, predictions)
print(cm)

 Accuracy score: 0.8588516746411483
[[256  10]
 [ 49 103]]


### 2. Logistic Regression

In [7]:
# Fit a logistic-regression model on the training features (fit() returns the
# estimator itself) and predict the test split.
logisticRegr = LogisticRegression().fit(data_x, data_y)
predictions = logisticRegr.predict(data_test_x)

# Build a PassengerId / Survived table from the predictions and write it to
# CSV without the positional row index.
pred_data = pd.DataFrame(
    {
        'PassengerId': data_test_x.index,
        'Survived': predictions,
    }
)
pred_data.to_csv("submission1.csv", index=False)
print(type(pred_data))
print(pred_data)

# Mean accuracy of the model's test predictions against data_test_y.
score = logisticRegr.score(data_test_x, data_test_y)
print(score)

<class 'pandas.core.frame.DataFrame'>
     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
1.0


In [8]:
# Confusion matrix for the logistic-regression model. The predictions are
# recomputed from logisticRegr rather than read from the shared name
# `predictions`, which every model cell reassigns; reading it would show a
# different model's results if the cells were run out of order.
cm = metrics.confusion_matrix(data_test_y, logisticRegr.predict(data_test_x))
print(cm)

[[266   0]
 [  0 152]]


### 3. Random Forest Classifier

In [9]:
# Fit a random forest with a fixed seed. Without random_state the bootstrap
# samples and feature subsets differ on every run, so the scores below would
# not be reproducible; the seed matches the one used for the decision tree.
rf = RandomForestClassifier(random_state=0)
rf.fit(data_x, data_y)
predictions = rf.predict(data_test_x)

# Mean accuracy and confusion matrix of the test predictions against
# data_test_y.
s = rf.score(data_test_x, data_test_y)
print(" Accuracy score:",s)
cm = metrics.confusion_matrix(data_test_y, predictions)
print(cm)

 Accuracy score: 0.8349282296650717
[[243  23]
 [ 46 106]]
