# EDA with Python (Youtube)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [25]:
# Let's read the data from the CSV file into a DataFrame
path = 'DATA/titanic.csv'
train = pd.read_csv(path)

In [None]:
# To be sure the load worked, let's check the first records
train.head()

## Let's begin the EDA

### 1) Missing data

In [None]:
# Boolean frame with the same shape as train: True wherever a value is missing
train.isnull()

In [None]:
# Well, it is not really obvious from the table how many missing values we have...
# Let's try a more visual approach: a heatmap of the missing-value mask
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
# From this it seems that the missing Age values could be substituted somehow, but the Cabin data is hopeless...

In [None]:
# Now let's see how many passengers fall into each Survived value
sns.set_style('darkgrid')
sns.countplot(x='Survived',data=train)

In [None]:
# From this it seems that the overall survival rate was about 35%-38%.
# Can we see this split by gender?
sns.countplot(x='Survived', hue='Sex',data=train,palette='RdBu_r')

In [None]:
# Interesting, this shows that sex has a strong influence on the survival rate!
# Let's see whether the passenger class has the same influence or not!
sns.countplot(x='Survived', hue='Pclass',data=train,palette='rainbow')

In [None]:
# Hm... based on this it seems that a first class passenger had roughly a 66-68% chance of survival,
# a 2nd class passenger only about 48%-50%, and a 3rd class passenger
# only about 20%...


In [None]:
# Let's see a displot of Age (missing values dropped first) to see how the passengers' ages are distributed:
sns.displot(train['Age'].dropna(),kde=True,color='#151599',bins=40)

In [None]:
# Let's see a countplot of SibSp (siblings and spouses)
sns.countplot(x='SibSp',data=train)

In [None]:
# Histogram of the Fare column in 40 bins, to see how the fares are distributed
train['Fare'].hist(color='#990099',bins=40,figsize=(8,4))

## Data cleaning

With a boxplot it is easy to see the medians (the line inside each box) and the outliers as well!
Let's see one!

In [None]:
# Box plot of Age for each passenger class, drawn on a 12x7 inch figure.
# NOTE(review): recent seaborn releases warn when `palette` is passed without `hue` — confirm against the installed version.
plt.figure(figsize=(12,7))
sns.boxplot(x='Pclass',y='Age',data=train,palette='winter')

In [None]:
# From this we can roughly read off the typical age (the line inside each box is the median) for each passenger class :)
# 1) - 37
# 2) - 31
# 3) - 27

In [26]:
# Let's create a function that fills in a missing Age value with the typical age of the passenger's class
def input_age(col):
    """Return the passenger's age, imputing a class-based default when it is missing.

    Parameters
    ----------
    col : pandas Series, list, tuple or array with at least two elements
        The first element is the age (may be NaN/None), the second is the
        passenger class.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 31 for
    class 2 and 27 for any other class value.
    """
    # Read the two fields by position. Plain `col[0]` on a label-indexed row
    # Series (index 'Age', 'Pclass') relies on pandas' deprecated positional
    # fallback, so use `.iloc` whenever the object provides it.
    if hasattr(col, 'iloc'):
        age, pClass = col.iloc[0], col.iloc[1]
    else:
        age, pClass = col[0], col[1]

    # A known age is returned untouched.
    if not pd.isnull(age):
        return age

    # Missing age: fall back to the per-class value read off the box plot.
    if pClass == 1:
        return 37
    if pClass == 2:
        return 31
    return 27


In [27]:
# Now apply this function row by row (axis=1) to the Age/Pclass pair and write the result back into Age
train['Age'] = train[['Age','Pclass']].apply(input_age,axis=1)

In [None]:
# Re-draw the missing-value heatmap to check the Age column after the imputation
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [28]:
# In the case of Cabin the best option is to drop the column.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
train = train.drop(columns='Cabin', errors='ignore')

In [None]:
# Missing-value heatmap once more, now without the Cabin column
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

## CONVERTING CATEGORICAL FEATURES

In [29]:
# Let's see whether we could substitute the Embarked categories with 0/1 indicator columns
# (pd.get_dummies can do this); first look at the raw values
train['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [33]:
# One-hot encode the port of embarkation. drop_first=True removes the first
# category (here 'C'), leaving the Q and S indicator columns.
# dtype=int keeps the indicators as 0/1 integers; newer pandas versions
# default to bool, which would display True/False instead.
embark = pd.get_dummies(train['Embarked'], drop_first=True, dtype=int)

# Sanity check: exactly one indicator row per passenger.
# NOTE(review): a missing Embarked value encodes as all zeros, the same as the
# dropped first category — confirm that is acceptable.
assert embark.shape[0] == train.shape[0]

# The encoding itself must not contain missing values.
embark.isna().sum()

Q    0
S    0
dtype: int64

In [34]:
# Let's do the same for sex: with drop_first=True a single indicator column remains.
# dtype=int keeps it as 0/1 integers regardless of the pandas version's default dummy dtype.
sex = pd.get_dummies(train['Sex'], drop_first=True, dtype=int)

In [35]:
# Now let's drop all the columns which are not required: Sex, Embarked, Name, Ticket.
# Rebinding with errors='ignore' (instead of inplace=True) means re-running this
# cell is a no-op rather than a KeyError on the already-removed columns.
train = train.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'], errors='ignore')

In [36]:
# Check the frame after dropping the columns above
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.925
3,4,1,1,35.0,1,0,53.1
4,5,0,3,35.0,0,0,8.05


In [37]:
# Now we can add back the "categorified" elements as extra columns.
# Any indicator columns left over from a previous run of this cell are removed
# first, so re-running does not append duplicate columns.
dummy_cols = [*sex.columns, *embark.columns]
train = pd.concat([train.drop(columns=dummy_cols, errors='ignore'), sex, embark], axis=1)

In [38]:
# Look at the last rows to confirm the indicator columns were appended
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
886,887,0,2,27.0,0,0,13.0,1,0,1
887,888,1,1,19.0,0,0,30.0,0,0,1
888,889,0,3,27.0,1,2,23.45,0,0,1
889,890,1,1,26.0,0,0,30.0,1,0,0
890,891,0,3,32.0,0,0,7.75,1,1,0


## Building a Logistic Regression model

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): the feature matrix still contains PassengerId, which looks like
# a row identifier rather than a predictor — confirm it should be used.
features = train.drop('Survived', axis=1)
target = train['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=101
)


In [41]:
# Inspect the training features produced by the split
X_train

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,male,Q,S
520,521,1,30.0,0,0,93.5000,0,0,1
510,511,3,29.0,0,0,7.7500,1,1,0
446,447,2,13.0,0,1,19.5000,0,0,1
2,3,3,26.0,0,0,7.9250,0,0,1
691,692,3,4.0,0,1,13.4167,0,0,0
...,...,...,...,...,...,...,...,...,...
575,576,3,19.0,0,0,14.5000,1,0,1
838,839,3,32.0,0,0,56.4958,1,0,1
337,338,1,41.0,0,0,134.5000,0,0,0
523,524,1,44.0,0,1,57.9792,0,0,0


In [48]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier on the training split using the liblinear solver
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train,y_train)

LogisticRegression(solver='liblinear')

In [49]:
# Predict the Survived label for the held-out test rows
predictions = logmodel.predict(X_test)

In [50]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


In [51]:
# Accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_test,predictions)

In [52]:
# Display the accuracy computed above
accuracy

0.7723880597014925

In [54]:
# Confusion matrix of true vs. predicted labels
# (scikit-learn puts the true labels on the rows and the predictions on the columns)
cnfMrx = confusion_matrix(y_test,predictions)
cnfMrx

array([[134,  20],
       [ 41,  73]])