# K- fold cross validation technique

##  .Titanic Dataset

# 1 import the library

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 2.Load the data

In [4]:
titanic_data=pd.read_csv(r'titanic-1.csv')

In [5]:
titanic_data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
775,776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18.0,0,0,347078,7.75,,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
582,583,0,2,"Downton, Mr. William James",male,54.0,0,0,28403,26.0,,S
693,694,0,3,"Saad, Mr. Khalil",male,25.0,0,0,2672,7.225,,C
662,663,0,1,"Colley, Mr. Edward Pomeroy",male,47.0,0,0,5727,25.5875,E58,S


# 3.Data Preprocessing

## Finding missing values

In [6]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
titanic_data.shape

(891, 12)

## Handle the missing values

### Rule no. 1:if no. of rows of missing  value is greater than 50%, then drop the column

In [8]:
print('Cabin row missing value percentage is ',(687/891)*100)

Cabin row missing value percentage is  77.10437710437711


In [9]:
for col_name in titanic_data.columns:
    miss_value = titanic_data[col_name].isnull().sum()/titanic_data.shape[0]
    if miss_value>0:
        print(f'{col_name} and missing value is {miss_value*100:.2f}%')

Age and missing value is 19.87%
Cabin and missing value is 77.10%
Embarked and missing value is 0.22%


In [10]:
# Drop the cabin column - missing value
# Drop : PassengerID, Name and Ticket - Unique columns

In [11]:
titanic_data=titanic_data.drop(columns=['Cabin','PassengerId','Name','Ticket'],axis=1)

In [12]:
titanic_data.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S


### Rule 2:Handle Missing value(impute)

In [13]:
from sklearn.impute import SimpleImputer

In [14]:
# Numerical value

In [15]:
age_imputer=SimpleImputer(strategy='mean')

In [16]:
titanic_data['Age']= age_imputer.fit_transform(titanic_data[["Age"]])

In [17]:
titanic_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

In [18]:
titanic_data.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
label_encoder = LabelEncoder()

In [21]:
titanic_data['Sex'] = label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked']= label_encoder.fit_transform(titanic_data['Embarked'])

In [22]:
titanic_data.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2


In [23]:
# Fill the embarked column with most frequent(mode)

In [24]:
embarked_imputer = SimpleImputer(strategy='most_frequent')

In [25]:
titanic_data['Embarked'] = embarked_imputer.fit_transform(titanic_data[['Embarked']])

In [26]:
titanic_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

# Find the Independent Varible and dependent avariable

In [27]:
X = titanic_data.drop(columns=['Survived'],axis=1 )
y = titanic_data['Survived']

# Cross  validation technique

In [28]:
from sklearn.tree import DecisionTreeClassifier

In [29]:
model_dt = DecisionTreeClassifier()

In [30]:
from sklearn.model_selection import cross_val_score

In [31]:
# fold
k=5

In [32]:
# perform K-fold cross validation
cv_scores = cross_val_score(model_dt,X,y,cv=k,verbose=2)

[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s


# Interpretation result

In [33]:
for i in range(k):
    print(f'{i+1} Fold,cross validation accuracy is {cv_scores[i]}' )

1 Fold,cross validation accuracy is 0.7653631284916201
2 Fold,cross validation accuracy is 0.7808988764044944
3 Fold,cross validation accuracy is 0.8033707865168539
4 Fold,cross validation accuracy is 0.7415730337078652
5 Fold,cross validation accuracy is 0.8089887640449438


In [34]:
print(f'Mean Accuracy is {cv_scores.mean():.2f}')

Mean Accuracy is 0.78


In [35]:
print(f'Standard Deviation: {cv_scores.std():.2f}')

Standard Deviation: 0.02
