# Missing value handling

In [1]:
import pandas as pd
import numpy as np

# Load the Titanic training data from the local CSV file and display it.
df = pd.read_csv(filepath_or_buffer='train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [2]:
# Number of missing entries in each column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [3]:
# Share of missing entries per column, expressed as a percentage of all rows
df.isnull().sum().div(len(df)).mul(100)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Gender          0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [4]:
# Disabled alternative: fill missing Age values with the column mean, directly on df.
#df["Age"].fillna(df["Age"].mean(),inplace = True)
#df

In [5]:
# Row positions (0-based) of the rows whose Age is missing.
# Later cells pass this list to `.iloc[...]`, which selects by position, so
# positions are collected here rather than index labels. With the default
# RangeIndex created by read_csv both give the same numbers.
age_missing_index = np.flatnonzero(df['Age'].isnull().to_numpy()).tolist()

In [6]:
# Fill missing Age values with a fixed constant (30) on a copy, leaving `df` untouched.
train_1 = df.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object, is
# deprecated in recent pandas, and does not update the frame under Copy-on-Write.
train_1["Age"] = train_1["Age"].fillna(30)
# Per-column count of remaining missing values
train_1.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Rows whose Age was originally missing, all columns, after the constant fill
train_1.iloc[age_missing_index, :]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,30.0,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,30.0,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,30.0,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,30.0,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,30.0,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,30.0,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,30.0,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,30.0,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,30.0,0,0,349217,7.8958,,S


In [8]:
# Constant-value imputation: replace every missing entry in the frame with -1.
from sklearn.impute import SimpleImputer
# Work on a copy so `df` keeps its original missing values.
train_constant = df.copy()

# strategy='constant' fills with `fill_value` instead of a computed statistic
cons_imputer = SimpleImputer(strategy='constant', fill_value= -1) # the same constant is applied to every column
# The imputer is fitted on the whole frame (text columns included) and the
# result is written back into the existing DataFrame through iloc[:, :].
train_constant.iloc[:,:] = cons_imputer.fit_transform(train_constant)

# Per-column count of remaining missing values
train_constant.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Gender         0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [9]:
# Rows whose Age was originally missing, all columns, after the -1 fill
train_constant.iloc[age_missing_index, :]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,-1.0,0,0,330877,8.4583,-1,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,-1.0,0,0,244373,13.0000,-1,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,-1.0,0,0,2649,7.2250,-1,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,-1.0,0,0,2631,7.2250,-1,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,-1.0,0,0,330959,7.8792,-1,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,-1.0,0,0,2629,7.2292,-1,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,-1.0,8,2,CA. 2343,69.5500,-1,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,-1.0,0,0,345777,9.5000,-1,S
878,879,0,3,"Laleff, Mr. Kristo",male,-1.0,0,0,349217,7.8958,-1,S


In [10]:
# Mean imputation of Age on a fresh copy of the raw frame.
train = df.copy()
imputer = SimpleImputer(strategy='mean')
# Learn the column mean first, then apply it to the same column.
imputer.fit(train[['Age']])
train['Age'] = imputer.transform(train[['Age']])

# Show the rows that originally had no Age to confirm they were filled.
train.iloc[age_missing_index, :]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,29.699118,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,29.699118,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,29.699118,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,29.699118,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,29.699118,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,29.699118,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,29.699118,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,29.699118,0,0,349217,7.8958,,S


In [11]:
# Median imputation of Age on a fresh copy of the raw frame.
train = df.copy()

# fit() returns the fitted imputer, so it can be chained onto the constructor.
imputer = SimpleImputer(strategy='median').fit(train[['Age']])
train['Age'] = imputer.transform(train[['Age']])
# Rows that originally had no Age, after the fill
train.iloc[age_missing_index, :]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,28.0,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,28.0,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,28.0,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,28.0,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,28.0,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,28.0,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,28.0,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,28.0,0,0,349217,7.8958,,S


In [12]:
# Mode imputation on a copy of the raw frame, applied to all columns at once.
train_most_frequent = df.copy()
# strategy='most_frequent' fills each column's gaps with that column's most common value
mode_imputer = SimpleImputer(strategy='most_frequent')
# Fit on the whole frame and write the result back through iloc[:, :].
train_most_frequent.iloc[:,:] = mode_imputer.fit_transform(train_most_frequent)

# Rows whose Age was originally missing, to inspect the filled values
train_most_frequent.iloc[age_missing_index]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,24.0,0,0,330877,8.4583,B96 B98,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,24.0,0,0,244373,13.0000,B96 B98,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,24.0,0,0,2649,7.2250,B96 B98,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,24.0,0,0,2631,7.2250,B96 B98,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,24.0,0,0,330959,7.8792,B96 B98,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,24.0,0,0,2629,7.2292,B96 B98,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,24.0,8,2,CA. 2343,69.5500,B96 B98,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,24.0,0,0,345777,9.5000,B96 B98,S
878,879,0,3,"Laleff, Mr. Kristo",male,24.0,0,0,349217,7.8958,B96 B98,S


In [13]:
train = df.copy()


def _fill_with_group_mean(ages):
    """Replace missing values in `ages` with the mean of that same group."""
    return ages.fillna(ages.mean())


# Impute Age separately within each Gender group, using the group's own mean.
train["Age"] = train.groupby("Gender")["Age"].transform(_fill_with_group_mean)
train.iloc[age_missing_index, :]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,30.726645,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,30.726645,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,27.915709,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,30.726645,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,27.915709,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,30.726645,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,27.915709,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,30.726645,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,30.726645,0,0,349217,7.8958,,S


# Feature Selection

In [14]:
import pandas as pd
import numpy as np

# Load the heart dataset (heart.csv) used for the feature-selection section.
dff = pd.read_csv(filepath_or_buffer='heart.csv')
dff

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [15]:
# Number of missing entries in each column of the heart data
dff.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [16]:
# Feature matrix: every column except the label column 'target'
x = dff.drop(columns='target')
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [17]:
# Label vector: the 'target' column on its own
y = dff.loc[:, 'target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [19]:
# Univariate feature scoring with the ANOVA F-test (f_classif).
fit_features = SelectKBest(f_classif)
# Fitting computes one score per feature, available afterwards as `scores_`.
fit_features.fit(X=x, y=y)

In [20]:
# One-column table of the F-test scores, labelled by feature name
fs = pd.Series(
    fit_features.scores_,
    index=x.columns,
    name='score values',
).to_frame()
fs

Unnamed: 0,score values
age,16.1167
sex,25.792191
cp,69.772271
trestbps,6.458169
chol,2.202983
fbs,0.236942
restecg,5.777209
thalach,65.120104
exang,70.952438
oldpeak,68.551439


In [21]:
# The nine features with the highest F-test score, best first
fs.nlargest(n=9, columns='score values')

Unnamed: 0,score values
exang,70.952438
cp,69.772271
oldpeak,68.551439
thalach,65.120104
ca,54.559834
slope,40.902071
thal,40.407696
sex,25.792191
age,16.1167


In [22]:
from sklearn.ensemble import ExtraTreesClassifier

In [23]:
# ExtraTreesClassifier builds randomized trees, so feature_importances_ vary
# from run to run unless the seed is pinned. Fix random_state so the ranking
# below is reproducible on Restart & Run All.
# NOTE(review): previously saved outputs came from an unseeded run, so their
# exact values may differ from what this cell now produces.
model = ExtraTreesClassifier(random_state=42)
model.fit(x,y)

In [24]:
# One-column table of the tree-based feature importances, labelled by feature name
fs = pd.DataFrame(
    data=model.feature_importances_,
    index=x.columns,
    columns=['score values'],
)
fs

Unnamed: 0,score values
age,0.065765
sex,0.058813
cp,0.128842
trestbps,0.063463
chol,0.061791
fbs,0.021343
restecg,0.03474
thalach,0.096382
exang,0.097471
oldpeak,0.089821


In [25]:
# The nine features with the highest importance, best first
fs.nlargest(n=9, columns='score values')

Unnamed: 0,score values
cp,0.128842
ca,0.120695
exang,0.097471
thalach,0.096382
thal,0.094972
oldpeak,0.089821
slope,0.065901
age,0.065765
trestbps,0.063463
