In [1]:
#Load Libraries
import numpy as np
import pandas as pd

In [2]:
data  = pd.read_csv("../Data/5_6323146441963343226.csv")

In [3]:
data.head(3)

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,HeartDisease
0,16.6,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes,No
1,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No,No
2,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No,No


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   BMI               319795 non-null  float64
 1   Smoking           319795 non-null  object 
 2   AlcoholDrinking   319795 non-null  object 
 3   Stroke            319795 non-null  object 
 4   PhysicalHealth    319795 non-null  int64  
 5   MentalHealth      319795 non-null  int64  
 6   DiffWalking       319795 non-null  object 
 7   Sex               319795 non-null  object 
 8   AgeCategory       319795 non-null  object 
 9   Race              319795 non-null  object 
 10  Diabetic          319795 non-null  object 
 11  PhysicalActivity  319795 non-null  object 
 12  GenHealth         319795 non-null  object 
 13  SleepTime         319795 non-null  int64  
 14  Asthma            319795 non-null  object 
 15  KidneyDisease     319795 non-null  object 
 16  SkinCancer        31

In [5]:
#check null values
data.isnull().sum()

BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
HeartDisease        0
dtype: int64

In [6]:
positive = data[data['HeartDisease'] == 'Yes']
print("Positive samples are", len(positive))

negative = data[data['HeartDisease'] == 'No']
print("Positive samples are", len(negative))

Positive samples are 27373
Positive samples are 292422


In [7]:
#Duplicate check
duplicate = data[data.duplicated()]
print("Duplicate ", len(duplicate))

Duplicate  18078


In [8]:
#Remove duplicates
data = data.drop_duplicates()
print("After Duplicate Removal", len(data))

After Duplicate Removal 301717


In [9]:
print("Race Values")
print(data.Race.unique())

print("\nGenHealth")
print(data.GenHealth.unique())

print("\nAgeCategory")
print(data.AgeCategory.unique())


print("\nDiabetic")
print(data.Diabetic.unique())

Race Values
['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic']

GenHealth
['Very good' 'Fair' 'Good' 'Poor' 'Excellent']

AgeCategory
['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29']

Diabetic
['Yes' 'No' 'No, borderline diabetes' 'Yes (during pregnancy)']


In [10]:
#Apply label Encoding for the ordinal feaures
from sklearn import preprocessing 

label_encoder = preprocessing.LabelEncoder() 
data['Race']        = label_encoder.fit_transform(data['Race']) 
data['GenHealth']   = label_encoder.fit_transform(data['GenHealth']) 
data['AgeCategory'] = label_encoder.fit_transform(data['AgeCategory']) 
data['Diabetic'] = label_encoder.fit_transform(data['Diabetic']) 

In [11]:
#Convert binary data to 0's and 1's
data['Smoking'] = data['Smoking'].map( {'No':0, 'Yes':1})
data['AlcoholDrinking'] = data['AlcoholDrinking'].map( {'No':0, 'Yes':1})
data['Stroke'] = data['Stroke'].map( {'No':0, 'Yes':1})
data['DiffWalking'] = data['DiffWalking'].map( {'No':0, 'Yes':1})
data['Sex'] = data['Sex'].map( {'Female':0, 'Male':1})
data['Asthma'] = data['Asthma'].map( {'No':0, 'Yes':1})
data['KidneyDisease'] = data['KidneyDisease'].map( {'No':0, 'Yes':1})
data['SkinCancer'] = data['SkinCancer'].map( {'No':0, 'Yes':1})
data['HeartDisease'] = data['HeartDisease'].map( {'No':0, 'Yes':1})
data['PhysicalActivity'] = data['PhysicalActivity'].map( {'No':0, 'Yes':1})

In [12]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [13]:
X = data.iloc[:,0:17]  #independent columns
y = data.iloc[:,-1]    #target column i.e price range

In [14]:
X.head(3)

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,16.6,1,0,0,3,30,0,0,7,5,2,1,4,5,1,0,1
1,20.34,0,0,1,0,0,0,0,12,5,0,1,4,7,0,0,0
2,26.58,1,0,0,20,30,0,1,9,5,2,1,1,8,1,0,0


In [15]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features

             Specs          Score
4   PhysicalHealth  152818.088530
8      AgeCategory   34092.037888
10        Diabetic   13681.199261
3           Stroke   10976.246127
6      DiffWalking    9929.230236
15   KidneyDisease    5901.802646
16      SkinCancer    2238.359214
5     MentalHealth    2115.303522
1          Smoking    1904.392749
0              BMI     991.250447


In [16]:
feat = ['PhysicalHealth', 'AgeCategory', 'Diabetic', 'Stroke', 'DiffWalking', 'KidneyDisease', 'MentalHealth', 'SkinCancer','Smoking', 'BMI', 'HeartDisease']
features = data[feat]

In [17]:
features.to_csv('../Data/heart_disease_training_data.csv')