In [None]:
import pandas as pd
import numpy as np

In [None]:
## Read the raw Framingham CSV into a DataFrame
framingham_csv_path = '../input/heart-disease-prediction-using-logistic-regression/framingham.csv'
raw_csv_data = pd.read_csv(framingham_csv_path)

In [None]:
raw_csv_data

In [None]:
df = raw_csv_data.copy()

In [None]:
df.head(5)

In [None]:
 ## Display all columns and all rows; None means no maximum is applied
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
## display(raw_csv_data)

In [None]:
## display(df)

In [None]:
df.info()

In [None]:
## Number of missing values per column (a Series indexed by column name)
missing_values_count = df.isna().sum()

In [None]:
missing_values_count

## Dealing with missing values

In [None]:
## What percentage of the data is missing?

In [None]:
## Total number of cells in the frame (rows * columns). DataFrame.size is used instead of
## np.product, which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size
## Sum of the per-column missing counts computed earlier
total_missing = missing_values_count.sum()

# percent of data that is missing
percent_missing = (total_missing/total_cells) * 100
percent_missing

In [None]:
## Decide why the data is missing: was it not recorded, or does it not exist?

In [None]:
## Share of rows (in percent) whose education value is missing
n_rows = df.shape[0]
n_missing_edu = df['education'].isna().sum()
percent_missing_edu = (n_missing_edu / n_rows) * 100
percent_missing_edu

In [None]:
## Based on the calculation above, only about 2.5% of the education column is missing, and
## guessing a person's education level from medical records is almost impossible.
## So the rows with missing education will be dropped; collect their index labels first.
edu_missing = df.loc[df['education'].isna()].index
edu_missing

In [None]:
## Drop the rows whose education is missing. dropna(subset=...) removes the same rows as
## dropping the edu_missing labels, but the cell can be re-run safely: df.drop(edu_missing)
## raises KeyError once those labels have already been removed.
df = df.dropna(subset=['education'])
## Remaining missing values per column
df.isnull().sum()

In [None]:
## A missing cigsPerDay value could belong to either a smoker or a non-smoker, so I will check
## whether the rows with missing values are current smokers: if they are not, the missing values will be filled with 0, otherwise with the median

In [None]:
## Index labels of the rows where cigsPerDay is missing
cigarette_index = df.loc[df['cigsPerDay'].isna()].index
cigarette_index

In [None]:
## Collect the currentSmoker flag of every row whose cigsPerDay is missing,
## to see whether all of those rows belong to current smokers
current_smoke_status = [df['currentSmoker'][row_label] for row_label in cigarette_index]

In [None]:
current_smoke_status  


In [None]:
## Index labels of the rows flagged as current smokers (currentSmoker == 1)
smokers = df.loc[df['currentSmoker'].eq(1)].index
smokers

I will build a list of cigsPerDay values using the smokers' indices, so that the median is computed from smokers only. Almost half of the participants are non-smokers, which pulls the mean down, and the median turns out to be 0 when non-smokers are not excluded.

In [None]:
## cigsPerDay values of current smokers only, with missing entries removed.
## A missing value is a float NaN, so comparing it with the string 'nan' never filters
## anything out; dropna() is what actually keeps NaN from reaching the median calculation.
cigarettes_by_smokers = df.loc[smokers, 'cigsPerDay'].dropna().tolist()

In [None]:
len(cigarettes_by_smokers)

In [None]:
import statistics

In [None]:
## Median of the cigsPerDay values collected in cigarettes_by_smokers
smoker_median = statistics.median(cigarettes_by_smokers)
smoker_median

In [None]:
## Per the current_smoke_status check above, the rows with missing cigsPerDay are current smokers,
## so every missing value is replaced with smoker_median (the smokers' median, not the mean)
df['cigsPerDay'] = df['cigsPerDay'].fillna(smoker_median)

When I only use smoker_mean = round(df['cigsPerDay'].mean()) result was 9, because it was including non-smokers as well. I think this one makes more sense

In [None]:
df.isnull().sum()

In [None]:
## BPMeds missing values: from what I found on Google, doctors recommend taking blood pressure
## medication when blood pressure is higher than 140/90. So for the rows with a missing BPMeds
## I will check whether sysBP is higher than 140 and/or diaBP is higher than 90, and replace
## the NaN with 1 if so, otherwise with 0.

BP_missing_index = df.loc[df['BPMeds'].isna()].index
BP_missing_index

In [None]:
## For the rows listed in BP_missing_index, set BPMeds to 1.0 when sysBP > 140 or diaBP > 90,
## and to 0.0 otherwise (one vectorised assignment instead of a row-by-row loop)
high_bp = (df.loc[BP_missing_index, 'sysBP'] > 140) | (df.loc[BP_missing_index, 'diaBP'] > 90)
df.loc[BP_missing_index, 'BPMeds'] = high_bp.astype(float)

In [None]:
df.isnull().sum()

### Checkpoint 1

In [None]:
df_1 = df.copy()

In [None]:
df_1.head()

In [None]:
## I will fill the rest of the NaN values with the mean of each column

In [None]:
## Fill missing totChol with the column mean, rounded to a whole number
tot_chol_mean = round(df_1['totChol'].mean())
df_1['totChol'] = df_1['totChol'].fillna(tot_chol_mean)

In [None]:
## Fill missing BMI with the (unrounded) column mean
bmi_mean = df_1['BMI'].mean()
df_1['BMI'] = df_1['BMI'].fillna(bmi_mean)

In [None]:
## Fill missing glucose with the column mean, rounded to a whole number
glucose_mean = round(df_1['glucose'].mean())
df_1['glucose'] = df_1['glucose'].fillna(glucose_mean)

In [None]:
## There is only one missing value in heartRate, so it is back-filled: the NaN is replaced
## with the next valid value that comes after it in the same column.
## Series.bfill() is used because fillna(method='bfill') is deprecated in pandas 2.1+.

df_1['heartRate'] = df_1['heartRate'].bfill()

In [None]:
df_1.isnull().sum()

In [None]:
df_1.head(10)

### Checkpoint 2

In [None]:
## There is no NA value in the dataset, so I can save it!
df_2 = df_1.copy()

In [None]:
df_2["education"].unique()

In [None]:
df_2["education"].value_counts()

1 - Less than High School Education
2 - High School
3 - Undergraduate Degree
4 - Graduate Degree

In [None]:
## Re-group education into two levels:
##   0: Less than High School and High School (codes 1 and 2)
##   1: College Degree and higher (codes 3 and 4)
## The original codes are read from df_1 (the checkpoint df_2 was copied from), so re-running
## this cell does not re-map the already recoded values (which would turn 1 into 0 and 0 into NaN).

df_2["education"] = df_1["education"].map({1.0:0, 2.0:0, 3.0:1, 4.0:1})

In [None]:
df_2["education"].unique()

In [None]:
df_2["education"].value_counts()

In [None]:
df_2.isnull().sum()

### Checkpoint 3

In [None]:
df_preprocessed = df_2.copy()

In [None]:
df_preprocessed.head(10)

In [None]:
df_preprocessed.isnull().sum()

In [None]:
## Write the preprocessed frame to CSV without the index column
preprocessed_csv_path = 'CHD_preprocessed.csv'
df_preprocessed.to_csv(preprocessed_csv_path, index=False)