In [1]:
import pandas as pd
import warnings
# Silence only FutureWarning noise from library upgrades. A blanket
# filterwarnings('ignore') would also hide pandas' SettingWithCopyWarning and
# scikit-learn's DataConversionWarning / ConvergenceWarning, which flag real
# problems in the data-preparation and model-fitting cells of this notebook.
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
# Step 1: Load the Framingham heart-study CSV and preview the first rows.
# NOTE(review): this is an absolute path on a local drive; the notebook will
# only run on a machine that has the file at this exact location.
DATA_PATH = r"E:/Dataset/framingham.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Step 2: Data Pre-Processing:
# Keep only the model inputs and the output variable.
# .copy() makes df_new an independent frame rather than a slice of df, so the
# fillna assignments made to df_new later in the notebook write to df_new
# itself and do not raise pandas' SettingWithCopyWarning.
df_new = df[['age','cigsPerDay','BPMeds','totChol', 'sysBP','diaBP','TenYearCHD']].copy()
df_new.head()

Unnamed: 0,age,cigsPerDay,BPMeds,totChol,sysBP,diaBP,TenYearCHD
0,39,0.0,0.0,195.0,106.0,70.0,0
1,46,0.0,0.0,250.0,121.0,81.0,0
2,48,20.0,0.0,245.0,127.5,80.0,0
3,61,30.0,0.0,225.0,150.0,95.0,1
4,46,23.0,0.0,285.0,130.0,84.0,0


In [4]:
# Count the missing values in each column of the subset
# (isna() flags missing cells; summing the flags gives a per-column count)
missing_flags = df_new.isna()
missing_flags.sum()

age            0
cigsPerDay    29
BPMeds        53
totChol       50
sysBP          0
diaBP          0
TenYearCHD     0
dtype: int64

In [5]:
# Numerical columns with missing values (cigsPerDay, totChol):
# print the skewness of each, in that order, to help choose between
# mean and median imputation.
for numeric_column in ('cigsPerDay', 'totChol'):
    print(df_new[numeric_column].skew())

1.2470523561848126
0.8718805634765354


In [6]:
# Categorical column with missing values (BPMeds):
# the frequency table shows which category is the mode.
bp_meds = df_new['BPMeds']
bp_meds.value_counts()

0.0    4063
1.0     124
Name: BPMeds, dtype: int64

In [7]:
# Replace the missing values:
#   cigsPerDay -> median of the column
#   totChol    -> mean of the column
#   BPMeds     -> 0, the most frequent category in the value_counts() above
# assign() returns a new frame with the same column order instead of writing
# column-by-column into df_new, which was created by slicing df; writing into
# such a slice is what triggers pandas' SettingWithCopyWarning.
# NOTE(review): the median/mean are computed on the full data set, before the
# train/test split, so test rows influence the imputed values — consider
# fitting the imputation on the training split only.
df_new = df_new.assign(
    cigsPerDay=df_new['cigsPerDay'].fillna(df_new['cigsPerDay'].median()),
    totChol=df_new['totChol'].fillna(df_new['totChol'].mean()),
    BPMeds=df_new['BPMeds'].fillna(0),
)

In [8]:
# Verify the imputation: every column should now report the same non-null
# count as the number of rows in the frame.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         4240 non-null   int64  
 1   cigsPerDay  4240 non-null   float64
 2   BPMeds      4240 non-null   float64
 3   totChol     4240 non-null   float64
 4   sysBP       4240 non-null   float64
 5   diaBP       4240 non-null   float64
 6   TenYearCHD  4240 non-null   int64  
dtypes: float64(5), int64(2)
memory usage: 232.0 KB


In [9]:
# Step 3: Separate the predictors (X) from the target (Y).
# Y is selected with a list, so it stays a one-column DataFrame and later
# cells can address it as Y[...]['TenYearCHD'].
feature_cols = ['age','cigsPerDay','BPMeds','totChol','sysBP','diaBP']
target_cols = ['TenYearCHD']

X = df_new[feature_cols]
Y = df_new[target_cols]

In [10]:
# Step 4: Split into training (80%) and test (20%) sets with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.8, random_state=1234
)

# Row counts of the four pieces, in the order they were returned
tuple(len(part) for part in (X_train, X_test, Y_train, Y_test))

(3392, 848, 3392, 848)

Check whether the training data set is balanced

In [14]:
# Number of training rows in each class of the target variable
train_target = Y_train['TenYearCHD']
print('count of 1 in Y_train is:', (train_target == 1).sum())
print('count of 0 in Y_train is:', (train_target == 0).sum())

count of 1 in Y_train is: 513
count of 0 in Y_train is: 2879


Our training data has 2879 observations in category 0 (the person will not get a heart attack in the next 10 years),
whereas only 513 data points are in category 1 (the person will get a heart attack in the next 10 years).

We must first balance our training dataset by adding more data points to category 1.

Data balancing methods are: oversampling and undersampling.

a. Oversampling is applied to the minority category. The method we will follow is SMOTE.
b. Undersampling is applied to the majority category.

In [15]:
#SMOTE on X_train and Y_train
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, imbalanced-learn, imblearn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0 joblib-1.2.0


# SMOTE

In [16]:
# Import the functionality of SMOTE

from imblearn.over_sampling import SMOTE

In [17]:
# Balance the training data with SMOTE.
# Only the training split is passed to fit_resample; X_test / Y_test are not
# touched here.

# 1. Create the SMOTE object with a fixed seed
sm = SMOTE(random_state=1234)

# 2. Fit it on the training split and collect the resampled X and Y
resampled = sm.fit_resample(X_train, Y_train)
X_train_new, Y_train_new = resampled

In [18]:
# Number of rows in each class of the target after resampling
balanced_target = Y_train_new['TenYearCHD']
print('count of 1 in Y_train_new is:', (balanced_target == 1).sum())
print('count of 0 in Y_train_new is:', (balanced_target == 0).sum())

count of 1 in Y_train_new is: 2879
count of 0 in Y_train_new is: 2879


In [23]:
# Step 4: Creating the model using the training data set

# step a: Create a model object (default hyper-parameters)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()

# step b: Fit the model object on the SMOTE-balanced training data.
# The target is passed as the 1-D 'TenYearCHD' Series rather than the
# one-column DataFrame: scikit-learn expects y of shape (n_samples,) and
# otherwise has to flatten it itself, raising a DataConversionWarning.
model = LR.fit(X_train_new, Y_train_new['TenYearCHD'])
model

LogisticRegression()

In [28]:
# Step 5: Predict the values on test data using your model
# assign() returns a new frame holding the actual labels plus the predictions.
# Writing a column straight into Y_test would mutate a split produced by
# train_test_split (which can trigger pandas' SettingWithCopyWarning) and lets
# columns from earlier experiments pile up on the frame between runs.
Y_test = Y_test.assign(predicted_TenyearCHD=model.predict(X_test))

In [29]:
# Show the actual labels next to the predicted ones for the test rows
Y_test

Unnamed: 0,TenYearCHD,predicted,predicted_TenyearCHD
1226,0,0,0
1011,0,1,1
165,0,1,1
1311,0,1,1
1712,0,0,0
...,...,...,...
2981,1,1,1
374,0,1,1
2014,0,0,0
2010,0,0,0


In [33]:
# Step 6: Evaluate the model on the test set, starting with the confusion matrix
# (rows = actual class, columns = predicted class)
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

actual_labels = Y_test['TenYearCHD']
predicted_labels = Y_test['predicted_TenyearCHD']
confusion_matrix(actual_labels, predicted_labels)

array([[473, 244],
       [ 49,  82]], dtype=int64)

In [32]:
# Accuracy = correct predictions / all predictions.
# From the confusion matrix: correct = 473 + 82, incorrect = 244 + 49.
accuracy_score(
    y_true=Y_test['TenYearCHD'],
    y_pred=Y_test['predicted_TenyearCHD'],
)

0.6544811320754716

Accuracy of the model is 65.45%

In [35]:
# Per-class precision, recall and F1 on the test set.
# Ideally the recall values of the two classes are similar, which indicates
# the model is not biased towards one class.
report_text = classification_report(
    Y_test['TenYearCHD'],
    Y_test['predicted_TenyearCHD'],
)
print(report_text)

              precision    recall  f1-score   support

           0       0.91      0.66      0.76       717
           1       0.25      0.63      0.36       131

    accuracy                           0.65       848
   macro avg       0.58      0.64      0.56       848
weighted avg       0.81      0.65      0.70       848



Now we can see that the two recall values are close to each other, so this model is preferable to the previous one.

Even though accuracy dropped, the recall values are in a similar range, so the model knows how to classify both 1 and 0.

The previous model (logistic regression) had an accuracy score of 85%, but its recall for classifying something as 1 was just 7% and
its recall for category 0 was 100%. This means that model was biased towards classifying everything as 0; in other words, the
model did not know how to classify something as 1.

Current model (logistic regression with SMOTE): even though accuracy dropped from 85% to 65%, it is still considered a good model,
and the recall values are in a similar range, so the model classifies 1 and 0 with similar recall. Our model is not biased in nature.
Note, however, that precision for category 1 is only 0.25, so most of the cases predicted as 1 are false alarms.