In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

### Scenario 1: HEART DISEASE PREDICTION USING
    LOGISTIC REGRESSION
    Industry: Healthcare
    
    World Health Organization has estimated 12 million deaths occur worldwide, every year due to Heart
    diseases. Half the deaths in the United States and other developed countries are due to cardio vascular
    diseases. The early prognosis of cardiovascular diseases can aid in making decisions on lifestyle changes in
    high risk patients and in turn reduce the complications. This research intends to pinpoint the most
    relevant/risk factors of heart disease as well as predict the overall risk using logistic regression.
    
    Tasks to be Performed
    Import the required libraries
    Prepare the data (Data Proprocessing)
    Check for missing value
    Explore the data using EDA
    Training a logistic regression model
    Evaluating the model


In [2]:
df = pd.read_csv('/content/framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [4]:
print(f"Shape of the data: {df.shape}")

Shape of the data: (4240, 16)


In [5]:
missing_columns = ['education', 'cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'heartRate', 'glucose']

for column in missing_columns:
    print(f"Column: {column}, Mean: {np.round(df[column].mean(), 3)}, Median: {df[column].median()}, Mode: {df[column].mode()[0]}")

Column: education, Mean: 1.979, Median: 2.0, Mode: 1.0
Column: cigsPerDay, Mean: 9.006, Median: 0.0, Mode: 0.0
Column: BPMeds, Mean: 0.03, Median: 0.0, Mode: 0.0
Column: totChol, Mean: 236.7, Median: 234.0, Mode: 240.0
Column: BMI, Mean: 25.801, Median: 25.4, Mode: 22.19
Column: heartRate, Mean: 75.879, Median: 75.0, Mode: 75.0
Column: glucose, Mean: 81.964, Median: 78.0, Mode: 75.0


In [6]:
df['education'].fillna(df['education'].median(), inplace=True)
df['cigsPerDay'].fillna(df['cigsPerDay'].mean(), inplace=True)
df['BPMeds'].fillna(df['BPMeds'].mean(), inplace=True)
df['totChol'].fillna(df['totChol'].median(), inplace=True)
df['BMI'].fillna(df['BMI'].median(), inplace=True)
df['heartRate'].fillna(df['heartRate'].median(), inplace=True)
df['glucose'].fillna(df['glucose'].median(), inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4240 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4240 non-null   float64
 5   BPMeds           4240 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4240 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4240 non-null   float64
 13  heartRate        4240 non-null   float64
 14  glucose          4240 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [8]:
!pip install imbalanced-learn



In [14]:
from imblearn.over_sampling import SMOTE

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

print(f'Shape of X and y before SMOTE: {X.shape}, {y.shape}')

sm = SMOTE()
X, y = sm.fit_sample(X, y)

print(f'Shape of X and y before SMOTE: {X.shape}, {y.shape}')

Shape of X and y before SMOTE: (4240, 15), (4240,)
Shape of X and y before SMOTE: (7192, 15), (7192,)


In [17]:
pd.Series(y).value_counts()

1    3596
0    3596
dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.2)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

lr = LogisticRegression()

Shape of X_train: (5753, 15)
Shape of X_test: (1439, 15)
Shape of y_train: (5753,)
Shape of y_test: (1439,)


In [19]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred = lr.predict(X_test)

In [21]:
confusion_matrix(y_test, y_pred)

array([[437, 308],
       [229, 465]])

In [22]:
precision_score(y_test, y_pred)

0.6015523932729625

In [23]:
recall_score(y_test, y_pred)

0.670028818443804

In [24]:
f1_score(y_test, y_pred)

0.6339468302658486

In [25]:
accuracy_score(y_test, y_pred)

0.6268241834607367