In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns

In [2]:
data = pd.read_csv("heart.csv")

In [3]:
data.shape

(303, 14)

In [4]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
#Checking if the target value which is predicting heart attack or not is balanced or not
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
#test train split
train_X = data.drop('target',axis=1)
train_y = data['target']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(242, 13) (61, 13) (242,) (61,)


In [7]:
#Imbalanced target values hence accuracy score cannot be used as a measure for confirming modesl accuracy
y_train.value_counts()

1    133
0    109
Name: target, dtype: int64

In [8]:
X_train.dtypes.value_counts()

int64      12
float64     1
dtype: int64

In [9]:
X_train.isnull().sum() > 0

age         False
sex         False
cp          False
trestbps    False
chol        False
fbs         False
restecg     False
thalach     False
exang       False
oldpeak     False
slope       False
ca          False
thal        False
dtype: bool

In [10]:
X_train['restecg'].value_counts()

1    126
0    112
2      4
Name: restecg, dtype: int64

In [11]:
#How to check and decide if a column is categorical or quantitative ?
#Quantitative independent variables needs to be a similar scale as the other features
#Apply chisquare test for Null Hypothesis testing

In [12]:
# This should be used only when there is ordinal categorical values in columns
# from scipy.stats import stats, spearmanr
# pvalue = spearmanr(y_train,X_train['age'])
# pvalue

# for col in X_train.columns:
#     pvalue = spearmanr(y_train,X_train[col])
#     if(pvalue[1] > 0.05):
#         print(col)

In [40]:
for col in X_train.columns:
    if(X_train[col].unique().size > 10):
        print(col)

age
trestbps
chol
thalach
oldpeak


In [150]:
age_modified = pd.DataFrame(pd.cut(X_train['age'], 8))

In [151]:
age_modified.rename(columns={'age':'binned_age'}, inplace=True)

In [152]:
newDF = pd.concat([X_train['age'],age_modified], axis=1)

In [153]:
newDF.head().sort_values(by=['age'])

Unnamed: 0,age,binned_age
132,42,"(41.0, 47.0]"
196,46,"(41.0, 47.0]"
75,55,"(53.0, 59.0]"
202,58,"(53.0, 59.0]"
176,60,"(59.0, 65.0]"


In [158]:
age_modified['binned_age'].unique()

[(41.0, 47.0], (53.0, 59.0], (59.0, 65.0], (35.0, 41.0], (47.0, 53.0], (65.0, 71.0], (71.0, 77.0], (28.952, 35.0]]
Categories (8, interval[float64]): [(28.952, 35.0] < (35.0, 41.0] < (41.0, 47.0] < (47.0, 53.0] < (53.0, 59.0] < (59.0, 65.0] < (65.0, 71.0] < (71.0, 77.0]]

In [13]:
from sklearn.feature_selection import chi2

In [14]:
chi2, pval = chi2(X_train,y_train)

In [15]:
pval

array([5.99483352e-06, 5.58862286e-03, 8.04169118e-11, 1.70514926e-02,
       2.60592086e-03, 9.89288517e-01, 3.52448047e-01, 7.81650644e-29,
       1.12673139e-08, 2.82261693e-14, 6.87361846e-03, 2.72124999e-12,
       3.99653533e-02])

In [16]:
pvalues = pd.Series(data=pval, index=X_train.columns)

In [17]:
pvalues[pvalues>0.05]

fbs        0.989289
restecg    0.352448
dtype: float64

In [18]:
X_train.drop(labels=['fbs','restecg'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# cross_table = pd.crosstab(target, independent_variable margins = True)
# chi2_value, p, dof, ex_table = stats.chi2_contingency(cross_table, correction=False)

In [19]:
from sklearn.linear_model import LogisticRegression 
regressor = LogisticRegression() 
regressor.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Train Set Scores

In [20]:
train_prediction = regressor.predict(X_train)

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [22]:
accuracy_score(y_train, train_prediction)

0.8553719008264463

In [23]:
con_mat_train = confusion_matrix(y_train, train_prediction)
con_mat_train

array([[ 85,  24],
       [ 11, 122]], dtype=int64)

In [24]:
print(classification_report(y_train, train_prediction))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83       109
           1       0.84      0.92      0.87       133

    accuracy                           0.86       242
   macro avg       0.86      0.85      0.85       242
weighted avg       0.86      0.86      0.85       242



### Test Set Scores

In [25]:
X_test.drop(labels=['fbs','restecg'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [26]:
test_prediction = regressor.predict(X_test)

In [27]:
accuracy_score(y_test, test_prediction)

0.8852459016393442

In [28]:
con_mat_test = confusion_matrix(y_test, test_prediction)
con_mat_test

array([[25,  4],
       [ 3, 29]], dtype=int64)

In [29]:
print(classification_report(y_test, test_prediction))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61

