# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

# Loading the Data

In [None]:
# Read the heart CSV into a DataFrame and preview its first rows.
raw_data = pd.read_csv(
    '../input/health-care-data-set-on-heart-attack-possibility/heart.csv'
)
raw_data.head()

In [None]:
# Summary statistics of the loaded data; include='all' requests every column
# rather than only the numeric ones.
raw_data.describe(include='all') ### describing the data

#### From the summary statistics table, we can conclude that there are no missing values in the data

# Analyzing the data

In [None]:
# Scatter the 'target' column against 'age' to eyeball any relationship.
plt.scatter(raw_data['age'], raw_data['target'])
plt.xlabel('age')
# The y-axis is the 'target' column of the heart-attack-possibility data set;
# the earlier "deaths from heart attack" label described something the
# plotted column is not shown to contain.
plt.ylabel('target (heart attack possibility)')
plt.show()

### As the scatter plot shows, there is no clear relation between age and heart attack




In [None]:
# Distribution of the 'age' column: density-normalised histogram with a KDE
# overlay. sns.distplot is deprecated in current seaborn releases; histplot
# with kde=True, stat='density' and cut=3 is the documented replacement that
# reproduces the old default look.
sns.histplot(raw_data['age'], kde=True, stat='density', kde_kws={'cut': 3})

# LOGISTIC MODEL

In [None]:
# Work on a copy so the raw frame stays untouched, then list the column names.
data = raw_data.copy()
data.columns.values

In [None]:
# Names of the predictor columns used in the first (full) logistic model.
estimators = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
]

## Describing the features and target

In [None]:
# Dependent variable: the 'target' column.
y = data['target']
# Independent variables: the columns listed in `estimators`.
x1 = data[estimators]

## Creating regression

In [None]:
# Add the constant (intercept) column to the features.
x = sm.add_constant(x1)
# Set up the logistic regression of y on x, then fit it.
reg_log = sm.Logit(y, x)
result_log = reg_log.fit()

In [None]:
# Display the summary table of the first fitted model; the p-values read
# from it drive the variable selection that follows.
result_log.summary() ### MLE table

## As we can see, the p-values of AGE and FBS are high, and hence these variables are insignificant in our regression model

In [None]:
# Drop the insignificant variables ('age' and 'fbs') into a new frame;
# `data` itself is left unchanged.
data_new = data.drop(labels=['age', 'fbs'], axis=1)

In [None]:
# Preview the first rows of the reduced frame.
data_new.head()

In [None]:
# List the column names that remain in the reduced frame.
data_new.columns.values

In [None]:
# Predictor columns for the reduced model (the earlier list without
# 'age' and 'fbs').
estimators_new = [
    'sex',
    'cp',
    'trestbps',
    'chol',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
]

## Again running the regression after dropping insignificant variables

In [None]:
# Dependent variable for the reduced model.
y_ = data_new['target']
# Independent variables for the reduced model.
x1_ = data_new[estimators_new]

In [None]:
# Add the constant (intercept) column to the reduced feature set.
x_ = sm.add_constant(x1_)
# Set up and fit the logistic regression on the reduced feature set.
reg_log_ = sm.Logit(y_, x_)
result_log_ = reg_log_.fit()

In [None]:
# Display the summary table of the model refitted without 'age' and 'fbs'.
result_log_.summary()

In [None]:
# Odds ratio for 'sex': exponentiate the coefficient taken directly from the
# refitted model. The previous version hard-coded -1.7640, presumably the
# 'sex' coefficient copied by hand from the summary table (confirm); reading
# it from the results keeps the value in sync if the data or model changes.
np.exp(result_log_.params['sex'])

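### From this we can conclude that, given the same other attributes, the ODDS of getting a heart attack for a male are about 0.17 times those for a female (exp(-1.7640) is about 0.17), i.e. roughly 83% lower rather than "0.17 higher" (assuming sex = 1 encodes male).
## This value may not be very significant; the interpretation above is given for knowledge purposes.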

# Calculating the Accuracy of the model

## Confusion matrix

In [None]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (without the actual values), e.g. const, var1, var2, etc.
        Column order is very important.
    actual_values : data frame column or array
        The observed outcomes. For a logistic regression this should be a
        single column of 0s and 1s.
    model : fitted results object
        Any object exposing ``predict(data)`` that returns predicted
        probabilities, e.g. a statsmodels ``LogitResults``.
    threshold : float, optional
        Probability cut-off: predictions in ``[0, threshold)`` count as
        class 0 and predictions in ``[threshold, 1]`` count as class 1.
        Defaults to 0.5, the value that used to be hard-coded.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts; rows are the actual class (0, 1) and columns
        are the predicted class (0, 1).
    accuracy : float
        ``(cm[0, 0] + cm[1, 1]) / cm.sum()``; ``nan`` when no observation
        was counted (e.g. every value lies outside the bins).

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Predict the values using the fitted model.
    pred_values = np.asarray(model.predict(data))

    # Actual values are split at 0.5 (0 -> first bin, 1 -> second bin);
    # predicted probabilities are split at the chosen threshold.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])

    # A 2D histogram over (actual, predicted) with two bins per axis is
    # exactly the confusion matrix.
    cm = np.histogram2d(
        actual_values, pred_values, bins=[actual_bins, pred_bins]
    )[0]

    # Guard the 0/0 case explicitly instead of relying on a numpy warning.
    total = cm.sum()
    if total == 0:
        return cm, float("nan")

    accuracy = (cm[0, 0] + cm[1, 1]) / total
    return cm, accuracy

In [None]:
# Confusion matrix and accuracy of the reduced model, evaluated on x_ / y_,
# the same data the model was fitted on (in-sample).
confusion_matrix(x_,y_,result_log_)

### Hence this model is 85% accurate on the data it was fitted on (in-sample accuracy; no held-out test set is used)