# 1. Feature Engineering

In [None]:
## imports used by every cell below: `pd`, `sns` and `plt` are referenced
## throughout the notebook, so they must be in scope on a fresh kernel
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## read the data
df = pd.read_csv('heart.csv')
df.head()

In [None]:
## (number of rows, number of columns) of the raw frame
df.shape

In [None]:
## dtype of every column
df.dtypes

In [None]:
## check for any duplicate values: index labels of the rows that repeat an earlier row
duplicate_mask = df.duplicated()
df_duplicate = df.loc[duplicate_mask].index
df_duplicate

In [None]:
## remove duplicates if any
## (rebinding `df` instead of inplace=True avoids in-place mutation; the
## original index labels are kept, exactly as with the in-place variant)
df = df.drop_duplicates()

## show the remaining (rows, columns) rather than dumping the whole frame
df.shape

In [None]:
## missing values: number of null entries per column
df.isnull().sum()

In [None]:
## histograms of the columns of df, drawn on a 15x20 figure
df.hist(figsize=(15,20))

In [None]:
## checking the correlation between the variables
## (the matrix is computed once and reused; a bare `df.corr()` that is not the
## last expression of a cell is evaluated and then thrown away)
corr_matrix = df.corr()
sns.heatmap(corr_matrix)

# 2. Feature Selection

In [None]:
## selecting the features that are most important to the outcome

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## the first 13 columns are the candidate features; the last column is used as the label
x = df.iloc[:, :13]
y = df.iloc[:, -1]

## fit SelectKBest with the chi-squared score function (k=10) on all candidates
top_features = SelectKBest(score_func=chi2, k=10)
fit = top_features.fit(x, y)
df_scores = pd.DataFrame(fit.scores_)
df_cols = pd.DataFrame(x.columns)

## one row per candidate feature: its name and its score against the output
feature_scores = pd.DataFrame({"Features": x.columns, "Scores": fit.scores_})
feature_scores

In [None]:
feature_scores=feature_scores.sort_values(by="Scores",ascending=False)
feature_scores

In [None]:
## lets plot the features vs scores 
plt.figure(figsize=(15,5))
sns.barplot(x="Features",y="Scores",data=feature_scores)

In [None]:
## Now we will select the top 10 features based on scores
feature_scores_10=feature_scores["Features"][:10].to_list()
feature_scores_10

In [None]:
## creating new dataframe with the top 10 features only and the output

## column order is kept exactly as listed: it becomes the feature order the models are trained on
top_10_columns = [
    'thalach', 'oldpeak', 'ca', 'cp', 'exang',
    'age', 'chol', 'trestbps', 'slope', 'sex',
]
df_10 = df[top_10_columns + ['target']]
df_10.head()

In [None]:
## correlation heatmap restricted to the columns kept in df_10
sns.heatmap(df_10.corr())

In [None]:
## check for outliers: summary statistics of every column of df_10
df_10.describe()

In [None]:
## check for outliers: box plots of df_10 drawn on a shared axis
sns.boxplot(data=df_10)

In [None]:
## check for outliers: pairwise plots of the df_10 columns, coloured by the `sex` column
sns.pairplot(df_10,hue="sex")

In [None]:
## from the boxplot we see that chol has some outliers, which we will try to identify and remove
## (the column is passed as x=: seaborn >= 0.12 interprets the first positional
## argument as `data`, and older versions warn about the positional form)
sns.boxplot(x=df_10.chol)

In [None]:
## indentifying the outliers
outliers=df[df_10.chol>450].index
outliers

In [None]:
## dropping the outliers
after_drop=df_10.drop(index=outliers)
after_drop.shape

In [None]:
sns.boxplot(after_drop.chol)

In [None]:
## cleaned data

df_cleaned=after_drop
df_cleaned.shape

In [None]:
df_cleaned.head()

# 3. Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()

# applying minmax scaler transformation to my df_cleaned 

df_scaled=pd.DataFrame(scaler.fit_transform(df_cleaned),columns=df_cleaned.columns)
df_scaled.head()

In [None]:
df_scaled.describe()

In [None]:
## Test Train data split
from sklearn.model_selection import train_test_split

y = df_scaled["target"]
x = df_scaled.drop("target", axis=1)

## 70% of the rows go to the training split. random_state is fixed so that the
## split -- and every score and conclusion derived from it below -- is the same
## on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=.7, random_state=42
)

In [None]:
## Now we will check whether the data is balanced or not
## (printed explicitly: a bare expression in the middle of a cell is never displayed)
print(df_scaled.target.value_counts())

## if the classes are balanced no undersampling or oversampling is needed; plot the class counts
## (the column is passed as x=: with seaborn >= 0.12 a positional Series is taken
## as wide-form `data` and no longer produces one bar per class)
sns.countplot(x=df_scaled.target)
plt.xlabel("Heart Disease")
plt.ylabel("No. of people")

### 3.1 Model building and evaluation

In [None]:
## Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

logit=LogisticRegression()
logit.fit(x_train,y_train)
y_pred=logit.predict(x_test)
print("The accuracy score for Logistic is {}".format(accuracy_score(y_test,y_pred)))
print("The precision score for Logistic is {}".format(precision_score(y_test,y_pred)))

sns.heatmap(pd.DataFrame(confusion_matrix(y_test,y_pred)),annot=True)
plt.title("Confusion matrix")
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")

## Now we will check model is underfit/overfit (if any)
accuracy_test=logit.score(x_test,y_test)
accuracy_train=logit.score(x_train,y_train)

print(" Accuracy score in Train Data : {}".format(accuracy_train))
print(" Accuracy score in Test Data : {}".format(accuracy_test))

## Model looks fine

In [None]:
## Decision Tree

from sklearn.tree import DecisionTreeClassifier

d_tree=DecisionTreeClassifier()
d_tree.fit(x_train,y_train)
y_pred=d_tree.predict(x_test)
print("The accuracy score for DT is {}".format(accuracy_score(y_test,y_pred)))
print("The precision score for DT is {}".format(precision_score(y_test,y_pred)))

sns.heatmap(pd.DataFrame(confusion_matrix(y_test,y_pred)),annot=True)
plt.title("Confusion matrix")
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")

## Now we will check model is underfit/overfit (if any)
accuracy_test=d_tree.score(x_test,y_test)
accuracy_train=d_tree.score(x_train,y_train)

print(" Accuracy score in Train Data : {}".format(accuracy_train))
print(" Accuracy score in Test Data : {}".format(accuracy_test))

## Model is overfit

In [None]:
## KNN

from sklearn.neighbors import KNeighborsClassifier

knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
y_pred=knn.predict(x_test)
print("The accuracy score for KNN is {}".format(accuracy_score(y_test,y_pred)))
print("The precision score for KNN is {}".format(precision_score(y_test,y_pred)))

sns.heatmap(pd.DataFrame(confusion_matrix(y_test,y_pred)),annot=True)
plt.title("Confusion matrix")
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")

## Now we will check model is underfit/overfit (if any)
accuracy_test=knn.score(x_test,y_test)
accuracy_train=knn.score(x_train,y_train)

print(" Accuracy score in Train Data : {}".format(accuracy_train))
print(" Accuracy score in Test Data : {}".format(accuracy_test))

## Model looks fine

## Model Selection

In [None]:
## From the above results we will select the Logistic Regression model based on
## its accuracy and precision scores, and because it does not look underfit or
## overfit. Before moving ahead we cross-validate it to verify that the model
## will perform well on a new data set.

from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

## The scaler is placed inside the pipeline so that it is re-fitted on the
## training part of every fold; the held-out part then never influences the
## min/max used for scaling. Re-scaling the already scaled `x` is harmless:
## min-max scaling gives the same result on linearly rescaled input.
## clone(logit) yields an unfitted estimator with the same hyper-parameters.
cv_model = make_pipeline(MinMaxScaler(), clone(logit))
cv_score=cross_val_score(cv_model,x,y,cv=5)

print("The cross validation score for Logistic Regression is: {0:.2f}%".format(cv_score.mean()*100))

## Model testing with new data

In [None]:
from sklearn.preprocessing import MinMaxScaler

def parameter(path="model.csv"):
    """Predict the heart-disease risk of the people listed in a CSV file.

    The new rows are scaled with the min/max of the training data
    (`df_cleaned`), i.e. with the same per-column scaling that `logit` was
    trained on. Fitting a fresh scaler on the new rows would map them onto a
    different scale and make the predictions meaningless (a single-row file
    would be scaled to all zeros).

    Parameters
    ----------
    path : str, default "model.csv"
        CSV file holding one row per person; it must contain every feature
        column of `df_cleaned` (extra columns are ignored).

    Prints one line per row: "safe" for a predicted 0, "on risk" for a
    predicted 1.
    """
    data=pd.read_csv(path)

    # Training features, in the column order the model was fitted with.
    train_features=df_cleaned.drop(columns="target")
    feature_scaler=MinMaxScaler().fit(train_features)

    # Select and order the columns explicitly so they line up with the model.
    data_transform=pd.DataFrame(
        feature_scaler.transform(data[train_features.columns]),
        columns=train_features.columns,
    )

    final_y=logit.predict(data_transform)

    for index,value in enumerate(final_y):
        if value == 0:
            print("The person {} is safe.".format(index))
        elif value == 1:
            print("The person {} is on risk.".format(index))

parameter()