In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This seemed like a fun dataset to do a quick analysis on. I'll use a stacked classifier to see if I can determine the sex of these patients based on the other fields.

Having little understanding of the data, I have skipped feature engineering. It seems to me that a bit more domain knowledge would allow better predictions through the creation of new features.

In [None]:
# Read the heart dataset into a DataFrame and print its column summary.
HEART_CSV = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
heart_df = pd.read_csv(HEART_CSV)
heart_df.info()

In [None]:
# Inspect the target column: list its distinct values, then plot how
# often each one occurs.
sex_values = heart_df['sex']
print(sex_values.unique())
sex_values.hist()

The histogram above shows that the two sex classes are imbalanced. Not a great start, but easily dealt with.

In [None]:
# Target is the 'sex' column; the features are every remaining column.
y = heart_df['sex']
X = heart_df.drop(columns='sex')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

# Baseline model: gradient-boosted trees fitted on the original (unresampled)
# data, reported as the mean of the 5-fold cross-validation scores.
clf = GradientBoostingClassifier(
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    random_state=0,
)
scores = cross_val_score(clf, X, y, cv=5)
print(scores.mean())

70% is my baseline established above. Using synthetic samples I hope to do better than that.

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# Seed SMOTE like every other estimator in this notebook so the synthetic
# samples, and every score that depends on them, are reproducible.
smt = SMOTE(random_state=0)

# Oversampled copy of the whole dataset, kept available for any later cell
# that refers to X_sm / y_sm.
X_sm, y_sm = smt.fit_resample(X, y)

# Evaluate with SMOTE *inside* the cross-validation pipeline. Resampling the
# full dataset before splitting lets synthetic points interpolated from
# validation rows end up in the training folds (data leakage) and scores the
# model on a rebalanced class distribution. With the imblearn pipeline the
# sampler runs only when fitting on each training fold; every validation fold
# contains untouched, real rows, so the score is comparable with the baseline
# computed on X, y.
smote_clf = make_pipeline(smt, clf)
scores = cross_val_score(smote_clf, X, y, cv=5)
print(scores.mean())

80% is a pretty big improvement for very little effort with synthetic sampling.

Now we can create a quick stacked classifier to see if we can improve on this. It would be ideal to use a grid search or Bayesian optimisation to tune the hyperparameters, but I want to keep it simple.

In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesClassifier

clf_2 = ExtraTreesClassifier(random_state=0)

# Score on the original data with a seeded SMOTE step inside the pipeline, so
# oversampling happens only on each training fold. Cross-validating on a
# dataset that was oversampled up front leaks synthetic neighbours of the
# validation rows into training and inflates the estimate.
extratrees_smote = make_pipeline(SMOTE(random_state=0), clf_2)
scores = cross_val_score(extratrees_smote, X, y, cv=5)
print(scores.mean())


In [None]:
from catboost import CatBoostClassifier
from imblearn.pipeline import make_pipeline

clf_3 = CatBoostClassifier(random_state=0, verbose=False)

# Score on the original data with a seeded SMOTE step inside the pipeline, so
# oversampling happens only on each training fold. Cross-validating on a
# dataset that was oversampled up front leaks synthetic neighbours of the
# validation rows into training and inflates the estimate.
catboost_smote = make_pipeline(SMOTE(random_state=0), clf_3)
scores = cross_val_score(catboost_smote, X, y, cv=5)
print(scores.mean())

In [None]:
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stack the three tree-based models; a logistic regression combines their
# predictions into the final decision.
stack_clf = StackingClassifier(
    estimators=[('extratrees', clf_2), ('gradboost', clf), ('catboost', clf_3)],
    final_estimator=LogisticRegression(),
    verbose=0,
)

# Score on the original data with a seeded SMOTE step inside the pipeline, so
# oversampling happens only on each training fold. Cross-validating on a
# dataset that was oversampled up front leaks synthetic neighbours of the
# validation rows into training and inflates the estimate.
stack_smote = make_pipeline(SMOTE(random_state=0), stack_clf)
scores = cross_val_score(stack_smote, X, y, cv=5)
print(scores.mean())

81.4% is the final result. Better than I expected coming in. 