In [None]:
import os
import sys
import warnings

# Silence every warning for the rest of the notebook, unless the interpreter
# was started with explicit warning options (sys.warnoptions is non-empty).
# NOTE(review): this also hides library warnings raised by later cells
# (e.g. during model fitting) — confirm that is intended.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier


# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

**I will read the data and consider the hypotheses that arise from analyzing the data**
# #1 Getting Started

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into `df`,
# the working frame used by every later cell.
df = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# Summarise columns, dtypes and non-null counts to spot missing values.
df.info()

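**I think that deleting the rows with missing data will not affect further learning,  
and filling them in with the mean is not a good idea. For this reason, we should just delete them.  
Note: the next cell nevertheless fills the missing `bmi` values with the column mean instead of deleting those rows.**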

In [None]:
# Replace missing BMI values with the mean of the observed BMI values,
# then re-print the summary to confirm the non-null counts.
df = df.assign(bmi=df['bmi'].fillna(df['bmi'].mean()))
df.info()

**I will now look at the uniqueness of the categorical data**

In [None]:
# Print the distinct values of every object-dtype (categorical) column.
# A plain loop is used instead of a list comprehension: the comprehension was
# run only for its side effect and left a list of None as the cell output.
for col_name in df.columns:
    if df[col_name].dtype == 'object':
        print(f'{col_name}: {df[col_name].unique()}')

**It seems to me that the ever_married and Residence_type fields are not the important ones here. Why do I think so?  
I'm not an expert on this and would have to read more, but I don't think a stroke depends on whether a person is married or not.  
I think the same about the type of residence: wherever a person lives, they are not immune to it.  
To prove or disprove this, I will construct graphs.**

## #1.1 Visualization of married and unmarried people

In [None]:
# Number of records per stroke outcome, computed separately for people who
# have ever been married ('Yes') and those who have not ('No').
married, unmarried = (
    df.loc[df['ever_married'] == answer]
    .groupby('stroke')['ever_married']
    .count()
    .reset_index()
    for answer in ('Yes', 'No')
)

**As the graphs below show, the possibility of stroke is greater for married people.  
Apparently I was wrong about this field being unnecessary.  
Most likely the risk does not depend on whether the person was ever married,  
but on the person's age, i.e. age is the real dependence here.**

In [None]:
# Draw the per-outcome record counts for married and unmarried people in the
# first two cells of a 2x2 subplot grid.
plt.figure(figsize=(15, 12))
for slot, (panel_title, panel_data) in enumerate(
        [('Married', married), ('Unmarried', unmarried)], start=1):
    plt.subplot(2, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='stroke', y='ever_married', hue='stroke', data=panel_data)
plt.show()

## #1.2 Urban and rural visualization

In [None]:
# Number of records per stroke outcome for urban and rural residents.
urban, rural = (
    df.loc[df['Residence_type'] == area]
    .groupby('stroke')['Residence_type']
    .count()
    .reset_index()
    for area in ('Urban', 'Rural')
)

**As we can see on the graphs, the data are not very different,  
so I think this field is really not very necessary, so I will  
eliminate it for training in the future.**

In [None]:
# Draw the per-outcome record counts for urban and rural residents in the
# first two cells of a 2x2 subplot grid.
plt.figure(figsize=(15, 12))
for slot, (panel_title, panel_data) in enumerate(
        [('Urban', urban), ('Rural', rural)], start=1):
    plt.subplot(2, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='stroke', y='Residence_type', hue='stroke', data=panel_data)
plt.show()

## #1.3 Visualization of work types 

In [None]:
# Number of records per stroke outcome for each work_type category.
private, self_employed, govt_job, children, never_worked = (
    df.loc[df['work_type'] == category]
    .groupby('stroke')['work_type']
    .count()
    .reset_index()
    for category in ('Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked')
)

**As we can see from the graphs, they are almost identical to each other.  
Only two graphs differ from the rest: children and people who have never worked.  
So I don't think anything useful can be predicted from these charts and this data, because the values are similar, differing by only ~25 units.**

In [None]:
# Draw the per-outcome record counts for the first four work types in a
# 2x2 subplot grid.
plt.figure(figsize=(15, 12))
for slot, (panel_title, panel_data) in enumerate(
        [('Private', private),
         ('Self-employed', self_employed),
         ('Govt_job', govt_job),
         ('Children', children)], start=1):
    plt.subplot(2, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='stroke', y='work_type', hue='stroke', data=panel_data)
plt.show()

# The fifth category does not fit the 2x2 grid, so it gets its own figure.
# Title fixed: it previously read 'Nver_worked'.
plt.title('Never_worked')
sns.barplot(x='stroke', y='work_type', hue='stroke', data=never_worked)
plt.show()

## #1.4 Visualization by status of smoking

In [None]:
# Number of records per stroke outcome for each smoking_status category.
formerly_smoked, never_smoked, smokes, unknown = (
    df.loc[df['smoking_status'] == status]
    .groupby('stroke')['smoking_status']
    .count()
    .reset_index()
    for status in ('formerly smoked', 'never smoked', 'smokes', 'Unknown')
)

**What can we tell from these graphs?  
I think that these graphs again do not reflect the possibility of stroke very well.**

In [None]:
# Draw the per-outcome record counts for the four smoking statuses in a
# 2x2 subplot grid.
plt.figure(figsize=(15, 12))
for slot, (panel_title, panel_data) in enumerate(
        [('Formerly smoked', formerly_smoked),
         ('Never smoked', never_smoked),
         ('Smokes', smokes),
         ('Unknown', unknown)], start=1):
    plt.subplot(2, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='stroke', y='smoking_status', hue='stroke', data=panel_data)
plt.show()

## #1.5 Visualization by gender

In [None]:
# Number of records per stroke outcome for each gender value.
male, female, other = (
    df.loc[df['gender'] == label]
    .groupby('stroke')['gender']
    .count()
    .reset_index()
    for label in ('Male', 'Female', 'Other')
)

**On these graphs we can also see the similarity of these data, it turns out that there is nothing to judge from them.**

In [None]:
# Draw the per-outcome record counts for each gender value in the first
# three cells of a 2x2 subplot grid.
plt.figure(figsize=(15, 12))
for slot, (panel_title, panel_data) in enumerate(
        [('Male', male), ('Female', female), ('Other', other)], start=1):
    plt.subplot(2, 2, slot)
    plt.title(panel_title)
    sns.barplot(x='stroke', y='gender', hue='stroke', data=panel_data)
plt.show()

## Summary of these charts.
**I dare to assume that not much can be concluded from these features, because  
I did not notice a big difference: the data look very similar even though they belong to different categories.  
Having redone this work, I don't think I should exclude any columns. Let's see what happens.**

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

**You can see the minimum age of ~0, I do not know if this is good or not.**

# #2 Preprocessing

In [None]:
# Replace each object-dtype column with integer codes.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`,
# so every column's code -> label mapping stays available afterwards
# (a single shared encoder only remembers the last column it was fitted on).
le = LabelEncoder()
object_col = [col for col in df.columns if df[col].dtype == 'object']
label_encoders = {}
for col in object_col:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Re-check dtypes after encoding: no object columns should remain.
df.info()

# #3 Modeling

In [None]:
# Features are every column except the last one; the target is the last column.
X = df.iloc[:, :-1]
# Take the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect a 1-D y and otherwise raise a
# DataConversionWarning (which this notebook's warning filter would hide).
y = df.iloc[:, -1]
# 58% train / 42% test, with a fixed seed so the split is reproducible.
# `X_test` still contains the `id` column; it is needed later to build the
# submission file, so the id-free copy used for prediction is created separately.
x_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.42,
                                                    random_state=42)

In [None]:
# Remove the record identifier so the models do not train on it.
# errors='ignore' keeps this cell re-runnable: x_train is overwritten in place,
# so on a second run the 'id' column is already gone.
x_train = x_train.drop(columns=['id'], errors='ignore')
# X_test is left untouched (it keeps 'id' for the submission); x_test is the
# id-free copy passed to the models.
x_test = X_test.drop(columns=['id'])

In [None]:
# Row labels shared by every per-model results frame (order matches [accuracy, f1]).
indexes = ['accuracy', 'f1_score']

## #3.1 LogisticRegression

In [None]:
# Fit a logistic-regression baseline with scikit-learn's default hyper-parameters.
# NOTE(review): all warnings are silenced at the top of the notebook, so a
# ConvergenceWarning (if any) would be hidden here — TODO confirm the fit converges.
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [None]:
# Score logistic regression on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy and binary
# F1 happen to be symmetric, but the documented order keeps the cell correct
# if an asymmetric metric (precision, recall) is added later.
lr_pred = lr.predict(x_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
lr_frame = pd.DataFrame({
    'LogisticRegression': [lr_acc, lr_f1]
}, index=indexes)

## #3.2 RandomForestClassifier

In [None]:
# Fit a random forest with default hyper-parameters.
# The forest is randomised (bootstrap samples, feature sub-sampling), so a
# fixed seed is required for the reported scores to be reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)

In [None]:
# Score the random forest on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy and binary
# F1 happen to be symmetric, but the documented order keeps the cell correct
# if an asymmetric metric (precision, recall) is added later.
rfc_pred = rfc.predict(x_test)
rfc_acc = accuracy_score(y_test, rfc_pred)
rfc_f1 = f1_score(y_test, rfc_pred)
rfc_frame = pd.DataFrame({
    'RandomForestClassifier': [rfc_acc, rfc_f1]
}, index=indexes)

## #3.3 DecisionTreeClassifier

In [None]:
# Fit a single decision tree with default hyper-parameters.
# Tie-breaking between equally good splits is randomised, so a fixed seed is
# set to make the reported scores reproducible.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

In [None]:
# Score the decision tree on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy and binary
# F1 happen to be symmetric, but the documented order keeps the cell correct
# if an asymmetric metric (precision, recall) is added later.
tree_pred = tree.predict(x_test)
tree_acc = accuracy_score(y_test, tree_pred)
tree_f1 = f1_score(y_test, tree_pred)
tree_frame = pd.DataFrame({
    'DecisionTreeClassifier': [tree_acc, tree_f1]
}, index=indexes)

## #3.4 XGBClassifier

In [None]:
# Fit an XGBoost classifier with its default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

In [None]:
# Score XGBoost on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy and binary
# F1 happen to be symmetric, but the documented order keeps the cell correct
# if an asymmetric metric (precision, recall) is added later.
xgb_pred = xgb.predict(x_test)
xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred)
xgb_frame = pd.DataFrame({
    'XGBClassifier': [xgb_acc, xgb_f1]
}, index=indexes)

In [None]:
# Side-by-side comparison table: one column per model, one row per metric.
pd.concat([lr_frame, rfc_frame, tree_frame, xgb_frame], axis=1)

# Saving the result

In [None]:
# Pair the id of every held-out record with its XGBoost prediction, write the
# table to submission.csv (without the index), and display it.
submission = X_test[['id']].assign(stroke=xgb_pred)
submission.to_csv('submission.csv', index=False)
submission

# Summary
**The great thing that we can notice is that even without playing with the parameters  
we can get a decent result. On an ordinary logistic regression. Maybe my result is not  
so good, but I think that, we need more information on stroke. Thank you for checking my work.  
I would be happy to have any comments and evaluations.**  
****
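**f1_score = 0.0 means the model produced no true positives (TP = 0), since $F_1 = \frac{2\,TP}{2\,TP + FP + FN}$; the number of true negatives (TN) does not enter the formula.**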

# Thanks for reading. Don't forget to upvote the work. Good luck kaggling!