In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import os
import sys
import pprint
import matplotlib.pyplot as plt
import scipy.stats as scistat
import math
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Stroke Dataset

#### Importing dataset

The dataset is kept in two separate DataFrames: the first (`stroke_df`) is used for data analysis and the second (`stroke_data`) for the prediction model, which uses the dummy (0/1) data.

In [None]:
# Location of the raw CSV; kept in one place so both frames are guaranteed to
# come from the same file.
STROKE_CSV = r'/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'

# Show floats with thousands separators and two decimals in all outputs.
pd.options.display.float_format = '{:,.2f}'.format

# Frame used for the exploratory analysis (it is cleaned and recoded below).
stroke_df = pd.read_csv(STROKE_CSV, index_col='id')

# Independent copy used for the model; copying avoids reading the file a
# second time while still keeping the two frames fully separate.
stroke_data = stroke_df.copy()

#### Getting a first look at data:

In [None]:
def getfirstlook(df, nrows=5, uniqueids=None):
    """Collect a quick structural summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    nrows : int, default 5
        Number of leading rows returned under the 'head' key.
    uniqueids : label or list of labels, optional
        Column selection passed to ``df[...]``; when given, the result of
        ``nunique()`` on that selection is stored under 'uniqueids'.

    Returns
    -------
    dict
        Keys 'head', 'dtypes', 'nrows', 'ncols' and 'index', plus
        'uniqueids' when ``uniqueids`` is not None.
    """
    summary = {
        'head': df.head(nrows),
        'dtypes': df.dtypes,
        'nrows': df.shape[0],
        'ncols': df.shape[1],
        'index': df.index,
    }
    if uniqueids is None:
        return summary
    summary['uniqueids'] = df[uniqueids].nunique()
    return summary

getfirstlook(stroke_df)

#### Searching for the table's missing values

In [None]:
def getmissings(df, byrowperc=False):
    """Summarise missing values by column and by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    byrowperc : bool, default False
        Passed as ``normalize`` to ``value_counts``: when True the per-row
        distribution is returned as proportions instead of counts.

    Returns
    -------
    tuple
        ``(missing_per_column, row_distribution)`` where the first item is
        the number of nulls in each column and the second maps "number of
        nulls in a row" to how many rows have that number, sorted by index.
    """
    nullmask = df.isnull()
    missing_per_column = nullmask.sum()
    missing_per_row = nullmask.sum(axis=1)
    row_distribution = missing_per_row.value_counts(normalize=byrowperc).sort_index()
    return missing_per_column, row_distribution

missingsbycols, missingbyrows = getmissings(stroke_df[['bmi']])

In [None]:
stroke_df.isnull().sum()

In [None]:
missingbyrows

In [None]:
missingsbycols

#### Replacing yes/no answers and dummy (0/1) data with boolean values

In [None]:
# Map the textual answers in 'ever_married' onto booleans.
marry_data = dict(Yes=True, No=False)
ever_married_recoded = stroke_df['ever_married'].replace(marry_data)
stroke_df['ever_married'] = ever_married_recoded

In [None]:
# Cast the flag columns to bool in one step.
bool_columns = ['hypertension', 'heart_disease', 'ever_married', 'stroke']
stroke_df[bool_columns] = stroke_df[bool_columns].astype('bool')

#### Converting float age to integer

In [None]:
# astype('int') truncates toward zero, so any fractional age loses its
# decimal part (an age below 1 becomes 0).
stroke_df['age'] = stroke_df['age'].astype('int')

#### Filling missing values with the column mean

In [None]:
# Impute missing BMI values with each frame's own BMI mean.
for frame in (stroke_df, stroke_data):
    bmi_mean = frame['bmi'].mean()
    frame['bmi'] = frame['bmi'].fillna(bmi_mean)

#### Converting object series into categorical data

In [None]:
# Convert every object (string) column to the 'category' dtype.
# astype() with a per-column mapping is used instead of assigning through
# `.loc[:, mask]`: on pandas >= 2.0 a `.loc` column assignment writes the new
# values into the existing object columns, so the dtype would stay `object`
# and the later select_dtypes(include=['category']) would find nothing.
object_cols = stroke_df.select_dtypes(['object']).columns
stroke_df = stroke_df.astype({col: 'category' for col in object_cols})

#### Counting categorical data

#### The following code writes the frequencies to a text file, which guides the analysis that follows.

In [None]:
def makefreqs(df, outfile):
    """Write frequency and percentage tables for each categorical column.

    For every column of ``df`` whose dtype is ``category``, the column name,
    a separator line, its value counts and its normalised value counts (both
    sorted by category) are written to ``outfile``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns are tabulated.
    outfile : str or path-like
        Destination text file; it is created or overwritten.

    Returns
    -------
    None
    """
    # The context manager closes the file even if formatting a column raises,
    # which the previous open()/close() pair did not guarantee.
    with open(outfile, 'w') as freqout:
        for col in df.select_dtypes(include=['category']):
            print(col, "---------------", 'frequencies',
                  df[col].value_counts().sort_index(), 'percentages',
                  df[col].value_counts(normalize=True).sort_index(),
                  sep='\n\n', end='\n\n\n', file=freqout)
    
makefreqs(stroke_df, 'strokefreqs.txt')

In [None]:
def getcnts(df, cats, rowsel=None):
    """Count rows per category combination and their share of the parent group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    cats : list of column labels
        Grouping columns. The parent group is defined by all but the last
        column; with a single column the parent group is the whole frame.
    rowsel : str, optional
        Expression appended to ``"percs."`` and evaluated to build a row
        filter, e.g. ``"catcnt > 10"``.

    Returns
    -------
    pandas.DataFrame
        One row per combination of ``cats`` with columns ``catcnt`` (rows in
        the combination), ``totcnt`` (rows in the parent group) and
        ``percent`` (``catcnt / totcnt``).
    """
    tots = cats[:-1]
    catcnt = df.groupby(cats).size().reset_index(name='catcnt')
    if tots:
        totcnt = df.groupby(tots).size().reset_index(name='totcnt')
        percs = pd.merge(catcnt, totcnt, on=tots, how="left")
    else:
        # With one grouping column there is no parent key to group by
        # (groupby([]) raises), so the denominator is the full row count.
        percs = catcnt.assign(totcnt=len(df))
    percs['percent'] = percs.catcnt / percs.totcnt
    if rowsel is not None:
        # NOTE(security): eval executes arbitrary code; only pass trusted,
        # hard-coded expressions here. The local name `percs` must be kept
        # because the evaluated string refers to it.
        percs = percs.loc[eval("percs." + rowsel)]
    return percs

getcnts(stroke_df, ['gender', 'work_type', 'Residence_type', 'smoking_status'])

In [None]:
getcnts(stroke_df, ['hypertension', 'heart_disease', 'ever_married', 'stroke'])

#### Gender distribution

The dataset is not evenly distributed by gender: most patients are female (58.6%). This could bias the analysis.

In [None]:
stroke_df['gender'].value_counts().plot(kind="pie", autopct="%.1f%%")

#### Replacing the value 'children' in the 'work_type' column. This value is not correctly classified.

In [None]:
# Relabel 'children' as 'Others'; every other work type maps to itself.
job_data = { 'children': 'Others', 'Private': 'Private', 'Self-employed': 'Self-employed', 'Govt_job':'Govt_job', 'Never_worked': 'Never_worked'}

work_type = stroke_df['work_type']
if isinstance(work_type.dtype, pd.CategoricalDtype):
    # For a categorical column, rename the category itself: Series.replace on
    # categorical data is deprecated in recent pandas. Keys of the mapping
    # that are not existing categories are ignored, so re-running is safe.
    stroke_df['work_type'] = work_type.cat.rename_categories(job_data)
else:
    stroke_df['work_type'] = work_type.replace(job_data)

### Bar plot of the patients' work types.

##### This graph shows that private-sector workers make up a larger share of the patients than any other work type. Note that it counts all patients, not only those who had a stroke.

In [None]:
stroke_df['work_type'].value_counts().plot(kind="barh")

#### Statistical analysis for continuous values

In [None]:
def getdistprops(seriestotest):
    """Describe the distribution of a numeric series.

    Runs a Shapiro-Wilk normality test on the non-missing values and collects
    the usual descriptive statistics.

    Parameters
    ----------
    seriestotest : pandas.Series
        Numeric series to describe. Missing values are ignored.

    Returns
    -------
    dict
        Always contains 'mean', 'median', 'std', 'kurtosis', 'skew' and
        'count'. 'normstat' and 'normpvalue' are added when the Shapiro-Wilk
        statistic could be computed; 'normpvalue' is the rounded p-value
        followed by the verdict at the 0.05 level.

    Notes
    -----
    The SciPy documentation warns that for more than 5000 observations the
    W statistic is accurate but the p-value may not be.
    """
    out = {}
    # Test only observed values: NaNs would otherwise turn the statistic into
    # NaN and suppress the whole result.
    observed = seriestotest.dropna()
    if len(observed) >= 3:
        normstat, normpvalue = scistat.shapiro(observed)
    else:
        # Shapiro-Wilk needs at least three observations.
        normstat = normpvalue = float('nan')
    if not math.isnan(normstat):
        out['normstat'] = normstat
        if normpvalue >= 0.05:
            out['normpvalue'] = str(round(normpvalue, 2)) + ": Accept Normal"
        elif normpvalue < 0.05:
            out['normpvalue'] = str(round(normpvalue, 2)) + ": Reject Normal"
    # Descriptive statistics do not depend on the normality test, so they are
    # reported (and the dict returned) even when the test is unavailable.
    out['mean'] = seriestotest.mean()
    out['median'] = seriestotest.median()
    out['std'] = seriestotest.std()
    out['kurtosis'] = seriestotest.kurtosis()
    out['skew'] = seriestotest.skew()
    out['count'] = seriestotest.count()
    return out

### The following histogram shows that the average age is 43 years and that the most common age is around 45 years.

In [None]:
def makeplot(seriestoplot, title, xlabel, plottype='hist'):
    """Draw and show a histogram or a box plot of a series.

    Parameters
    ----------
    seriestoplot : pandas.Series
        Values to plot.
    title : str
        Figure title.
    xlabel : str
        X-axis label for the histogram, or the tick label under the box.
    plottype : {'hist', 'box'}, default 'hist'
        'hist' draws a histogram with a dashed red line at the mean;
        'box' draws a box plot of the non-missing values.

    Raises
    ------
    ValueError
        If ``plottype`` is neither 'hist' nor 'box'.
    """
    # Fail before drawing anything instead of showing an empty titled figure.
    if plottype not in ('hist', 'box'):
        raise ValueError("plottype must be 'hist' or 'box', got {!r}".format(plottype))
    if plottype == 'hist':
        plt.hist(seriestoplot)
        plt.axvline(seriestoplot.mean(), color='red', linestyle='dashed', linewidth=1)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
    else:
        plt.boxplot(seriestoplot.dropna())
        # The single box is drawn at position 1. Labelling it through xticks
        # avoids boxplot's `labels` keyword, which Matplotlib 3.9 deprecated
        # in favour of `tick_labels`, and works on older versions as well.
        plt.xticks([1], [xlabel])
    plt.title(title)
    plt.show()

In [None]:
makeplot(stroke_df.age, "Patient's Ages", "Age")

In [None]:
dist = getdistprops(stroke_df.age)
pprint.pprint(dist)

### Glucose level
"A blood sugar level less than 140 mg/dL (7.8 mmol/L) is normal. A reading of more than 200 mg/dL (11.1 mmol/L) after two hours indicates diabetes. A reading between 140 and 199 mg/dL (7.8 mmol/L and 11.0 mmol/L) indicates prediabetes." [Mayo Clinic website, diabetes treatment](https://www.mayoclinic.org/diseases-conditions/diabetes/diagnosis-treatment/drc-20371451)

In [None]:
# Histogram of the average glucose level (title typo "Glocose" corrected).
makeplot(stroke_df.avg_glucose_level, "Mean Glucose Level", "Glucose Level")

#### The graph above shows that most patients do not have diabetes.

In [None]:
dist1 = getdistprops(stroke_df.avg_glucose_level)
pprint.pprint(dist1)

#### A BMI of 25.0 or more is overweight, while the healthy range is 18.5 to 24.9. BMI applies to most adults 18-65 years. [Body Mass Index](https://www.diabetes.ca/managing-my-diabetes/tools---resources/body-mass-index-(bmi)-calculator)

In [None]:
makeplot(stroke_df.bmi, "Body Mass Index", "Body Mass")

#### The histogram above shows that the average patient is overweight.

In [None]:
dist2 = getdistprops(stroke_df.bmi)
pprint.pprint(dist2)

#### Looking for outliers

In [None]:
sumvars = ['avg_glucose_level','bmi', 'age']
othervars = ['gender', 'ever_married', 'hypertension' , 'heart_disease', 'work_type', 'Residence_type', 'smoking_status', 'stroke']

In [None]:
def getoutliers(dfin, sumvars, othervars, iqrmult=1.5):
    """Return the rows that are IQR outliers for any of the given variables.

    For each column in ``sumvars`` a row is an outlier when its value lies
    above ``Q3 + iqrmult * IQR`` or below ``Q1 - iqrmult * IQR``. A row that
    is an outlier for several variables appears once per variable.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Source frame.
    sumvars : list of column labels
        Numeric columns to test for outliers.
    othervars : list of column labels
        Extra columns carried into the result for context.
    iqrmult : float, default 1.5
        Multiple of the interquartile range used for the fences.

    Returns
    -------
    pandas.DataFrame
        The columns ``sumvars + othervars`` of the outlying rows (original
        index kept) plus ``varname`` (the variable tested), ``threshlow`` and
        ``threshhigh`` (the fences used). Empty, with the same columns, when
        ``sumvars`` is empty.
    """
    dfin = dfin[sumvars + othervars]
    flagged = []
    for col in sumvars:
        firstq, thirdq = dfin[col].quantile(0.25), dfin[col].quantile(0.75)
        fence = iqrmult * (thirdq - firstq)
        outlierlow, outlierhigh = firstq - fence, thirdq + fence
        isoutlier = (dfin[col] > outlierhigh) | (dfin[col] < outlierlow)
        flagged.append(
            dfin.loc[isoutlier].assign(
                varname=col, threshlow=outlierlow, threshhigh=outlierhigh
            )
        )
    if not flagged:
        # pd.concat refuses an empty list; return an empty frame that still
        # has the columns callers select on.
        return pd.DataFrame(columns=[*dfin.columns, 'varname', 'threshlow', 'threshhigh'])
    # One concat at the end keeps the source dtypes and avoids concatenating
    # onto an empty frame, whose effect on result dtypes is deprecated.
    return pd.concat(flagged)

# NOTE: makeplot is also defined in an earlier cell; this definition replaces
# it from here on and is kept equivalent.
def makeplot(seriestoplot, title, xlabel, plottype='hist'):
    """Draw and show a histogram or a box plot of a series.

    Parameters
    ----------
    seriestoplot : pandas.Series
        Values to plot.
    title : str
        Figure title.
    xlabel : str
        X-axis label for the histogram, or the tick label under the box.
    plottype : {'hist', 'box'}, default 'hist'
        'hist' draws a histogram with a dashed red line at the mean;
        'box' draws a box plot of the non-missing values.

    Raises
    ------
    ValueError
        If ``plottype`` is neither 'hist' nor 'box'.
    """
    # Fail before drawing anything instead of showing an empty titled figure.
    if plottype not in ('hist', 'box'):
        raise ValueError("plottype must be 'hist' or 'box', got {!r}".format(plottype))
    if plottype == 'hist':
        plt.hist(seriestoplot)
        plt.axvline(seriestoplot.mean(), color='red', linestyle='dashed', linewidth=1)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
    else:
        plt.boxplot(seriestoplot.dropna())
        # The single box is drawn at position 1. Labelling it through xticks
        # avoids boxplot's `labels` keyword, which Matplotlib 3.9 deprecated
        # in favour of `tick_labels`, and works on older versions as well.
        plt.xticks([1], [xlabel])
    plt.title(title)
    plt.show()

##### Outliers:

#### Box plots showing the statistical summary and the outliers.

In [None]:
makeplot(stroke_df.bmi, "Body Mass Index", "Body Mass", 'box')

In [None]:
outliers = getoutliers(stroke_df, sumvars, othervars)
outliers.varname.value_counts(sort=False)

In [None]:
outliers.loc[outliers.varname=='avg_glucose_level', othervars + sumvars]

In [None]:
# Box plot of the average glucose level (title typo "Glocose" corrected).
makeplot(stroke_df.avg_glucose_level, "Mean Glucose Level", "Glucose Level", 'box')

In [None]:
outliers.loc[outliers.varname=='bmi', othervars + sumvars]

### Building Your First Model: k-Nearest Neighbors

#### Using this model with the variables age, hypertension, heart disease, glucose level and body mass index, we can predict stroke with 92% accuracy.

In [None]:
features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(stroke_data[features], stroke_data['stroke'], random_state=0)

In [None]:
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))

In [None]:
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)
print("Test set predictions:\n {}".format(y_pred))


In [None]:
print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))