In [1]:
import csv
import sqlite3
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Parsing Data, Database Creation, Data Processing

In [2]:
def create_table(conn, create_table_sql, drop_table_name=None):
    """
    Create a table on an open SQLite connection, optionally dropping it first.

    Parameters:
    conn <sqlite3.Connection>: open database connection
    create_table_sql <str>: complete CREATE TABLE statement to execute
    drop_table_name <str>: if given, this table is dropped (when it exists) before the CREATE runs

    Returns:
    None. Database errors are printed rather than raised, so a failed DROP
    does not stop the CREATE from being attempted.
    """

    if drop_table_name: # You can optionally pass drop_table_name to drop the table.
        try:
            c = conn.cursor()
            # A table name cannot be bound as a "?" parameter, so it is
            # interpolated into the statement; only pass trusted names here.
            c.execute("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
        except sqlite3.Error as e:  # was the undefined name `Error`
            print(e)

    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except sqlite3.Error as e:  # was the undefined name `Error`
        print(e)

Tables to make:

```
[Patients]
    [PatientID] INTEGER NOT NULL PRIMARY KEY
    [Age] REAL NOT NULL
    [Gender] TEXT NOT NULL
    [Married] TEXT NOT NULL
    [WorkType] TEXT NOT NULL
    [ResidenceType] TEXT NOT NULL

[HealthDetails]
    [PatientID] INTEGER NOT NULL PRIMARY KEY FOREIGN KEY TO Patients(PatientID)
    [HeartDisease] INTEGER NOT NULL
    [HyperTension] INTEGER NOT NULL
    [AvgGlucoseLevel] REAL NOT NULL
    [BMI] REAL
    [Smoker] TEXT NOT NULL

[Strokes]
    [PatientID] INTEGER NOT NULL PRIMARY KEY FOREIGN KEY TO Patients(PatientID)
    [Stroke] INTEGER NOT NULL
```

In [3]:
def create_strokes_db(datafile, db_name, delete_db = False):
    """
    Build the Patients, HealthDetails and Strokes tables in a SQLite database
    from a comma-separated data file.

    The first row of the file is treated as a header and skipped. Each
    remaining row is split across the three tables, keyed by the integer in
    its first column. The three tables are dropped and recreated on every
    call, so re-running against an existing database replaces their contents.

    Parameters:
    datafile <str>: path of the CSV file to load
    db_name <str>: path of the SQLite database file to create or update
    delete_db <bool>: if True, delete the existing database file first

    Raises:
    FileNotFoundError: if delete_db is True and db_name does not exist
    """

    # Deletes existing database file, if desired
    if delete_db:
        import os
        if os.path.isfile(db_name):
            os.remove(db_name)
        else:
            raise FileNotFoundError(f"{db_name} does not exist.")

    # Read and parse data before touching the database, so an unreadable
    # input file does not leave behind an empty database or an open connection.
    # newline="" is what the csv module expects for files handed to csv.reader.
    with open(datafile, newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header row
        data = [row for row in csv_reader if row]  # ignore blank lines

    # Data to insert; tuple order matches the column lists of the INSERT
    # statements below. Apart from the id and the stroke flag, values are
    # passed through as the strings read from the file.
    patient_list = [(int(i[0]), i[2], i[1], i[5], i[6], i[7]) for i in data]
    health_dets = [(int(i[0]), i[4], i[3], i[-4], i[-3], i[-2]) for i in data]
    strokes = [(int(i[0]), int(i[-1])) for i in data]

    # Create tables
    create_patients_table_sql = """
    CREATE TABLE IF NOT EXISTS [Patients](
        [PatientID] INTEGER NOT NULL PRIMARY KEY,
        [Age] REAL,
        [Gender] TEXT,
        [Married] TEXT,
        [WorkType] TEXT,
        [ResidenceType] TEXT);"""

    create_healthdetails_table_sql = """
    CREATE TABLE IF NOT EXISTS [HealthDetails](
        [PatientID] INTEGER NOT NULL PRIMARY KEY,
        [HeartDisease] INTEGER,
        [HyperTension] INTEGER,
        [AvgGlucoseLevel] REAL,
        [BMI] REAL,
        [Smoker] TEXT,
        FOREIGN KEY(PatientID) REFERENCES Patients(PatientID));
    """

    create_strokes_table_sql = """
    CREATE TABLE IF NOT EXISTS [Strokes](
        [PatientID] INTEGER NOT NULL PRIMARY KEY,
        [Stroke] INTEGER NOT NULL,
        FOREIGN KEY(PatientID) References Patients(PatientID));
    """

    # Insert statements
    insert_patients = """
    INSERT INTO Patients(
        PatientID,
        Age,
        Gender,
        Married,
        WorkType,
        ResidenceType) VALUES (?, ?, ?, ?, ?, ?)"""

    insert_healthdetails = """
    INSERT INTO HealthDetails(
        PatientID,
        HeartDisease,
        HyperTension,
        AvgGlucoseLevel,
        BMI,
        Smoker) VALUES (?, ?, ?, ?, ?, ?)"""

    insert_strokes = "INSERT INTO Strokes(PatientID, Stroke) VALUES (?, ?)"

    # Create connection to database
    conn = sqlite3.connect(db_name)
    try:
        # Create tables and insert values; `with conn` commits on success
        # and rolls back if an insert raises.
        with conn:
            cur = conn.cursor()
            create_table(conn, create_patients_table_sql, drop_table_name = "Patients")
            create_table(conn, create_healthdetails_table_sql, drop_table_name = "HealthDetails")
            create_table(conn, create_strokes_table_sql, drop_table_name = "Strokes")

            cur.executemany(insert_patients, patient_list)
            cur.executemany(insert_healthdetails, health_dets)
            cur.executemany(insert_strokes, strokes)
    finally:
        # Always release the connection, even when an insert fails
        conn.close()

In [12]:
# Build stroke.db from the raw CSV. delete_db=False leaves any existing file
# in place; the three tables are dropped and recreated inside the function.
create_strokes_db("stroke_data.csv", "stroke.db", delete_db = False)

In [4]:
# Open a notebook-level connection for the queries below
# (create_strokes_db closes the connection it opens internally).
conn = sqlite3.connect("stroke.db")

In [15]:
# Check correctly inserted
# Load the whole HealthDetails table for a visual spot check of the inserts
df = pd.read_sql_query("SELECT * FROM HealthDetails", conn)
df

Unnamed: 0,PatientID,HeartDisease,HyperTension,AvgGlucoseLevel,BMI,Smoker
0,67,0,0,92.97,,formerly smoked
1,77,0,0,85.81,18.6,Unknown
2,84,0,0,89.17,31.5,never smoked
3,91,0,0,98.53,18.5,never smoked
4,99,0,0,108.89,52.3,Unknown
...,...,...,...,...,...,...
5105,72911,0,1,129.54,60.9,smokes
5106,72914,0,0,90.57,24.2,Unknown
5107,72915,0,0,172.33,45.3,formerly smoked
5108,72918,0,1,62.55,30.3,Unknown


In [5]:
# Join data into one dataframe
# Patients is inner-joined to HealthDetails and Strokes on PatientID, so only
# patients present in all three tables are returned.
# Married and WorkType are not selected.
join_statement = """
SELECT
    p.PatientID, Age, Gender,
    ResidenceType, HeartDisease, HyperTension,
    AvgGlucoseLevel, BMI, Smoker,
    Stroke
FROM Patients AS p
INNER JOIN HealthDetails AS hd
ON p.PatientID = hd.PatientID
INNER JOIN Strokes AS s
ON s.PatientID = p.PatientID
"""

# Run the join on the open connection and load the result as a DataFrame
data = pd.read_sql_query(join_statement, conn)

In [6]:
# Preview the first rows of the joined frame
data.head()

Unnamed: 0,PatientID,Age,Gender,ResidenceType,HeartDisease,HyperTension,AvgGlucoseLevel,BMI,Smoker,Stroke
0,67,17.0,Female,Urban,0,0,92.97,,formerly smoked,0
1,77,13.0,Female,Rural,0,0,85.81,18.6,Unknown,0
2,84,55.0,Male,Urban,0,0,89.17,31.5,never smoked,0
3,91,42.0,Female,Urban,0,0,98.53,18.5,never smoked,0
4,99,31.0,Female,Urban,0,0,108.89,52.3,Unknown,0


In [7]:
import numpy as np
# Check for missing values
# Coerce BMI to a float column; non-numeric placeholders such as "N/A" become NaN
data["BMI"] = pd.to_numeric(data["BMI"], errors="coerce")

# Gender-specific averages (NaN values are ignored by mean).
# Each average is looked up by its label, not by position, so the assignment
# stays correct if the set of genders in the data changes; a gender with no
# rows yields NaN instead of breaking the unpacking.
bmi_by_gender = data.groupby("Gender")["BMI"].mean()
bmi_female, bmi_male, bmi_other = (
    float(bmi_by_gender.get(label, float("nan")))
    for label in ("Female", "Male", "Other")
)
bmi_female, bmi_male, bmi_other

(29.065757680358992, 28.64793635007459, 22.4)

In [8]:
# List the distinct category labels present in Gender and Smoker
data["Gender"].unique()
data["Smoker"].unique()

array(['Female', 'Male', 'Other'], dtype=object)

array(['formerly smoked', 'Unknown', 'never smoked', 'smokes'],
      dtype=object)

In [9]:
# Preview the frame before missing BMI values are filled in
data.head()

Unnamed: 0,PatientID,Age,Gender,ResidenceType,HeartDisease,HyperTension,AvgGlucoseLevel,BMI,Smoker,Stroke
0,67,17.0,Female,Urban,0,0,92.97,,formerly smoked,0
1,77,13.0,Female,Rural,0,0,85.81,18.6,Unknown,0
2,84,55.0,Male,Urban,0,0,89.17,31.5,never smoked,0
3,91,42.0,Female,Urban,0,0,98.53,18.5,never smoked,0
4,99,31.0,Female,Urban,0,0,108.89,52.3,Unknown,0


In [10]:
# Replace BMI missing values with gender-specific BMI averages.
# Each row's gender is mapped to its average; genders other than Female/Male
# map to NaN, so their missing BMI values are left untouched.
gender_average_bmi = data["Gender"].map({"Female": bmi_female, "Male": bmi_male})
data["BMI"] = data["BMI"].fillna(gender_average_bmi)

In [11]:
# Count remaining missing values per column
data.isna().sum()

PatientID          0
Age                0
Gender             0
ResidenceType      0
HeartDisease       0
HyperTension       0
AvgGlucoseLevel    0
BMI                0
Smoker             0
Stroke             0
dtype: int64

In [12]:
# Show the rows whose Gender is "Other"
data.loc[data["Gender"] == "Other"]

Unnamed: 0,PatientID,Age,Gender,ResidenceType,HeartDisease,HyperTension,AvgGlucoseLevel,BMI,Smoker,Stroke
3926,56156,26.0,Other,Rural,0,0,143.33,22.4,formerly smoked,0


In [13]:
# Drop the "Other"-gender record(s) from the dataset and reset indices.
# Filtering by value instead of a hard-coded row label keeps this cell
# idempotent: after the index is reset, re-running a drop by label would
# silently remove a different patient.
data = data.loc[data["Gender"] != "Other"].reset_index(drop=True)

In [14]:
# Preview the cleaned frame
data.head()

Unnamed: 0,PatientID,Age,Gender,ResidenceType,HeartDisease,HyperTension,AvgGlucoseLevel,BMI,Smoker,Stroke
0,67,17.0,Female,Urban,0,0,92.97,29.065758,formerly smoked,0
1,77,13.0,Female,Rural,0,0,85.81,18.6,Unknown,0
2,84,55.0,Male,Urban,0,0,89.17,31.5,never smoked,0
3,91,42.0,Female,Urban,0,0,98.53,18.5,never smoked,0
4,99,31.0,Female,Urban,0,0,108.89,52.3,Unknown,0


In [215]:
# Save data to csv file; uncomment if needed
#data.to_csv("processed_data.csv", index = False)

In [15]:
# Read data from saved csv, uncomment if needed
#data = pd.read_csv("processed_data.csv")

## Basic Data Analysis Section

In [105]:
import plotly.express as px
#import plotly.graph_objects as go
#from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
# Total number of records in the dataset
len(data)
# Number of records with a stroke
len(data[data["Stroke"] == 1])

# Number of records without a stroke: total minus stroke cases
len(data) - len(data[data["Stroke"] == 1])

5109

249

4860

In [57]:
# Summary statistics for Age, AvgGlucoseLevel and BMI over the whole dataset
numeric_columns = ["Age", "AvgGlucoseLevel", "BMI"]
data[numeric_columns].describe()

Unnamed: 0,Age,AvgGlucoseLevel,BMI
count,5109.0,5109.0,5109.0
mean,43.229986,106.140399,28.89279
std,22.613575,45.285004,7.698351
min,0.08,55.12,10.3
25%,25.0,77.24,23.8
50%,45.0,91.88,28.4
75%,61.0,114.09,32.8
max,82.0,271.74,97.6


In [65]:
# Summary statistics for Age, AvgGlucoseLevel and BMI, one column per gender
(
    data
    .groupby("Gender")[["Age", "AvgGlucoseLevel", "BMI"]]
    .describe()
    .T
)

Unnamed: 0,Gender,Female,Male
Age,count,2994.0,2115.0
Age,mean,43.757395,42.483385
Age,std,21.966561,23.484066
Age,min,0.08,0.08
Age,25%,27.0,22.0
Age,50%,44.0,46.0
Age,75%,61.0,61.0
Age,max,82.0,82.0
AvgGlucoseLevel,count,2994.0,2115.0
AvgGlucoseLevel,mean,104.057809,109.08852


In [72]:
# BMI distribution, one colour per gender
gender_bmi_fig = px.histogram(
    data,
    x="BMI",
    color="Gender",
    nbins=100,
    title="Histogram of BMI by Gender",
)
gender_bmi_fig.show()

In [78]:
# BMI distribution, one colour per HyperTension value
ht_bmi_fig = px.histogram(
    data,
    x="BMI",
    color="HyperTension",
    nbins=100,
    title="Histogram of BMI by HyperTension",
)
ht_bmi_fig.show()

In [82]:
# BMI distribution, one colour per HeartDisease value
hd_bmi_fig = px.histogram(
    data,
    x="BMI",
    color="HeartDisease",
    nbins=100,
    title="Histogram of BMI by HeartDisease",
)
hd_bmi_fig.show()

In [104]:
# Visualize strokes data: share of the Stroke column by Gender and by HyperTension
fig1 = px.pie(
    data,
    values='Stroke',
    names='Gender',
    title='Strokes by Gender',
)
fig2 = px.pie(
    data,
    values='Stroke',
    names='HyperTension',
    title='Strokes by HyperTension',
)
fig1.show()
fig2.show()

In [None]:
def aggregate_summary(feature, df=None):
    """
    Calculates the mean Age, mean AvgGlucoseLevel, mean BMI, and total number of Strokes,
    grouped by << feature >> (Gender, HyperTension, HeartDisease, ResidenceType, Smoker).

    Parameters:
    feature <str>: category of interest in dataframe
    df <dataframe>: frame to summarize; defaults to the notebook-level `data`

    Returns:
    agg_summary <dataframe>: one row per value of feature, with columns
        feature, Age (mean), AvgGlucoseLevel (mean), BMI (mean), Stroke (sum)
    """
    if df is None:
        df = data  # fall back to the notebook's working frame

    # One grouped pass; the dict order fixes the output column order
    agg_summary = (
        df.groupby(feature)
        .agg({
            "Age": "mean",
            "AvgGlucoseLevel": "mean",
            "BMI": "mean",
            "Stroke": "sum",
        })
        .reset_index()
    )

    return agg_summary

In [173]:
# Build one summary table per categorical feature, then display each in turn
agg_gender, agg_ht, agg_hd, agg_rt, agg_smoker = (
    aggregate_summary(feature)
    for feature in ("Gender", "HyperTension", "HeartDisease", "ResidenceType", "Smoker")
)

agg_gender
agg_ht
agg_hd
agg_rt
agg_smoker

Unnamed: 0,Gender,Age,AvgGlucoseLevel,BMI,Stroke
0,Female,43.757395,104.057809,29.065758,141
1,Male,42.483385,109.08852,28.647936,108


Unnamed: 0,HyperTension,Age,AvgGlucoseLevel,BMI,Stroke
0,0,41.176318,103.543088,28.487717,183
1,1,62.24498,130.188996,32.643371,66


Unnamed: 0,HeartDisease,Age,AvgGlucoseLevel,BMI,Stroke
0,0,41.804676,104.388438,28.821757,202
1,1,68.188406,136.818768,30.136643,47


Unnamed: 0,ResidenceType,Age,AvgGlucoseLevel,BMI,Stroke
0,Rural,42.907537,106.360529,28.894636,114
1,Urban,43.542126,105.927307,28.891003,135


Unnamed: 0,Smoker,Age,AvgGlucoseLevel,BMI,Stroke
0,Unknown,30.229922,99.601541,25.790913,47
1,formerly smoked,54.96267,112.85164,30.651628,70
2,never smoked,46.744715,107.558092,29.960085,90
3,smokes,47.096324,108.01744,30.43292,42


In [171]:
# Mean of each numeric column, split by stroke outcome
by_stroke = data.groupby("Stroke", as_index=False)
agl_stroke = by_stroke["AvgGlucoseLevel"].mean()
bmi_stroke = by_stroke["BMI"].mean()
age_stroke = by_stroke["Age"].mean()

agl_stroke
bmi_stroke
age_stroke

Unnamed: 0,Stroke,AvgGlucoseLevel
0,0,104.787584
1,1,132.544739


Unnamed: 0,Stroke,BMI
0,0,28.825118
1,1,30.213621


Unnamed: 0,Stroke,Age
0,0,41.974831
1,1,67.728193


## Applying Machine Learning Techniques