Change column contents from integers to strings, then upload the CSV to SQL


In [1]:
# import dependencies
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Load the raw stroke dataset from the local Datasets folder
raw_csv_path = "./Datasets/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(raw_csv_path)


In [4]:
# Discard every row that contains at least one missing value
df = df.dropna(axis='index', how='any')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
# Collapse the 'children' and 'Never_worked' work types into a single 'no_work' category
no_work_aliases = dict.fromkeys(['Never_worked', 'children'], 'no_work')
df = df.replace({'work_type': no_work_aliases})
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,no_work,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [6]:
# Recode the hypertension flag as text: 0 -> 'No', 1 -> 'Yes'
df = df.replace({'hypertension': {0: 'No', 1: 'Yes'}})
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,No,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,No,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,No,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,Yes,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,No,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,No,0,No,no_work,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,No,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,No,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,No,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [7]:
# Recode the heart_disease flag as text: 0 -> 'No', 1 -> 'Yes'
df = df.replace({'heart_disease': {0: 'No', 1: 'Yes'}})
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,No,No,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,No,No,No,no_work,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,No,No,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,No,No,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,No,No,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [8]:
# Recode the stroke outcome as text: 0 -> 'No', 1 -> 'Yes'
df = df.replace({'stroke': {0: 'No', 1: 'Yes'}})
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
2,31112,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,60182,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,1665,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes
5,56669,Male,81.0,No,No,Yes,Private,Urban,186.21,29.0,formerly smoked,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,No,No,No,no_work,Rural,103.08,18.6,Unknown,No
5106,44873,Female,81.0,No,No,Yes,Self-employed,Urban,125.20,40.0,never smoked,No
5107,19723,Female,35.0,No,No,Yes,Self-employed,Rural,82.99,30.6,never smoked,No
5108,37544,Male,51.0,No,No,Yes,Private,Rural,166.29,25.6,formerly smoked,No


In [9]:
# drop id column
# Reassign instead of using inplace=True, and ignore a missing column, so this
# cell can be re-run without raising KeyError once 'id' is already gone.
df = df.drop(columns=['id'], errors='ignore')
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,No,Yes,Yes,Private,Urban,228.69,36.6,formerly smoked,Yes
2,Male,80.0,No,Yes,Yes,Private,Rural,105.92,32.5,never smoked,Yes
3,Female,49.0,No,No,Yes,Private,Urban,171.23,34.4,smokes,Yes
4,Female,79.0,Yes,No,Yes,Self-employed,Rural,174.12,24.0,never smoked,Yes
5,Male,81.0,No,No,Yes,Private,Urban,186.21,29.0,formerly smoked,Yes
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,No,No,No,no_work,Rural,103.08,18.6,Unknown,No
5106,Female,81.0,No,No,Yes,Self-employed,Urban,125.20,40.0,never smoked,No
5107,Female,35.0,No,No,Yes,Self-employed,Rural,82.99,30.6,never smoked,No
5108,Male,51.0,No,No,Yes,Private,Rural,166.29,25.6,formerly smoked,No


In [11]:
# Persist the relabelled frame to CSV; the row index is written along with the data
labels_csv_path = 'stroke_data_labels.csv'
df.to_csv(labels_csv_path, index=True)

Transfer Data to SQL

In [14]:
# Reload the labelled CSV saved earlier in this notebook, ready for the SQL transfer
labels_csv = "stroke_data_labels.csv"
stroke_data = pd.read_csv(labels_csv)

In [15]:
from sqlalchemy import create_engine
from config import db_password
import time


In [16]:
# Connection URL for the local Postgres database 'finalproject'.
# The password is URL-encoded so characters such as '@', ':', '/' or '#'
# cannot be misread as URL delimiters when the connection string is parsed.
db_string = f"postgresql://postgres:{quote_plus(str(db_password))}@127.0.0.1:5432/finalproject"

In [17]:
# Build the SQLAlchemy engine from the Postgres connection URL held in db_string
engine = create_engine(db_string)


In [20]:
# Write the labelled frame to the 'stroke_data1' table in one call.
# if_exists='replace' rebuilds the table, so re-running this cell no longer
# fails with "table already exists" (the default is if_exists='fail').
# NOTE: the chunked-import cell further down loads the same CSV into this same
# table, so only one of the two loads is needed.
stroke_data.to_sql(name='stroke_data1', con=engine, if_exists='replace')


NameError: name 'stroke_data1' is not defined

In [21]:
# Stream the labelled CSV into the 'stroke_data1' table chunk by chunk,
# reporting the row range and cumulative elapsed time after each chunk.
rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
# index_col=0 restores the row labels that were saved with the CSV, instead of
# loading them as an extra "Unnamed: 0" data column next to the SQL index column.
csv_chunks = pd.read_csv('stroke_data_labels.csv', index_col=0, chunksize=1000000)
for chunk_number, data in enumerate(csv_chunks):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    # Rebuild the table on the first chunk and append afterwards, so re-running
    # this cell (or running it after the single-shot to_sql cell above) does not
    # load every row twice.
    write_mode = 'replace' if chunk_number == 0 else 'append'
    data.to_sql(name='stroke_data1', con=engine, if_exists=write_mode)
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

importing rows 0 to 4909...Done. 0.34018445014953613 total seconds elapsed
