Change column contents from integers to strings, upload csv to SQL


In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# read csv file
df = pd.read_csv("./Datasets/train_strokes.csv")


In [3]:
# drop id column
df.drop(['id'], axis=1, inplace=True)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [4]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21644
Residence_type,Urban,21756
ever_married,No,15462
ever_married,Yes,27938
gender,Other,11
gender,Male,17724
gender,Female,25665
smoking_status,smokes,6562
smoking_status,formerly smoked,7493
smoking_status,never smoked,16053


In [5]:
# drop 'other' gender - we do not know the premise of that selection
df = df[df.gender != 'Other']
print(f"{df['gender'].value_counts()}")

Female    25665
Male      17724
Name: gender, dtype: int64


In [6]:
# merge 'children' and 'Never_worked' categories
df = df.replace({'work_type': {'Never_worked':'Never_worked', 'children': 'Never_worked'}})
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18.0,,0
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,0,0,No,Never_worked,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [7]:
# change hypertension answer from integers to string
df['hypertension'] = df['hypertension'].replace({0: 'No', 1: 'Yes'})
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,0,No,Never_worked,Rural,95.12,18.0,,0
1,Male,58.0,Yes,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,No,0,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,No,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,No,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,No,0,No,Never_worked,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,No,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,Yes,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,No,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [8]:
# change heart_disease answer from integers to string
df['heart_disease'] = df['heart_disease'].replace({0: 'No', 1: 'Yes'})
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,No,No,Never_worked,Rural,95.12,18.0,,0
1,Male,58.0,Yes,No,Yes,Private,Urban,87.96,39.2,never smoked,0
2,Female,8.0,No,No,No,Private,Urban,110.89,17.6,,0
3,Female,70.0,No,No,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,Male,14.0,No,No,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,No,No,No,Never_worked,Urban,58.64,20.4,never smoked,0
43396,Female,56.0,No,No,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,Female,82.0,Yes,No,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,Male,40.0,No,No,Yes,Private,Urban,99.16,33.2,never smoked,0


In [9]:
# change stroke answer from integers to string
df['stroke'] = df['stroke'].replace({0: 'No', 1: 'Yes'})
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,No,No,Never_worked,Rural,95.12,18.0,,No
1,Male,58.0,Yes,No,Yes,Private,Urban,87.96,39.2,never smoked,No
2,Female,8.0,No,No,No,Private,Urban,110.89,17.6,,No
3,Female,70.0,No,No,Yes,Private,Rural,69.04,35.9,formerly smoked,No
4,Male,14.0,No,No,No,Never_worked,Rural,161.28,19.1,,No
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,No,No,No,Never_worked,Urban,58.64,20.4,never smoked,No
43396,Female,56.0,No,No,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,No
43397,Female,82.0,Yes,No,Yes,Private,Urban,91.94,28.9,formerly smoked,No
43398,Male,40.0,No,No,Yes,Private,Urban,99.16,33.2,never smoked,No


In [10]:
df = df.fillna(0)

df[(df['bmi'] == 0) & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [11]:
#https://stackoverflow.com/questions/39690742/convert-float-to-int-and-leave-nulls
#df['b'] = df['b'].astype('Int64')

df['bmi'] = df['bmi'].values.astype(int)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,No,No,Never_worked,Rural,95.12,18,0,No
1,Male,58.0,Yes,No,Yes,Private,Urban,87.96,39,never smoked,No
2,Female,8.0,No,No,No,Private,Urban,110.89,17,0,No
3,Female,70.0,No,No,Yes,Private,Rural,69.04,35,formerly smoked,No
4,Male,14.0,No,No,No,Never_worked,Rural,161.28,19,0,No
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,No,No,No,Never_worked,Urban,58.64,20,never smoked,No
43396,Female,56.0,No,No,Yes,Govt_job,Urban,213.61,55,formerly smoked,No
43397,Female,82.0,Yes,No,Yes,Private,Urban,91.94,28,formerly smoked,No
43398,Male,40.0,No,No,Yes,Private,Urban,99.16,33,never smoked,No


In [12]:
# check to see how many 0's are in the 'bmi' column
df[(df['bmi'] == 0) & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [13]:
# Replace 'bmi' 0 values with mean of bmi
mean_bmi = np.round(df['bmi'].mean())
print(mean_bmi)

27.0


In [14]:
# Replace 'bmi' 0 values with mean of bmi
df['bmi'] = df['bmi'].replace(0, mean_bmi)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,No,No,Never_worked,Rural,95.12,18,0,No
1,Male,58.0,Yes,No,Yes,Private,Urban,87.96,39,never smoked,No
2,Female,8.0,No,No,No,Private,Urban,110.89,17,0,No
3,Female,70.0,No,No,Yes,Private,Rural,69.04,35,formerly smoked,No
4,Male,14.0,No,No,No,Never_worked,Rural,161.28,19,0,No
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,No,No,No,Never_worked,Urban,58.64,20,never smoked,No
43396,Female,56.0,No,No,Yes,Govt_job,Urban,213.61,55,formerly smoked,No
43397,Female,82.0,Yes,No,Yes,Private,Urban,91.94,28,formerly smoked,No
43398,Male,40.0,No,No,Yes,Private,Urban,99.16,33,never smoked,No


In [15]:
# All NaN werer replaced with 0, so we need to replace 0 in 'smoking_status' column to 'unknown'
df['smoking_status'] = df['smoking_status'].replace(to_replace=0, value="unknown") 

In [16]:
df[(df['smoking_status'] == 'unknown') & (df['stroke'] == 1)]

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [17]:
df.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,43389.0,43389.0,43389.0
mean,42.219747,104.48676,28.116804
std,22.52075,43.115157,7.640829
min,0.08,55.0,10.0
25%,24.0,77.54,23.0
50%,44.0,91.58,27.0
75%,60.0,112.07,32.0
max,82.0,291.05,97.0


In [18]:
# bin avg_glucose_level
avg_glucose_lvl = df['avg_glucose_level']
glucose_lvl_bins = [0, 69, 100, 126, 292]
glucose_lvl_labels = ['<70','70-100','101-125','>126']

df['avg_glucose_level'] = pd.cut(avg_glucose_lvl, bins=glucose_lvl_bins, labels=glucose_lvl_labels)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,No,No,No,Never_worked,Rural,70-100,18,unknown,No
1,Male,58.0,Yes,No,Yes,Private,Urban,70-100,39,never smoked,No
2,Female,8.0,No,No,No,Private,Urban,101-125,17,unknown,No
3,Female,70.0,No,No,Yes,Private,Rural,70-100,35,formerly smoked,No
4,Male,14.0,No,No,No,Never_worked,Rural,>126,19,unknown,No
...,...,...,...,...,...,...,...,...,...,...,...
43395,Female,10.0,No,No,No,Never_worked,Urban,<70,20,never smoked,No
43396,Female,56.0,No,No,Yes,Govt_job,Urban,>126,55,formerly smoked,No
43397,Female,82.0,Yes,No,Yes,Private,Urban,70-100,28,formerly smoked,No
43398,Male,40.0,No,No,Yes,Private,Urban,70-100,33,never smoked,No


In [19]:
# check null values
for i in df.columns:
    x = df[i].isna().value_counts()
    print("Column name is:",i,"and the amount of null values is:",x)

Column name is: gender and the amount of null values is: False    43389
Name: gender, dtype: int64
Column name is: age and the amount of null values is: False    43389
Name: age, dtype: int64
Column name is: hypertension and the amount of null values is: False    43389
Name: hypertension, dtype: int64
Column name is: heart_disease and the amount of null values is: False    43389
Name: heart_disease, dtype: int64
Column name is: ever_married and the amount of null values is: False    43389
Name: ever_married, dtype: int64
Column name is: work_type and the amount of null values is: False    43389
Name: work_type, dtype: int64
Column name is: Residence_type and the amount of null values is: False    43389
Name: Residence_type, dtype: int64
Column name is: avg_glucose_level and the amount of null values is: False    43389
Name: avg_glucose_level, dtype: int64
Column name is: bmi and the amount of null values is: False    43389
Name: bmi, dtype: int64
Column name is: smoking_status and the 

In [20]:
#https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
cat_cols = df.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(
    df[cat_cols]
    .melt(var_name='column', value_name='value')
    .value_counts())
.rename(columns={0: 'counts'})
.sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
Residence_type,Rural,21638
Residence_type,Urban,21751
ever_married,No,15456
ever_married,Yes,27933
gender,Male,17724
gender,Female,25665
heart_disease,Yes,2062
heart_disease,No,41327
hypertension,Yes,4061
hypertension,No,39328


In [22]:
# save csv

#df.to_csv('Datasets/train_stroke_data_labels.csv', index=True)

Transfer Data to SQL

In [25]:
stroke_data = pd.read_csv("Datasets/train_stroke_data_labels.csv")

In [26]:
from sqlalchemy import create_engine
from config import db_password
import time


In [27]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/strokerisk"

In [28]:
engine = create_engine(db_string)


In [29]:
stroke_data.to_sql(name='stroke_data_labels', con=engine)
