# 1. Exploratory Data Analysis

---

<span style = "font-family: Arial; font-weight:bold;font-size:2em;color:black;">Importing Libraries

In [1]:
import pyodbc
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

<span style = "font-family: Arial; font-weight:bold;font-size:2em;color:black;">Database connection

In [2]:
# Define your MSSQL server connection parameters
Server   = "SeanSD\SQLEXPRESS"
Database = "health"
UID      = ""
PWD      = ""

connection_str = ("Driver={SQL Server Native Client 11.0};"
                  "Server=" + Server + ";"
                  "Database=" + Database + ";"
                  "Trusted_Connection=yes;")

try:
    connection = pyodbc.connect(connection_str)
    print ("Connected to SQL database Server:", Server, "\nDatabase: " + Database)
except pyodbc.Error as ex:
    sqlstate = ex.args[1]
    print ("Unable to connect: ", sqlstate)

Connected to SQL database Server: SeanSD\SQLEXPRESS 
Database: health


---

## A look at the dataset

Let's take a look at the first 10 rows from the `user_logs` table.

In [3]:
query_str = """
SELECT TOP (10) *
FROM user_logs;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,id,log_date,measure,measure_value,systolic,diastolic
0,fa28f948a740320ad56b81a24744c8b81df119fa,2020-11-15,weight,46.03959,,
1,1a7366eef15512d8f38133e7ce9778bce5b4a21e,2020-10-10,blood_glucose,97.0,0.0,0.0
2,bd7eece38fb4ec71b3282d60080d296c4cf6ad5e,2020-10-18,blood_glucose,120.0,0.0,0.0
3,0f7b13f3f0512e6546b8d2c0d56e564a2408536a,2020-10-17,blood_glucose,232.0,0.0,0.0
4,d14df0c8c1a5f172476b2a1b1f53cf23c6992027,2020-10-15,blood_pressure,140.0,140.0,113.0
5,0f7b13f3f0512e6546b8d2c0d56e564a2408536a,2020-10-21,blood_glucose,166.0,0.0,0.0
6,0f7b13f3f0512e6546b8d2c0d56e564a2408536a,2020-10-22,blood_glucose,142.0,0.0,0.0
7,87be2f14a5550389cb2cba03b3329c54c993f7d2,2020-10-12,weight,129.060013,0.0,0.0
8,0efe1f378aec122877e5f24f204ea70709b1f5f8,2020-10-07,blood_glucose,138.0,0.0,0.0
9,054250c692e07a9fa9e62e345231df4b54ff435d,2020-10-04,blood_glucose,210.0,,


## Unique measures

In [4]:
query_str = """SELECT DISTINCT measure FROM user_logs;"""

pd.read_sql(query_str, connection)

Unnamed: 0,measure
0,weight
1,blood_pressure
2,blood_glucose


## Structure Summary

In [5]:
# Table you want to examine
table_name = 'user_logs'

# Get column information for the specific table
cursor = connection.cursor()
columns_query = f"""
SELECT COLUMN_NAME, DATA_TYPE
FROM information_schema.columns
WHERE table_name = '{table_name}'
"""
cursor.execute(columns_query)
columns = cursor.fetchall()

# Get row count for the specific table
row_count_query = f"""
SELECT COUNT(*) FROM {table_name}
"""
cursor.execute(row_count_query)
row_count = cursor.fetchone()[0]

print(f"Table: \033[1m{table_name}\033[0m\n")
print(f"Columns: \033[1m{len(columns)}\033[0m")
print(f"Rows: \033[1m{row_count}\033[0m\n")

# Display column information
for column in columns:
    column_name, data_type = column
    print(f"\tColumn: \033[1m{column_name}\033[0m ({data_type})")

# Initialize a dictionary to store the count of missing values for each column
missing_value_counts = {}

# Loop through the columns and calculate the count of missing values
for column in columns:
    column_name = column.COLUMN_NAME
    missing_value_query = f"""
    SELECT COUNT(*) FROM {table_name}
    WHERE {column_name} IS NULL
    """
    cursor.execute(missing_value_query)
    missing_value_count = cursor.fetchone()[0]
    if missing_value_count > 0:
        missing_value_counts[column_name] = missing_value_count

# Display the columns with missing values
print(f"\nColumns with Missing Values in {table_name}:")
for column_name, missing_count in missing_value_counts.items():
    print(f"\t\033[1m{column_name}\033[0m {missing_count}")
    
# Close the cursor
cursor.close()


Table: [1muser_logs[0m

Columns: [1m6[0m
Rows: [1m43891[0m

	Column: [1mid[0m (nvarchar)
	Column: [1mlog_date[0m (datetime)
	Column: [1mmeasure[0m (nvarchar)
	Column: [1mmeasure_value[0m (float)
	Column: [1msystolic[0m (float)
	Column: [1mdiastolic[0m (float)

Columns with Missing Values in user_logs:
	[1msystolic[0m 26023
	[1mdiastolic[0m 26023


## Total record count

Let's also take a look at the total record count.

In [6]:
query_str = """
SELECT 
  COUNT(*) AS 'count'
FROM user_logs;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,count
0,43891


## Unique column count

We'll take a look at how many unique id's are present in the dataset. That'll give us a count of the total number of users.

In [7]:
query_str = """
SELECT COUNT(DISTINCT id) AS 'count'
FROM user_logs;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,count
0,554


## Single column frequency counts

Let's take a look at the measure column and see frequency and the percentage count of each value across the table.

In [8]:
query_str = """
SELECT 
  measure,
  COUNT(*) AS frequency,
  ROUND(100.0* COUNT(*)/SUM(COUNT(*)) OVER(), 2) AS percentage
FROM user_logs
GROUP BY measure
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,frequency,percentage
0,blood_glucose,38692,88.15
1,weight,2782,6.34
2,blood_pressure,2417,5.51


Let's also see the frequency of unique id's that appear in the dataset and limit the output to just 5.

In [9]:
query_str = """
SELECT TOP (5)
  id,
  COUNT(*) AS frequency,
  ROUND(100.0* COUNT(*)/SUM(COUNT(*)) OVER(),
  2) AS percentage
FROM user_logs
GROUP BY id
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,id,frequency,percentage
0,054250c692e07a9fa9e62e345231df4b54ff435d,22325,50.86
1,0f7b13f3f0512e6546b8d2c0d56e564a2408536a,1589,3.62
2,ee653a96022cc3878e76d196b1667d95beca2db6,1235,2.81
3,abc634a555bbba7d6d6584171fdfa206ebf6c9a0,1212,2.76
4,576fdb528e5004f733912fae3020e7d322dbc31a,1018,2.32


## Individual column distribution

Let's now take a look at the most frequent values across each column.

`Measure Value Column`

In [10]:
query_str = """
SELECT TOP(10)
  measure_value,
  COUNT(*) AS frequency
FROM user_logs
GROUP BY measure_value
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure_value,frequency
0,0.0,572
1,401.0,433
2,117.0,390
3,118.0,346
4,123.0,342
5,122.0,331
6,126.0,326
7,120.0,323
8,115.0,319
9,108.0,319


`Systolic column`

In [11]:
query_str = """
SELECT TOP(10)
  systolic,
  COUNT(*) AS frequency
FROM user_logs
GROUP BY systolic
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,systolic,frequency
0,,26023
1,0.0,15451
2,120.0,71
3,123.0,70
4,128.0,66
5,127.0,64
6,119.0,60
7,130.0,60
8,135.0,57
9,136.0,55


There are many null and zero values. We'll explore this later.

`Diastolic column`

In [12]:
query_str = """
SELECT  TOP(10)
  diastolic,
  COUNT(*) AS frequency
FROM user_logs
GROUP BY diastolic
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,diastolic,frequency
0,,26023
1,0.0,15449
2,80.0,156
3,79.0,124
4,81.0,119
5,78.0,110
6,77.0,109
7,73.0,109
8,83.0,106
9,76.0,102


This is somewhat similar output if compared to systolic column.

## Deep dive into the specific values

Since there are many 0 values in the measure_value column and some large number of nulls in systolic, diastolic. 

Let's take a look to see if the measure_value = 0 only when there is a specific measure value. We can use the WHERE clause here.

In [13]:
query_str = """
SELECT 
  measure,
  COUNT(*) AS frequency
FROM user_logs
WHERE measure_value = 0
GROUP BY measure
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,frequency
0,blood_pressure,562
1,blood_glucose,8
2,weight,2


In [14]:
query_str = """
SELECT 
  measure,
  COUNT(*) AS frequency,
  ROUND(100.0* COUNT(*)/SUM(COUNT(*)) OVER(),
  2) AS percentage
FROM user_logs
GROUP BY measure
ORDER BY frequency DESC;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,frequency,percentage
0,blood_glucose,38692,88.15
1,weight,2782,6.34
2,blood_pressure,2417,5.51


It appears that `measure_value` = 0 when the `measure` is blood_pressure.

Let's explore further.

In [15]:
query_str = """
SELECT TOP(10)
  measure,
  measure_value,
  systolic,
  diastolic
FROM user_logs
WHERE measure = 'blood_pressure'
AND measure_value = 0;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,measure_value,systolic,diastolic
0,blood_pressure,0.0,115.0,76.0
1,blood_pressure,0.0,115.0,76.0
2,blood_pressure,0.0,105.0,70.0
3,blood_pressure,0.0,136.0,87.0
4,blood_pressure,0.0,164.0,84.0
5,blood_pressure,0.0,190.0,94.0
6,blood_pressure,0.0,125.0,79.0
7,blood_pressure,0.0,136.0,84.0
8,blood_pressure,0.0,135.0,89.0
9,blood_pressure,0.0,138.0,85.0


It appears that when blood_pressure is measured, the systolic and diastolic columns are populated but the measure_value is blank. 

Lets explore when the measure is blood_pressure but measure_value!=0.

In [16]:
query_str = """
SELECT TOP(10)
  measure,
  measure_value,
  systolic,
  diastolic
FROM user_logs
WHERE measure = 'blood_pressure'
AND measure_value is NOT NULL
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,measure_value,systolic,diastolic
0,blood_pressure,140.0,140.0,113.0
1,blood_pressure,114.0,114.0,80.0
2,blood_pressure,132.0,132.0,94.0
3,blood_pressure,105.0,105.0,68.0
4,blood_pressure,149.0,149.0,85.0
5,blood_pressure,156.0,156.0,88.0
6,blood_pressure,142.0,142.0,84.0
7,blood_pressure,131.0,131.0,71.0
8,blood_pressure,128.0,128.0,77.0
9,blood_pressure,114.0,114.0,76.0


So, it looks like whenever blood_pressure is measured, measure_value is populated with systolic and sometimes it is equal to 0.

Let's check the same for the null values of systolic and diastolic.

In [17]:
query_str = """
SELECT TOP(10)
  measure,
  count(*)
FROM user_logs
WHERE systolic is NULL
GROUP BY measure
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,Unnamed: 2
0,weight,443
1,blood_glucose,25580


This confirms that systolic only has non-null values when `measure` = **blood_pressure**. 

Let's see if it is similiar for the diastolic column.

In [18]:
query_str = """
SELECT TOP(10)
  measure,
  count(*)
FROM user_logs
WHERE diastolic is NULL
GROUP BY measure;
"""
pd.read_sql(query_str, connection)

Unnamed: 0,measure,Unnamed: 2
0,weight,443
1,blood_glucose,25580


Non-null values are only present when ``measure='blood_pressure'``.

---

**Close connection**

In [19]:
# Close the database connection
connection.close()

---

<a id="Question_7"></a>

<a id="Question_8"></a>

<a id="Question_9"></a>