# Data Analysis

### 1. Load the Data from Google Sheets

In [787]:
# Import python library gspread to read the Google Sheet
import gspread

In [788]:
# Use the service_account method to give service account details to gspread
sa = gspread.service_account(filename="extended-pagoda-419111-6c5d3c5c6b77.json")

In [789]:
# Use the sa gspread client to connect to the Google Sheet
sheet = sa.open("2024: Body Composition Tracking")

# Use the worksheet method to define worksheet to open
work_sheet = sheet.worksheet("Weighings")

In [790]:
# Import pandas to extract the data
import pandas as pd

# Import numpy to manipulate the data
import numpy as np

# Use the get_all_records to extract the data to a dataframe
df = pd.DataFrame(work_sheet.get_all_records())

### 2. Understanding the Data

In [791]:
# Review the dimensions of the dataset
print(df.shape)

(190, 16)


In [792]:
# Look at the data types for each column
df.dtypes

                        object
DATE                    object
TIME                    object
WEIGHT                  object
BMI                     object
BODY FAT                object
FAT-FREE BODY WEIGHT    object
SUB FAT                 object
VIS FAT                 object
BODY WATER              object
SKE MUSCLE              object
MUSCLE MASS             object
BONE MASS               object
PROTEIN                 object
BMR                     object
AGE                     object
dtype: object

In [793]:
# Read the first 5 rows of the dataset
df.head()

Unnamed: 0,Unnamed: 1,DATE,TIME,WEIGHT,BMI,BODY FAT,FAT-FREE BODY WEIGHT,SUB FAT,VIS FAT,BODY WATER,SKE MUSCLE,MUSCLE MASS,BONE MASS,PROTEIN,BMR,AGE
0,,1 Nov,07:03,167.4,24.8,24.1%,127.1,16.2%,9.5,52.0%,52.2%,119.0,6.5,19.9%,1619,38
1,,2 Nov,07:03,166.2,24.6,23.8%,126.6,16.0%,9.4,52.2%,52.4%,118.6,6.6,20.0%,1614,38
2,,9 Nov,07:04,166.4,24.6,23.9%,126.6,16.1%,9.4,52.1%,52.3%,118.6,6.6,20.0%,1615,38
3,,16 Nov,07:04,166.6,24.7,24.0%,126.6,16.1%,9.5,52.1%,52.2%,118.6,6.5,20.0%,1616,38
4,,23 Nov,07:03,165.8,24.6,23.8%,126.3,16.0%,9.4,52.2%,52.4%,118.5,6.6,20.1%,1611,38


In [794]:
# Read the last 5 rows of the dataset
df.tail()

Unnamed: 0,Unnamed: 1,DATE,TIME,WEIGHT,BMI,BODY FAT,FAT-FREE BODY WEIGHT,SUB FAT,VIS FAT,BODY WATER,SKE MUSCLE,MUSCLE MASS,BONE MASS,PROTEIN,BMR,AGE
185,,28 May,,,,,,,,,,,,,,
186,,29 May,,,,,,,,,,,,,,
187,,30 May,,,,,,,,,,,,,,
188,,31 May,,,,,,,,,,,,,,
189,,1 Jun,,,,,,,,,,,,,,


In [795]:
# Return a list of column names
df.columns.values.tolist()

['',
 'DATE',
 'TIME',
 'WEIGHT',
 'BMI',
 'BODY FAT',
 'FAT-FREE BODY WEIGHT',
 'SUB FAT',
 'VIS FAT',
 'BODY WATER',
 'SKE MUSCLE',
 'MUSCLE MASS',
 'BONE MASS',
 'PROTEIN',
 'BMR',
 'AGE']

### 3. Check for missing values

In [796]:
# In the list of columns, there is an empty column with potentially no data
# Review the contents of the column
print(df[""].tolist)

<bound method IndexOpsMixin.tolist of 0       
1       
2       
3       
4       
      ..
185     
186     
187     
188     
189     
Name: , Length: 190, dtype: object>


In [797]:
# As the column has no data, this will be dropped
df.drop("", axis = 1, inplace=True)

# Return column names to see the updated list of columns
df.columns.values

array(['DATE', 'TIME', 'WEIGHT', 'BMI', 'BODY FAT',
       'FAT-FREE BODY WEIGHT', 'SUB FAT', 'VIS FAT', 'BODY WATER',
       'SKE MUSCLE', 'MUSCLE MASS', 'BONE MASS', 'PROTEIN', 'BMR', 'AGE'],
      dtype=object)

In [798]:
# Look for missing values
df.isnull().sum()

DATE                    0
TIME                    0
WEIGHT                  0
BMI                     0
BODY FAT                0
FAT-FREE BODY WEIGHT    0
SUB FAT                 0
VIS FAT                 0
BODY WATER              0
SKE MUSCLE              0
MUSCLE MASS             0
BONE MASS               0
PROTEIN                 0
BMR                     0
AGE                     0
dtype: int64

In [799]:
# In the tail there are empty strings for some future dates

# Find empty strings elsewhere

In [800]:
# As we don't have the data for the empty strings, we will replace them and the empty strings for future dates

# Replace with NaN
df = df.replace(r'^s*$', np.nan, regex=True)
df.tail()

Unnamed: 0,DATE,TIME,WEIGHT,BMI,BODY FAT,FAT-FREE BODY WEIGHT,SUB FAT,VIS FAT,BODY WATER,SKE MUSCLE,MUSCLE MASS,BONE MASS,PROTEIN,BMR,AGE
185,28 May,,,,,,,,,,,,,,
186,29 May,,,,,,,,,,,,,,
187,30 May,,,,,,,,,,,,,,
188,31 May,,,,,,,,,,,,,,
189,1 Jun,,,,,,,,,,,,,,


In [801]:
# Drop rows that contain NaN
df = df.dropna()
df.tail()

Unnamed: 0,DATE,TIME,WEIGHT,BMI,BODY FAT,FAT-FREE BODY WEIGHT,SUB FAT,VIS FAT,BODY WATER,SKE MUSCLE,MUSCLE MASS,BONE MASS,PROTEIN,BMR,AGE
126,30 Mar,07:05,159.2,23.6,22.1%,124.0,14.9%,8.7,53.2%,53.5%,116.2,6.5,20.7%,1574.0,38.0
127,31 Mar,09:47,160.8,23.8,22.6%,124.5,15.2%,8.9,52.9%,53.2%,116.7,6.4,20.5%,1583.0,38.0
128,1 Apr,07:06,161.2,23.9,22.6%,124.8,15.2%,8.9,52.9%,53.2%,117.0,6.4,20.5%,1585.0,38.0
129,2 Apr,08:27,160.2,23.7,22.4%,124.3,15.1%,8.8,53.0%,53.3%,116.4,6.4,20.6%,1580.0,38.0
130,3 Apr,07:04,160.2,23.7,22.4%,124.3,15.1%,8.8,53.0%,53.4%,116.6,6.4,20.6%,1580.0,38.0


In [802]:
# Validate that there are no remaining NaN values
df.isnull().sum()

DATE                    0
TIME                    0
WEIGHT                  0
BMI                     0
BODY FAT                0
FAT-FREE BODY WEIGHT    0
SUB FAT                 0
VIS FAT                 0
BODY WATER              0
SKE MUSCLE              0
MUSCLE MASS             0
BONE MASS               0
PROTEIN                 0
BMR                     0
AGE                     0
dtype: int64

### 4. Data Cleaning

In [803]:
# Review updated data types for each column
df.dtypes

DATE                     object
TIME                     object
WEIGHT                  float64
BMI                     float64
BODY FAT                 object
FAT-FREE BODY WEIGHT    float64
SUB FAT                  object
VIS FAT                 float64
BODY WATER               object
SKE MUSCLE               object
MUSCLE MASS             float64
BONE MASS               float64
PROTEIN                  object
BMR                     float64
AGE                     float64
dtype: object

In [804]:
# Make columns easier to work with

# Update column names to be lowercase
df.rename(columns=str.lower, inplace=True)

# Replace spaces in column names with hyphens
df.columns = df.columns.str.replace(' ', '-')

In [805]:
# Update column names related to weight to reflect that values are in lbs
df.rename(columns={"weight":"weight(lbs)", "fat-free-body-weight":"fat-free-weight(lbs)", "muscle-mass":"muscle-mass(lbs)","bone-mass":"bone-mass(lbs)"}, inplace=True)

In [806]:
# Convert columns with percentage values to float data type

# Strip out percentage sign
df["body-fat"] = df["body-fat"].str.replace("%", "")
df["sub-fat"] = df["sub-fat"].str.replace("%", "")
df["body-water"] = df["body-water"].str.replace("%", "")
df["ske-muscle"] = df["ske-muscle"].str.replace("%", "")
df["protein"] = df["protein"].str.replace("%", "")

# Convert percentage columns to floats
df[["body-fat", "sub-fat"]] = df[["body-fat", "sub-fat"]].astype(float)
df[["body-water", "ske-muscle", "protein"]] = df[["body-water", "ske-muscle", "protein"]].astype(float)

# Divide values in percentage columns by 100
df[["body-fat", "sub-fat", "body-water", "ske-muscle", "protein"]] = df[["body-fat", "sub-fat", "body-water", "ske-muscle", "protein"]].div(100)

In [807]:
# Update column names to reflect that the column values are percentages
df.rename(columns={"body-fat":"body-fat(%)", "sub-fat":"sub-fat(%)", "body-water":"body-water(%)", "ske-muscle":"ske-muscle(%)", "protein":"protein(%)"}, inplace=True)

In [808]:
# Review updated data types
df.dtypes

date                     object
time                     object
weight(lbs)             float64
bmi                     float64
body-fat(%)             float64
fat-free-weight(lbs)    float64
sub-fat(%)              float64
vis-fat                 float64
body-water(%)           float64
ske-muscle(%)           float64
muscle-mass(lbs)        float64
bone-mass(lbs)          float64
protein(%)              float64
bmr                     float64
age                     float64
dtype: object

In [None]:
# Convert bmr and age columns to integers
df[["bmr", "age"]] = df[["bmr", "age"]].astype(int)

In [None]:
#df['time']=pd.to_datetime(df['time'])
#df['date']=pd.to_datetime(df['date'])

In [None]:
# Get a statistical summary of each column
df.describe()