# Introduction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from scipy.stats import describe
from sklearn.linear_model import LinearRegression

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Locate the dataset file under the Kaggle input directory.
# os.walk yields entries in arbitrary filesystem order, so both the directory
# and file names are sorted to make the selected file deterministic: `path`
# ends up as the last file in sorted traversal order.
path = None  # stays None when the input directory contains no files
for dirname, dirnames, filenames in os.walk('/kaggle/input'):
    dirnames.sort()  # in-place sort controls the order os.walk descends in
    for filename in sorted(filenames):
        path = os.path.join(dirname, filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset located by the input-directory scan into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

Using data from the National Health and Nutrition Examination Survey, we will test the assumption that the relationship between a person's weight and height can be described by a simple formula <br>
$ weight = height-100 $ <br>

For this, from the whole series of features, we will use only
* Weight (kg) - weight
* Standing Height (cm) - standing height
* BMI (kg / m ** 2) - body mass index

https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_BMX.htm

In [None]:
# Names of the three dataset columns analysed in this notebook.
X_columns = [
    'Weight (kg)',
    'Standing Height (cm)',
    'BMI(kg/m**2)',
]

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

In [None]:
# One histogram (with a KDE overlay) per analysed column, drawn side by side.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for axis, column in zip(ax, X_columns):
    sns.histplot(data=df, x=column, kde=True, linewidth=0, ax=axis)

# Red vertical markers on the BMI panel at BMI = 18.5 and BMI = 25.
bmi_axis = ax[2]
bmi_axis.plot([18.5, 18.5], [1, 80], color='red')
bmi_axis.plot([25, 25], [1, 410], color='red')

fig.suptitle('Distribution of weight and height', fontsize=16)
plt.show()

The distributions of weight, height and BMI look approximately normal, but they are skewed: values significantly higher than the average are over-represented.

# Normal body mass index

Consider the studied dependence on observations with BMI from 18.5 to 24.9, corresponding to the normal body mass index. It should be noted right away that there are fewer such observations.

In [None]:
# Boolean mask of the observations whose BMI lies strictly between 18.5 and 24.9.
bmi_values = df['BMI(kg/m**2)']
is_norm_BMI = ((bmi_values > 18.5) & (bmi_values < 24.9)).to_numpy()
unique, counts = np.unique(is_norm_BMI, return_counts=True)

# np.unique returns sorted values, so the counts are ordered (False, True)
# and line up with the labels below when both values occur.
plt.pie(x=counts, labels=['Not normal', 'Normal'], explode=[0, 0.1], autopct='%1.1f%%')
plt.title('Count normal weight')
plt.show()


In [None]:
# df_n holds the observations with a normal BMI (18.5 < BMI < 24.9).
# .copy() makes df_n an independent frame, so adding a column to it does not
# raise SettingWithCopyWarning or depend on view-versus-copy behaviour.
df_n = df[(18.5 < df['BMI(kg/m**2)']) & (df['BMI(kg/m**2)'] < 24.9)].copy()

# Weight predicted by the rule "weight = height - 100", computed from df_n's own rows.
df_n['Height-100'] = df_n['Standing Height (cm)'] - 100

df_n.head()

In [None]:
# Left panel: overlaid density curves of actual weight and height-100.
# Right panel: notched box plots of the same two columns.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
density_ax, box_ax = ax

for column in ('Weight (kg)', 'Height-100'):
    sns.kdeplot(data=df_n, x=column, label=column, ax=density_ax)
density_ax.legend()

sns.boxplot(data=df_n[['Weight (kg)', 'Height-100']], notch=True, ax=box_ax)

fig.suptitle('Distribution of weight and height-100')
plt.show()

In [None]:
# Summary statistics of actual weight and height-100 for the normal-BMI group.
df_n.loc[:, ['Weight (kg)', 'Height-100']].describe()

On average, we have that the values of height-100 are greater than the values of weight.
Let us establish how statistically significant these differences are.

Let <br>
$ H_0 $ - for the values of weight and height-100, the averages are equal <br>
$ H_1 $ - for values of weight and height-100, the means differ

In [None]:
# Two-sample t-test comparing the mean of height-100 with the mean of weight.
# NOTE(review): both samples come from the same rows of df_n, so a paired test
# may be more appropriate than an independent one — confirm the intended design.
t_stud, p = ttest_ind(a=df_n['Height-100'], b=df_n['Weight (kg)'])
print('t={:.2f}, p={:.2e}'.format(t_stud, p))

Student's t-test showed that the difference between the mean values is statistically significant, so we can reject the null hypothesis.

Next, consider how much the values differ.

In [None]:
# Per-observation difference between height-100 and weight in the normal-BMI group.
diff = (df_n['Height-100'] - df_n['Weight (kg)']).to_numpy()

plt.hist(diff, bins=20)
plt.title('Differences Height-100 and Weight')
plt.xlabel('Difference')
plt.ylabel('Count')
plt.show()

In [None]:
# Print the mean, maximum, minimum and standard deviation of the differences,
# each rounded to two decimals.
for template, statistic in (
    ('mean: {:.2f}', diff.mean()),
    ('max: {:.2f}', diff.max()),
    ('min: {:.2f}', diff.min()),
    ('std: {:.2f}', diff.std()),
):
    print(template.format(statistic))

On average, the indicators differ by 4.26 points.

# Overweight

Let us consider the height and weight values for people with a BMI from 25 to 30.

In [None]:
# Boolean mask of the observations with 24.9 <= BMI < 30 (the overweight group).
# The variable keeps its original name even though it now marks overweight rows.
bmi_values = df['BMI(kg/m**2)']
is_norm_BMI = ((bmi_values >= 24.9) & (bmi_values < 30)).to_numpy()
unique, counts = np.unique(is_norm_BMI, return_counts=True)

# np.unique returns sorted values, so the counts are ordered (False, True)
# and line up with the labels below when both values occur.
plt.pie(x=counts, labels=['Other', 'Overweight'], explode=[0, 0.1], autopct='%1.1f%%')
plt.title('Count overweight')
plt.show()

In [None]:
# df_o holds the overweight observations (24.9 <= BMI < 30).
# .copy() makes df_o an independent frame, so adding a column to it does not
# raise SettingWithCopyWarning or depend on view-versus-copy behaviour.
df_o = df[(24.9 <= df['BMI(kg/m**2)']) & (df['BMI(kg/m**2)'] < 30)].copy()

# Weight predicted by the rule "weight = height - 100", computed from df_o's own rows.
df_o['Height-100'] = df_o['Standing Height (cm)'] - 100

df_o.head()

In [None]:
# Left panel: overlaid density curves of actual weight and height-100 for the
# overweight group. Right panel: notched box plots of the same two columns.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
density_ax, box_ax = ax

for column in ('Weight (kg)', 'Height-100'):
    sns.kdeplot(data=df_o, x=column, label=column, ax=density_ax)
density_ax.legend()

sns.boxplot(data=df_o[['Weight (kg)', 'Height-100']], notch=True, ax=box_ax)

fig.suptitle('Distribution of weight and height-100')
plt.show()

There is also a difference here, but now the actual weight is higher than expected.

In [None]:
# Per-observation difference between height-100 and weight in the overweight group.
diff = (df_o['Height-100'] - df_o['Weight (kg)']).to_numpy()

plt.hist(diff, bins=20)
plt.title('Differences Height-100 and Weight')
plt.xlabel('Difference')
plt.ylabel('Count')
plt.show()

In [None]:
# Print the mean, maximum, minimum and standard deviation of the differences,
# each rounded to two decimals.
for template, statistic in (
    ('mean: {:.2f}', diff.mean()),
    ('max: {:.2f}', diff.max()),
    ('min: {:.2f}', diff.min()),
    ('std: {:.2f}', diff.std()),
):
    print(template.format(statistic))

On average, the indicators differ by 9.68 points versus 4.26 for people with a normal body mass index.

Thus, we can say that for people with normal BMI and overweight, the values for weight and height -100 are on average similar. <br>
$ weight \approx height-100 $ <br>
At the same time, one cannot categorically demand equality. Obviously, the stronger the deviation of body weight from the norm, the less plausible such a ratio becomes.

# When equality is true

Let's see in what cases this equality is satisfied.

In [None]:
# Add the predicted weight (height - 100) and its deviation from the actual
# weight to the full dataset.
df['Height-100'] = df['Standing Height (cm)'].sub(100)
df['Diff'] = df['Height-100'].sub(df['Weight (kg)'])

df.head()

In [None]:
# Keep only rows whose difference lies strictly between -100 and 100;
# rows with a missing difference fail both comparisons and are dropped.
in_range = (df['Diff'] > -100) & (df['Diff'] < 100)
df_e = df.loc[in_range]
df_e.head()

In [None]:
# Joint distribution of BMI and the height-100 / weight difference.
sns.jointplot(
    x='BMI(kg/m**2)',
    y='Diff',
    data=df_e,
)
plt.show()

The smallest differences are in the group with BMI from 20 to 25, i.e. with a normal body mass index.

# Weight and height dependence

In [None]:
# Weight against height for all observations, with the reference line
# weight = height - 100 drawn in red from (140, 40) to (200, 100).
sns.scatterplot(data=df, x='Standing Height (cm)', y='Weight (kg)')
plt.plot([140, 200], [40, 100], linewidth=3, color='red')
# Render the figure. The original ended with a bare plt.plot(), which draws
# nothing and echoes an empty list as the cell output.
plt.show()

There is no strong direct relationship between height and weight in the data. If the equality <br>
$ weight = height-100 $ <br>
held, the points on the graph would be approximated by this straight line, but most of the points lie above it. I.e., on average, the real weight is greater than that predicted by this ratio.

Let's build a linear regression model to establish the relationship between height and weight.

In [None]:
# Build the regression inputs from rows where both height and weight are present:
# LinearRegression.fit raises on NaN, so incomplete rows are dropped first.
# When the data has no missing values this selects exactly the original rows.
height_weight = df[['Standing Height (cm)', 'Weight (kg)']].dropna()

# scikit-learn expects a 2-D feature matrix, hence the reshape to a single column.
x = height_weight['Standing Height (cm)'].values.reshape(-1, 1)
y = height_weight['Weight (kg)'].values

In [None]:
# Fit an ordinary least-squares line predicting weight from height.
lr = LinearRegression()
lr.fit(X=x, y=y)

In [None]:
# Slope (k) and intercept (b) of the fitted line weight = k * height + b.
k, b = lr.coef_[0], lr.intercept_

for template, coefficient in (('k = {:.2f}', k), ('b = {:.2f}', b)):
    print(template.format(coefficient))

In [None]:
# Weight against height with two lines over heights 140-200:
# red    - the rule weight = height - 100
# yellow - the fitted regression line weight = k * height + b
sns.scatterplot(data=df, x='Standing Height (cm)', y='Weight (kg)')
plt.plot([140, 200], [40, 100], linewidth=2, color='red')
plt.plot([140, 200], [140*k+b, 200*k+b], linewidth=3, color='yellow')
# Render the figure. The original ended with a bare plt.plot(), which draws
# nothing and echoes an empty list as the cell output.
plt.show()

The formula for calculating the dependence of weight on height, obtained by the linear regression model, looks like this <br>
$ weight = 0.97 \times height-78 $ <br>

Let's check how much closer the values obtained with a different offset are to the real weight.

In [None]:
# Alternative predictions with smaller offsets: height - 80 and height - 90.
standing_height = df['Standing Height (cm)']
df['Height-80'] = standing_height - 80
df['Height-90'] = standing_height - 90

In [None]:
# Actual weight alongside the three height-offset predictions.
weights = ['Weight (kg)', 'Height-100', 'Height-90', 'Height-80']

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
density_ax, box_ax = ax

# Left panel: one density curve per column; right panel: notched box plots.
for column in weights:
    sns.kdeplot(data=df, x=column, label=column, ax=density_ax)
density_ax.legend()

sns.boxplot(data=df[weights], notch=True, ax=box_ax)

fig.suptitle('Distributions')
plt.show()

The measures of central tendency for real weight are similar to those for height-90.

# Outcome

The considered formula can be used to describe the dependence of weight on height if we want to obtain the desired weight values for the normal body mass index.

However, it does not reflect the real situation. In the presented observations, the weight on average is much higher than the predicted one. Therefore, this dependence is defined a little more accurately as <br>
$ weight = 0.97 \times height-78 $. <br>
But even in this case, the spread in values is too great to be linearly approximated.