In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
# Load the Davis data set.
davis_df = pd.read_csv("/kaggle/input/davis-data-set/Davis.csv")

# Get rid of impossible values: keep rows whose height is above 140 and drop
# every row that still contains a missing value.
davis_df = davis_df[davis_df.height > 140].dropna()

# Split 50/50 into train and test, stratified by sex so both halves keep the
# same sex proportions. The fixed random_state makes the split -- and every
# number derived from it further down -- reproducible on "Restart & Run All".
davis_train, davis_test = train_test_split(
    davis_df,
    train_size=0.5,
    stratify=davis_df.sex,
    random_state=42,
)

In [None]:
# Weight against height for the training half, coloured by sex.
sns.scatterplot(x='weight', y='height', hue='sex', data=davis_train)

In [None]:
# Histograms of the measured values and of the (reported - measured)
# differences, for everyone and split by sex.

# Filter each sex once instead of repeating the boolean mask for every column.
male_df = davis_df[davis_df.sex == 'M']
female_df = davis_df[davis_df.sex == 'F']

overall_weight_df = davis_df.weight
male_weight_df = male_df.weight
female_weight_df = female_df.weight

overall_height_df = davis_df.height
male_height_df = male_df.height
female_height_df = female_df.height

overall_weight_lie_df = davis_df.repwt - davis_df.weight
male_weight_lie_df = male_df.repwt - male_df.weight
female_weight_lie_df = female_df.repwt - female_df.weight

overall_height_lie_df = davis_df.repht - davis_df.height
male_height_lie_df = male_df.repht - male_df.height
female_height_lie_df = female_df.repht - female_df.height

# One entry per subplot row: (quantity shown in the title, number of bins,
# the three Series in column order overall / male / female).
hist_rows = [
    ("weight", 30, (overall_weight_df, male_weight_df, female_weight_df)),
    ("height", 30, (overall_height_df, male_height_df, female_height_df)),
    ("repwt - weight", 15,
     (overall_weight_lie_df, male_weight_lie_df, female_weight_lie_df)),
    ("repht - height", 15,
     (overall_height_lie_df, male_height_lie_df, female_height_lie_df)),
]
group_names = ("Overall", "Male", "Female")

fig, axes = plt.subplots(len(hist_rows), len(group_names), figsize=(15,15))

for row_axes, (quantity, bins, series_by_group) in zip(axes, hist_rows):
    for ax, group, series in zip(row_axes, group_names, series_by_group):
        ax.set_title(f"{group} {quantity}")
        series.hist(bins=bins, ax=ax)

# Keep subplot titles from colliding with the tick labels of the row above.
fig.tight_layout()

In [None]:
class HumanGenerator:
    """Generative model of (sex, weight, height) samples.

    Sex is drawn as "F" with probability ``p_female`` and "M" otherwise.
    Conditional on sex, (weight, height) is drawn from a bivariate normal
    distribution whose mean and covariance are estimated per sex by ``fit``.

    Attributes:
        p_female: fraction of rows with sex "F" seen by ``fit``.
        mean_height, mean_weight: per-sex sample means, keyed "female"/"male".
        covariance: per-sex 2x2 covariance of (weight, height), same keys.

    All parameters are ``None`` until ``fit`` has been called.
    """

    # (key used in the parameter dicts, code used in the ``sex`` column)
    _SEX_CODES = (("female", "F"), ("male", "M"))

    def __init__(self):
        self.p_female = None
        self.mean_height = {"female": None, "male": None}
        self.mean_weight = {"female": None, "male": None}
        self.covariance = {"female": None, "male": None}

    def _check_fitted(self):
        """Raise a clear error when the model is used before ``fit``."""
        if self.p_female is None:
            raise RuntimeError(
                "HumanGenerator is not fitted yet; call fit() first."
            )

    def _mean_vector(self, label):
        """Return the [weight, height] mean for "female" or "male"."""
        return [self.mean_weight[label], self.mean_height[label]]

    def fit(self, X):
        """Estimate the model parameters from data.

        Args:
            X: DataFrame with columns ``sex`` (values "F"/"M"), ``weight``
                and ``height``.

        Raises:
            ValueError: if ``X`` has no rows.
        """
        if len(X) == 0:
            raise ValueError("cannot fit HumanGenerator on an empty data set")

        groups = {label: X[X.sex == code] for label, code in self._SEX_CODES}
        self.p_female = len(groups["female"]) / len(X)

        for label, group in groups.items():
            self.mean_height[label] = np.mean(group.height)
            self.mean_weight[label] = np.mean(group.weight)
            # Row/column order is (weight, height), matching _mean_vector.
            self.covariance[label] = np.cov(group.weight, group.height)

    def generate_samples(self, n, random_state=None):
        """Draw ``n`` synthetic samples from the fitted model.

        Args:
            n: number of samples to draw.
            random_state: optional integer seed. When given, the draw is
                reproducible; when ``None`` (default) NumPy's global random
                state is used, as before.

        Returns:
            DataFrame with columns ``sex`` ("F"/"M"), ``weight``, ``height``.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        self._check_fitted()
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        # Draw every sex label at once, then sample each sex in a single call
        # instead of one multivariate draw per row.
        is_female = rng.uniform(size=n) < self.p_female
        values = np.empty((n, 2))
        for label, mask in (("female", is_female), ("male", ~is_female)):
            count = int(mask.sum())
            if count:  # skip sexes that were not drawn at all
                values[mask] = rng.multivariate_normal(
                    mean=self._mean_vector(label),
                    cov=self.covariance[label],
                    size=count,
                )

        sex = np.where(is_female, "F", "M").astype(object)
        return pd.DataFrame(dict(sex=sex, weight=values[:, 0], height=values[:, 1]))

    def log_likehood(self, X):
        """Return the joint log-likelihood log p(sex, weight, height) per row.

        The result lists all "F" rows first, followed by all "M" rows (each in
        their original relative order); rows with any other ``sex`` value are
        ignored.

        Computed as ``log p(sex) + logpdf(weight, height | sex)`` so that
        points far from the mean yield a large negative number instead of
        underflowing to ``log(0) = -inf``.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        self._check_fitted()
        priors = {"female": self.p_female, "male": 1 - self.p_female}

        parts = []
        for label, code in self._SEX_CODES:
            points = X.loc[X.sex == code, ["weight", "height"]].to_numpy(dtype=float)
            if len(points) == 0:
                continue
            log_density = scipy.stats.multivariate_normal.logpdf(
                points,
                mean=self._mean_vector(label),
                cov=self.covariance[label],
            )
            parts.append(np.atleast_1d(np.log(priors[label]) + log_density))

        return np.concatenate(parts) if parts else np.empty(0)

    def mean_log_likehood(self, X):
        """Return the average of ``log_likehood(X)`` over all scored rows."""
        return self.log_likehood(X).mean()

    # Correctly spelled aliases; the original (misspelled) names stay available.
    log_likelihood = log_likehood
    mean_log_likelihood = mean_log_likehood

In [None]:
# Fit the generator on the training half, draw 1000 synthetic samples, and
# report the mean log-likelihood of the train, test and generated data.
generator = HumanGenerator()
generator.fit(davis_train)
new_samples = generator.generate_samples(1000)

train_score = generator.mean_log_likehood(davis_train)
test_score = generator.mean_log_likehood(davis_test)
generated_score = generator.mean_log_likehood(new_samples)

print(f"Train: {train_score}")
print(f"Test: {test_score}")
print(f"Generated: {generated_score}")

In [None]:
# Side by side: the full original data set (left) and the generated samples (right).
fig, axis = plt.subplots(1, 2, figsize=(15,3))
original_ax, generated_ax = axis

original_ax.set_title("Original data")
sns.scatterplot(x='weight', y='height', hue='sex', data=davis_df, ax=original_ax)

generated_ax.set_title("Generated data")
sns.scatterplot(x='weight', y='height', hue='sex', data=new_samples, ax=generated_ax)

In [None]:
# Gradient descent on the raw (unnormalised) data: fit weight ~ w * height + b.
y = davis_train['weight']
x = davis_train['height'].to_numpy()

# Starting point and learning rate.
w, b = 1, 0
alpha = 0.0001
old_w = []  # (w, b) snapshots, one every 500 iterations

max_norm = 5  # upper bound on the Euclidean norm of the gradient
for step in range(10000):
    # Mean squared error at the current parameters (not used by the update).
    l = ((1/len(x)) * np.sum((y - w * x - b)**2))

    # Gradient of half the mean squared error with respect to w and b.
    residual = (w * x + b - y)
    d_w = np.mean(residual * x)
    d_b = np.mean(residual)

    # Clip: shrink the gradient so its norm never exceeds max_norm.
    norm = np.sqrt(d_w**2 + d_b**2)
    if norm > max_norm:
        shrink = norm / max_norm
        d_w, d_b = d_w / shrink, d_b / shrink

    w -= alpha * d_w
    b -= alpha * d_b

    if not step % 500:
        old_w.append(np.array((w, b)))

print(w, b)

In [None]:
# Same gradient descent, but on standardised (zero-mean, unit-std) data.
weight_mean, weight_std = davis_train.weight.mean(), davis_train.weight.std()
height_mean, height_std = davis_train.height.mean(), davis_train.height.std()

y_norm = (davis_train.weight - weight_mean) / weight_std
x_norm = (davis_train.height - height_mean) / height_std

w_norm, b_norm = 1, 0
old_w_norm = []  # (w, b) snapshots in normalised space, one every 500 iterations

max_norm = 5  # upper bound on the Euclidean norm of the gradient
for step in range(10000):
    # Mean squared error at the current parameters (not used by the update).
    l_norm = ((1/len(x_norm)) * np.sum((y_norm - w_norm * x_norm - b_norm)**2))

    # Gradient of half the mean squared error with respect to w and b.
    residual = (w_norm * x_norm + b_norm - y_norm)
    d_w = np.mean(residual * x_norm)
    d_b = np.mean(residual)

    # Clip: shrink the gradient so its norm never exceeds max_norm.
    norm = np.sqrt(d_w**2 + d_b**2)
    if norm > max_norm:
        shrink = norm / max_norm
        d_w, d_b = d_w / shrink, d_b / shrink

    # `alpha` is the learning rate defined in the previous gradient-descent cell.
    w_norm -= alpha * d_w
    b_norm -= alpha * d_b

    if not step % 500:
        old_w_norm.append(np.array((w_norm, b_norm)))

# Map the parameters back to the original units. The slope must be converted
# first because the intercept formula uses the already-converted slope.
w_norm = w_norm * weight_std / height_std
b_norm = weight_mean + b_norm * weight_std - w_norm * height_mean

print(w_norm, b_norm)

In [None]:
# Compare gradient descent with linear regression
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()

# np.float was deprecated in NumPy 1.20 and removed in 1.24 (AttributeError);
# np.float64 is the explicit equivalent.
x_train = davis_train[["height"]].values.astype(np.float64)
y_train = davis_train["weight"].values.astype(np.float64)

lreg.fit(x_train, y_train)

# Evaluate all three fitted lines on the same 30 evenly spaced heights.
x_to_pred = np.linspace(x_train.min(), x_train.max(),30).reshape(-1,1)
lreg_pred = lreg.predict(x_to_pred)
# w, b come from the raw gradient-descent cell; w_norm, b_norm from the
# normalised one (already converted back to original units there).
grad_pred = w * x_to_pred + b
grad_pred_norm = w_norm * x_to_pred + b_norm

plt.scatter(davis_train["height"], davis_train["weight"])
plt.plot(x_to_pred[:,0], lreg_pred,label="Linear regression")
plt.plot(x_to_pred[:,0], grad_pred, label="Gradient descent")
plt.plot(x_to_pred[:,0], grad_pred_norm, label="Gradient descent normalized")
plt.xlabel("Height")
plt.ylabel("Weight")
plt.legend()
plt.show()

In [None]:
# Plot the path of the raw gradient descent on top of the loss surface.
all_ws = np.array(old_w)
ww,bb = np.mgrid[0:1.2:100j, -0.25:0.05:100j]
heights = davis_train['height'].values.astype(np.float64)
weights = davis_train['weight'].values.astype(np.float64)

# Mean squared error at every (w, b) grid point, obtained by broadcasting the
# grid against the samples. This deliberately avoids loop variables named
# `w` / `b`: they would overwrite the fitted parameters from the
# gradient-descent cell and corrupt any later cell that reads them.
grid_residuals = weights - ww[..., np.newaxis] * heights - bb[..., np.newaxis]
L = np.mean(grid_residuals**2, axis=-1).ravel()

plt.figure(figsize=(10, 8))
plt.title("Contour Plot of Gradient Descent")
plt.xlabel("w")
plt.ylabel("b")
plt.contourf(ww,bb,L.reshape(ww.shape),levels=50)

# One red arrow between each pair of consecutive (w, b) snapshots.
for start, end in zip(all_ws[:-1], all_ws[1:]):
    plt.annotate('', xy=end, xytext=start,
                 arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 1},
                 va='center', ha='center')
plt.show()