In [None]:
# Here we import our data and view it
%matplotlib notebook
import pandas as pd
# Load the dataset from 'brainhead.csv' (a path relative to the notebook's
# working directory). Later cells read the columns 'Head size (cm^3)',
# 'Brain weight (g)' and 'Gender (1 = M, 2 = F)' from this frame.
df = pd.read_csv('brainhead.csv')
# Show the loaded frame as plain text so the data can be inspected.
print(df)

In [None]:
# Build the regression inputs x and y from the 'head size' and 'brain weight' columns.

import numpy as np

# The regression method we call below wants its predictors as a 2-D array:
# one row per observation, one column per predictor. With a single predictor
# that makes x an (n, 1) column vector, while y stays a flat vector of length n.
# The 2-D layout pays off later, when each observation has several x values.

x = np.array(df['Head size (cm^3)'].values).reshape(-1, 1)
y = np.array(df['Brain weight (g)'].values)

print(x)
print(y)



In [None]:
# Fit the regression line a + bx to the x and y values extracted above.

from sklearn.linear_model import LinearRegression

# fit() hands back the fitted estimator, so we can construct and fit in one go.
model = LinearRegression().fit(x, y)

# a is the intercept of the fitted line; b holds the fitted slope
# (stored by sklearn as an array, one entry per column of x).
a, b = model.intercept_, model.coef_

print('intercept:', a)
print('slope:', b)

In [None]:
# Goodness-of-fit checks for the fitted line: the coefficient of
# determination r^2 and the standard error of the estimate s_yx.
import math

# r^2 is easy as sklearn has a built-in method to find it: score() evaluates
# the fitted model on the data it is given. We round to 3 decimal places
# purely for display.
r2 = round(model.score(x, y),3)
print('coefficient of determination:', r2)

# There's no built-in method for s_yx as far as I'm aware, but we can easily write our own.

def findSE(x_values, y_values, a, b, sf=3):
    """Return the standard error of the estimate s_yx for the line a + b*x.

    s_yx = sqrt(Sr / (n - 2)), where Sr is the sum of squared residuals
    (y_i - a - b*x_i)**2 and n is the number of data points.

    Parameters
    ----------
    x_values : sequence or array of n numbers; an (n, 1) column vector
        (the layout passed to sklearn's fit) is accepted and flattened.
    y_values : sequence or array of n numbers.
    a : intercept of the line (a number or a one-element array).
    b : slope of the line (a number or a one-element array, such as the
        coef_ attribute of a single-predictor sklearn model).
    sf : number of decimal places the result is rounded to (default 3).

    Returns
    -------
    float
        The rounded standard error of the estimate.

    Raises
    ------
    ValueError
        If x and y differ in length, if there are fewer than three points
        (n - 2 degrees of freedom must be positive), or if a or b is not a
        single value.
    """
    # Flatten everything up front so that column vectors and one-element
    # coefficient arrays reduce to plain 1-D data. Without this the sum of
    # squares ends up as a one-element array and math.sqrt has to rely on
    # implicit array-to-scalar conversion, which recent NumPy deprecates.
    xs = np.asarray(x_values, dtype=float).ravel()
    ys = np.asarray(y_values, dtype=float).ravel()
    intercept = np.asarray(a, dtype=float).ravel()
    slope = np.asarray(b, dtype=float).ravel()

    if intercept.size != 1 or slope.size != 1:
        raise ValueError('a and b must each be a single value')
    if xs.size != ys.size:
        raise ValueError('x_values and y_values must contain the same number of points')
    n = ys.size
    if n <= 2:
        raise ValueError('at least three data points are needed to compute s_yx')

    residuals = ys - intercept[0] - slope[0] * xs
    Sr = float(np.dot(residuals, residuals))
    return round(math.sqrt(Sr / (n - 2)), sf)

# Standard error of the estimate for the line fitted to the full dataset,
# using the x, y data and the intercept a and slope b obtained above.
s_yx = findSE(x,y,a,b)
print('standard error of estimate:', s_yx)
    

In [None]:
# Scatter plot of the data with the fitted regression line drawn on top.

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(x, y, c='orange')

# Evaluate the fitted line a + b*x at 100 evenly spaced head sizes
x_line = np.linspace(2700, 4800, 100)
y_line = a + b*x_line
ax.plot(x_line, y_line)

ax.set(ylabel='Brain weight', xlabel='Head size')
plt.show()

In [None]:
# Our data is divided into male/female and into two categories based on age.
# A trend found by a regression analysis on pooled data can disappear, or even
# reverse, once the data is broken down into subcategories.
# You can google 'Simpson's paradox' to get more information about this.
# Here we perform our regression analysis separately for men and women.

# Each row of temp_x is (gender code, head size). Every x sample is wrapped in
# its own one-element list so the result has one row per person, which is the
# layout fit() is given elsewhere in this notebook. Any code other than 1 is
# treated as female.
temp_x = np.array(df[['Gender (1 = M, 2 = F)','Head size (cm^3)']].values)
M_x = [[head_size] for gender, head_size in temp_x if gender == 1]
F_x = [[head_size] for gender, head_size in temp_x if gender != 1]

# Each row of temp_y is (gender code, brain weight); the y samples stay flat.
temp_y = np.array(df[['Gender (1 = M, 2 = F)','Brain weight (g)']].values)
M_y = [brain_weight for gender, brain_weight in temp_y if gender == 1]
F_y = [brain_weight for gender, brain_weight in temp_y if gender != 1]

# Regression for men only
model_M = LinearRegression().fit(M_x, M_y)
a_M, b_M = model_M.intercept_, model_M.coef_

print('intercept for men:', a_M)
print('slope for men:', b_M)

r2_M = round(model_M.score(M_x, M_y), 3)
print('coefficient of determination for men:', r2_M)

# Regression for women only
model_F = LinearRegression().fit(F_x, F_y)
a_F, b_F = model_F.intercept_, model_F.coef_

print('intercept for women:', a_F)
print('slope for women:', b_F)

r2_F = round(model_F.score(F_x, F_y), 3)
print('coefficient of determination for women:', r2_F)

In [None]:
# Notice how the r^2 values for men and women are both smaller than the
# r^2 value calculated for the combined data.
# Now we draw graphs for men and women separately, each with the regression
# line that was fitted to that group alone.

# Head sizes at which both regression lines are evaluated
x_line = np.linspace(2700,4800,100)

fig_M, ax_M = plt.subplots()
ax_M.scatter(M_x, M_y, c = 'orange')
# Use the men-only coefficients a_M, b_M. Plotting a + b*x here would draw the
# line fitted to the combined data, not the fit for this subgroup.
y_line = a_M + b_M*x_line
ax_M.plot(x_line, y_line)
ax_M.set_ylabel('Brain weight')
ax_M.set_xlabel('Head size')
ax_M.set_title('Men only')
plt.show()


fig_F, ax_F = plt.subplots()
ax_F.scatter(F_x, F_y, c = 'orange')
# Likewise, the women-only plot uses the women-only coefficients a_F, b_F.
y_line = a_F + b_F*x_line
ax_F.plot(x_line, y_line)
ax_F.set_ylabel('Brain weight')
ax_F.set_xlabel('Head size')
ax_F.set_title('Women only')
plt.show()

In [None]:
# Exercise: try to write a method that calculates the coefficients a,b
# for the regression line a + bx. If you do this correctly, this cell should produce as output
# the plot for the regression line for the full dataset (as in the output of cell 5).

# A reference implementation is given below so that the notebook runs from
# top to bottom; try writing your own version before reading it.
# The method takes an array of x values and an array of y values,
# and returns the coefficients a,b of the regression line.
def LR(x_values, y_values):
    """Return the least-squares coefficients (a, b) of the line a + b*x.

    Uses the closed-form solution for simple linear regression:

        b = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)
        a = mean(y) - b * mean(x)

    Parameters
    ----------
    x_values : sequence or array of n numbers (an (n, 1) column is flattened).
    y_values : sequence or array of n numbers.

    Returns
    -------
    tuple of (float, float)
        The intercept a and the slope b.

    Raises
    ------
    ValueError
        If x and y differ in length, if fewer than two points are given, or
        if all x values are equal (the slope is then undefined).
    """
    xs = np.asarray(x_values, dtype=float).ravel()
    ys = np.asarray(y_values, dtype=float).ravel()

    if xs.size != ys.size:
        raise ValueError('x_values and y_values must contain the same number of points')
    if xs.size < 2:
        raise ValueError('at least two data points are needed to fit a line')

    x_mean = xs.mean()
    y_mean = ys.mean()
    # Centre the data first; working with deviations from the mean is
    # numerically better behaved than the raw sum(x*y) - n*mean(x)*mean(y) form.
    dx = xs - x_mean
    Sxx = float(np.dot(dx, dx))
    if Sxx == 0.0:
        raise ValueError('all x values are identical; the slope is undefined')

    b = float(np.dot(dx, ys - y_mean)) / Sxx
    a = float(y_mean - b * x_mean)
    return a, b

# Flat 1-D copies of the two columns, used as input to LR
x1 = np.array(df['Head size (cm^3)'].values)
y1 = np.array(df['Brain weight (g)'].values)

# Coefficients of the line a1 + b1*x as returned by LR
a1, b1 = LR(x1, y1)

fig1, ax1 = plt.subplots()
ax1.scatter(x1, y1, c='orange')

# Draw the line through 100 evenly spaced head sizes
x_line1 = np.linspace(2700, 4800, 100)
y_line1 = a1 + b1*x_line1
ax1.plot(x_line1, y_line1)

ax1.set(ylabel='Brain weight', xlabel='Head size')
plt.show()