In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

# Importing Libraries

In [None]:
import pandas as pd 
# Lift the row-display cap so displayed DataFrames/Series are never truncated.
pd.set_option('display.max_rows',None)

import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

#  Importing the data

In [None]:
# Load the head-size / brain-weight dataset and preview its first five rows.
HEADBRAIN_CSV = '../input/headbrain/headbrain.csv'
data = pd.read_csv(HEADBRAIN_CSV)
data.head(5)

# Segregating variables: Independent and Dependent variables

In [None]:
# Independent variable (head size), dependent variable (brain weight), sample count.
x, y = data['Head Size(cm^3)'], data['Brain Weight(grams)']
n = y.shape[0]

# Splitting the data into train set and test set

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split, and
# every number derived from it below, reproducible across Restart & Run All.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)
train_x.shape, train_y.shape

# Implementing Linear Regression -Manual

In [None]:
# Scatter the training split to eyeball the head-size / brain-weight relationship.
plt.scatter(x=train_x, y=train_y)
plt.xlabel(xlabel='Head Size(cm^3)')
plt.ylabel(ylabel='Brain Weight(grams)')
plt.show()

# Calculating m and c manually

In [None]:
# Closed-form least-squares fit on the training split:
#   m = sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x)**2)
#   c = mean_y - m * mean_x
mean_x = np.mean(train_x)
mean_y = np.mean(train_y)

# Deviations from the mean, computed once and reused for both sums.
dev_x = np.subtract(train_x, mean_x)
dev_y = np.subtract(train_y, mean_y)
num = np.dot(dev_x, dev_y)
denom = np.dot(dev_x, dev_x)

m = num / denom
c = mean_y - (m * mean_x)
print(m, c)

# Creating dummy dataset

In [None]:
# Evaluate the fitted line on an evenly spaced grid that extends 100 units past
# either end of the training x-range, so the line spans the whole scatter.
min_x = np.min(train_x) - 100
max_x = np.max(train_x) + 100
x_dummy = np.linspace(min_x, max_x, 1000)
y_dummy = m * x_dummy + c

plt.scatter(train_x, train_y, color='g')
plt.plot(x_dummy, y_dummy, color='r')
plt.title('Simple Linear Regression')
plt.xlabel('Head size cm^3')
plt.ylabel('Brain weight in grams')
# Render explicitly, like the other plotting cells, so the cell does not end on a
# bare Text(...) repr returned by plt.ylabel().
plt.show()

# Calculating R Square

In [None]:
# R^2 on the training split, computed as explained / total sum of squares.
# For a least-squares line with an intercept fitted on this same data, that ratio
# equals the usual 1 - SS_res / SS_tot.
# Vectorised: one array expression replaces the element-by-element Python loop.
y_pred = m * train_x + c
sum_pred = np.sum((y_pred - mean_y) ** 2)   # explained sum of squares
sum_act = np.sum((train_y - mean_y) ** 2)   # total sum of squares

r2 = sum_pred / sum_act
print(r2)

# Here we can observe that R² > 0.5, so the model fits the data reasonably well

In [None]:
def predict(x):
    """Return the fitted line's prediction for head size ``x``.

    Reads the module-level slope ``m`` and intercept ``c`` at call time, so the
    result reflects whichever fit assigned them most recently.
    """
    slope_term = m * x
    return slope_term + c

print(predict(4177))

# **Implementing Linear Regression - Library**

**Segregating variables: Independent and Dependent variables**

In [None]:
# Pull both columns out as NumPy arrays and reshape them into (n_samples, 1)
# column vectors; -1 lets NumPy infer the row count.
x = data['Head Size(cm^3)'].values.reshape(-1, 1)
y = data['Brain Weight(grams)'].values.reshape(-1, 1)
n = len(y)

# Fixed random_state keeps the 70/30 split, and the metrics computed from it,
# reproducible from run to run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

train_x.shape, train_y.shape

# Building model for training dataset

In [None]:
from sklearn import linear_model

# Ordinary least squares. The former `normalize=True` argument is omitted: it was
# deprecated in scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError),
# and for plain OLS it does not change the fitted predictions.
reg = linear_model.LinearRegression()
reg.fit(train_x, train_y)  # expects 2-D arrays


# **Predicting for testing dataset**

In [None]:
# Model predictions for the held-out test inputs.
y_predict = reg.predict(X=test_x)

# Comparing actual and predicted values

In [None]:
# Side-by-side table of true vs. predicted values for the test split
# (both arrays flattened from column vectors to 1-D).
comparison = {
    'Actual': test_y.flatten(),
    'Predicted': y_predict.flatten(),
}
df = pd.DataFrame(comparison)
df.head(5)

**Visualizing test dataset**

In [None]:
# Test points (green) with the model's predictions for them drawn as a red line.
plt.scatter(test_x, test_y, color='g')
plt.plot(test_x, y_predict, color='r')
plt.title('Simple Linear Regression')
plt.xlabel('Head size cm^3')
plt.ylabel('Brain weight in grams')
# Render explicitly, like the other plotting cells, so the cell does not end on a
# bare Text(...) repr returned by plt.ylabel().
plt.show()

# Calculating Mean Absolute Error

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out split (lower is better).
# NOTE: despite its name, `accuracy2` holds an error value, not an accuracy.
accuracy2 = mean_absolute_error(y_true=test_y, y_pred=y_predict)
accuracy2

# Calculating R Square

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test split.
accuracy = r2_score(y_true=test_y, y_pred=y_predict)
print(accuracy)

# Slope(s) and intercept learned by the fitted estimator.
weights, intercept = reg.coef_, reg.intercept_
print(weights, intercept)

# **Calculating Linear Regression Using Gradient Descent**

**Collecting Data**

In [None]:
# Rebind x / y to the full, unsplit pandas columns for the gradient-descent section.
x, y = data['Head Size(cm^3)'], data['Brain Weight(grams)']
x.shape, y.shape

In [None]:
def gradient_descent(x, y, m, c, alpha, iterations, n):
    """Fit ``y ~= m*x + c`` by batch gradient descent on the mean squared error.

    Parameters
    ----------
    x, y : array-like
        Inputs and targets; combined element-wise as ``m*x + c`` and ``y - y_guess``.
    m, c : float
        Starting slope and intercept.
    alpha : float
        Learning rate (step size applied to each gradient).
    iterations : int
        Number of update steps to perform.
    n : int
        Divisor used for the cost and both gradients (the sample count).

    Returns
    -------
    tuple
        ``(m, c, costs)``: the final slope, the final intercept, and a list with
        one cost per iteration, each measured before that iteration's update.
    """
    # Keep the history local: the function no longer needs a pre-existing global
    # `costs` list, and repeated calls do not pile up earlier runs' values.
    costs = []

    for _ in range(iterations):
        y_guess = m*x + c  # current predictions
        residual = y - y_guess
        cost = 1/n * np.sum(residual**2)  # mean squared error
        D_m = (-2/n) * np.sum(x * residual)  # d(cost)/dm
        D_c = (-2/n) * np.sum(residual)  # d(cost)/dc
        m = m - alpha * D_m
        c = c - alpha * D_c
        costs.append(cost)
    return m, c, costs

In [None]:
# Hyper-parameters and starting point for gradient descent.
alpha = 9e-9      # learning rate
iterations = 30   # number of update steps
n = len(x)        # sample count
m, c = 0, 0       # start from the zero line
costs = []        # cost history; rebound below to the list gradient_descent returns

m, c, costs = gradient_descent(x, y, m, c, alpha, iterations, n)

# Debugging Theta

In [None]:
# One cost value is recorded per gradient-descent iteration, so the x-axis is the
# plain iteration index (not hundreds of iterations).
plt.plot(costs)
plt.ylabel('cost')
plt.xlabel('iterations')
plt.title('Cost reduction over time')
plt.show()

**It shows that the learning rate is neither too small nor too large, and that after some iterations m and c stay essentially constant.**

# Visualizing the Output

In [None]:

# Predictions of the gradient-descent fit for every sample.
y_guess = m*x + c

plt.scatter(x, y)
plt.xlabel('Head Size(cm^3)')
plt.ylabel('Brain Weight(grams)')
# Draw the fitted line itself by evaluating m*x + c at both ends of the x-range.
# (Joining (min x, min y) to (max x, max y) would not depend on m or c at all.)
x_ends = np.array([min(x), max(x)])
plt.plot(x_ends, m*x_ends + c, color='red')
plt.show()

# **Calculating R Square**

In [None]:
from sklearn.metrics import r2_score

# R^2 of the gradient-descent predictions against the full dataset.
accuracy = r2_score(y_true=y, y_pred=y_guess)
print(accuracy)

R² > 0.5, so we can go with our model.

# Predicting Output

In [None]:
def predict(x_):
    """Return the fitted line's prediction for head size ``x_``.

    Reads the module-level slope ``m`` and intercept ``c`` at call time. This
    definition replaces the earlier ``predict`` defined in this notebook.
    """
    slope_term = m * x_
    return slope_term + c

In [None]:
# Predicted value for a head size of 4747.
predicted_weight = predict(4747)
print(predicted_weight)

In [None]:
# Show the dataset row(s) with that exact head size, for comparison with the prediction.
data.loc[data['Head Size(cm^3)'] == 4747]