# Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Create a random linear function

In [2]:
def create_random_linear_function(feature_size = 2, sample_size = 700, seed = None):
    """Generate a random linear function and a noise-free dataset sampled from it.

    The function has the form ``y = theta[0] + theta[1]*x1 + ... + theta[k]*xk``
    with ``k == feature_size``.

    Parameters
    ----------
    feature_size : int
        Number of input features (columns ``x1`` .. ``xk``).
    sample_size : int
        Number of rows to generate.
    seed : int or None
        When given, the draws come from a private ``np.random.RandomState(seed)``
        so the result is reproducible. When ``None`` (the default) the draws
        come from NumPy's global generator.

    Returns
    -------
    theta : numpy.ndarray
        ``feature_size + 1`` integer coefficients drawn from [-5, 5);
        ``theta[0]`` is the intercept.
    df : pandas.DataFrame
        Columns ``x1`` .. ``xk`` holding integers drawn from [1, 10), followed
        by the target column ``y``.
    """
    # np.random and RandomState expose the same randint API, so one code path
    # serves both the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    theta = rng.randint(-5, 5, feature_size+1)
    X = rng.randint(1, 10, sample_size*feature_size).reshape((sample_size, feature_size))

    df = pd.DataFrame(X, columns = ['x{}'.format(i+1) for i in range(feature_size)])

    # Intercept plus weighted features, computed straight from X; this gives the
    # same values as multiplying a [1, x1, ..., xk] design matrix by theta,
    # without adding and later deleting a helper column.
    df['y'] = theta[0] + np.dot(X, theta[1:])

    return theta, df

# Write the dataframe and parameters to files (CSV and pickle)

In [3]:
# Draw a random linear model together with a dataset sampled from it.
theta, df = create_random_linear_function()

# Persist the samples without the index column (comma is already the default separator).
df.to_csv('linear_file.csv', index=False)

# Store the true coefficients next to the data so they can be reloaded later for comparison.
with open('linear_file_parameters.pkl', 'wb') as f:
    pickle.dump(theta, f)

# Read the csv and extract features

In [4]:
# Load the samples back from disk.
df = pd.read_csv('linear_file.csv')
n = df.columns.size

# Model inputs: the bias column first, then every CSV column except the last one
# (the last CSV column is used as the target 'y' further down).
features = ['ones'] + list(df.columns[:n-1])

# Constant column of ones so that theta[0] acts as the intercept.
df['ones'] = 1

# Create the cost function and gradient

In [5]:
def cost(X, Y, theta):
    """Return the sum of squared residuals of the linear model ``X . theta`` against ``Y``.

    No averaging or 1/2 factor is applied: the value is the plain sum of
    ``(X . theta - Y) ** 2`` over all rows.
    """
    residuals = np.dot(X, theta) - Y
    return np.sum(np.power(residuals, 2))

def gradient(X, Y, theta):
    """Return ``X^T . (X . theta - Y)``, one entry per coefficient in ``theta``.

    This is the gradient of ``0.5 * sum((X . theta - Y) ** 2)`` with respect to
    ``theta``; no 1/m scaling is applied here.
    """
    residuals = np.dot(X, theta) - Y
    return np.dot(X.T, residuals)

# Create gradient descent

In [6]:
def gradient_descent(X, Y, alpha = 0.03, iterations=10000, epsilon=1e-7, cost_function = cost, gradient_function = gradient):
    """Fit ``theta`` by batch gradient descent from a random starting point.

    Parameters
    ----------
    X : two-dimensional array-like exposing ``.shape`` (rows are samples).
    Y : targets, passed unchanged to ``cost_function`` / ``gradient_function``.
    alpha : learning rate; each step moves by ``alpha / m`` times the gradient,
        where ``m`` is the number of rows of ``X``.
    iterations : maximum number of parameter updates.
    epsilon : stop as soon as two consecutive cost values differ by at most this.
    cost_function, gradient_function : callables taking ``(X, Y, theta)``.

    Returns
    -------
    (converged, updates, J, theta)
        ``converged`` tells whether the epsilon test fired, ``updates`` is the
        number of parameter updates performed, ``J`` is the list of recorded
        costs (``J[0]`` is a placeholder ``0``, so the very first comparison is
        made against zero), and ``theta`` is the final parameter vector.
    """
    sample_count, feature_count = X.shape
    theta = np.random.randn(feature_count)  # standard-normal initial guess

    J = [0]
    step = alpha * 1/sample_count  # constant per-update scale factor
    converged = False
    updates = 0

    while updates < iterations:
        J.append(cost_function(X, Y, theta))
        # Compare the newest cost with the one recorded just before it.
        if abs(J[-1] - J[-2]) <= epsilon:
            converged = True
            break
        theta = theta - step * gradient_function(X, Y, theta)
        updates += 1

    return (converged, updates, J, theta)


# Divide data into training and testing

In [7]:
def train_test(data, split=(6, 1)):
    """Split ``data`` positionally into a leading training part and a trailing test part.

    ``split`` is a ``(train, test)`` ratio; the first
    ``rows * train // (train + test)`` rows form the training part and the
    remaining rows form the test part. Rows are not shuffled.
    """
    train_parts, test_parts = split
    boundary = data.shape[0] * train_parts // (train_parts + test_parts)
    return data[:boundary], data[boundary:]

# Run gradient descent on the data

In [12]:
# Apply the same positional split to inputs and targets so the rows stay aligned.
X_train, X_test = train_test(df[features])
Y_train, Y_test = train_test(df['y'])

# Fit with the default hyper-parameters.
converged, iterations, J, theta = gradient_descent(X_train, Y_train)
if converged:
    result = "Converged in {} iterations".format(iterations)
else:
    result = "Did not converge in {} iterations".format(iterations)

# Reload the true coefficients for comparison.
# NOTE(review): pickle.load executes arbitrary code from the file; this is only
# safe because the file is expected to be the one written earlier in this notebook.
theta_file = None
with open('linear_file_parameters.pkl', 'rb') as f:
    theta_file = pickle.load(f)

print(result)
print('Predicted theta:', theta)
print('Actual theta   :', theta_file)

Converged in 2747 iterations
Predicted theta: [-3.99955663  1.99996012  0.99996135]
Actual theta   : [-4  2  1]


# Compare predicted and actual data

In [13]:
# Predictions of the fitted model on the held-out rows.
Y_pred = np.dot(X_test, theta)

# Mean squared error over the test rows.
residuals = Y_pred - Y_test
mse = np.sum(np.power(residuals, 2))/Y_pred.size

# Side-by-side table, re-indexed from 0, for eyeballing individual predictions.
compare_df = pd.DataFrame({'Predicted Y': Y_pred, 'Actual Y': np.asarray(Y_test)})

print('Mean squared error: ', mse)
compare_df.head()

Mean squared error:  2.5424207153184955e-08


Unnamed: 0,Predicted Y,Actual Y
0,17.99993,18
1,1.000288,1
2,13.999972,14
3,9.000091,9
4,15.000008,15
