## Handling The Data

In [1]:
#Importing the packages needed
from random import seed
from random import randrange
from csv import reader
from math import sqrt

In [2]:
# Function to Load a CSV file
def load_csv(filename):
    """Load a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-blank CSV record. Each entry is the
        list of string fields produced by ``csv.reader``; no type conversion
        is performed here.
    """
    dataset = list()
    with open(filename, 'r') as file:
        for row in reader(file):
            # csv.reader yields an empty list for a blank line (for example a
            # trailing newline at the end of the file). Such rows have no
            # columns and would break per-column processing, so drop them.
            if not row:
                continue
            dataset.append(row)
    return dataset

In [3]:
# #testing the function to load the csv file
# filename = 'insurance.csv'
# dataset = load_csv(filename)

In [4]:
# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: Iterable of mutable rows (e.g. a list of lists) whose
            ``column`` entry is a string.
        column: Index of the field to convert.

    Surrounding whitespace is stripped before conversion. Nothing is
    returned; the rows themselves are modified.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

## Train Test Split

In [5]:
# Split a dataset into a train and test set; `split` is the fraction of rows placed in the training set (e.g. 0.6 gives 60% train / 40% test)
def train_test_split(dataset, split):
    """Randomly partition a dataset into a training set and a test set.

    Rows are drawn without replacement (using ``randrange``) until the
    training set holds at least ``split * len(dataset)`` rows; whatever is
    left becomes the test set. The draw works on a shallow copy, so the
    caller's list is not modified.

    Args:
        dataset: List of rows to partition.
        split: Fraction of the rows to place in the training set, e.g. 0.6.

    Returns:
        A ``(train, test)`` tuple of lists.

    Raises:
        ValueError: If ``split`` asks for more training rows than the
            dataset contains.
    """
    train = list()
    dataset_copy = list(dataset)
    train_size = split * len(dataset_copy)
    # Without this check the loop below would drain the copy and then fail
    # inside randrange() with an unhelpful "empty range" ValueError.
    if train_size > len(dataset_copy):
        raise ValueError(
            'split=%r requests more training rows than the dataset has (%d)'
            % (split, len(dataset_copy))
        )
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

## 1. Calculate Mean And Variance

In [6]:
# Estimate Mean and Variance
 
# Calculate the mean value of a list of numbers
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    Raises ZeroDivisionError when ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count
 
# Calculate the sum of squared deviations of a list of numbers from their mean (the variance numerator; it is not divided by the count)
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Note that the total is not divided by the number of values, so this is
    the numerator of the variance rather than the variance itself.

    Args:
        values: Iterable of numbers.
        mean: The value to measure deviations from.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

In [8]:
# # calculate mean and variance
# dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]

In [9]:
# dataset

In [10]:
# x = [row[0] for row in dataset]
# y = [row[1] for row in dataset]

In [11]:
# mean_x, mean_y = mean(x), mean(y)
# var_x, var_y = variance(x, mean_x), variance(y, mean_y)
# print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
# print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))

In [12]:
# x

## 2. Calculate Covariance

In [13]:
# Calculate the mean value of a list of numbers
def mean(values):
    """Average a sequence of numbers.

    The sum of ``values`` is divided by their count (as a float), so an
    empty sequence raises ZeroDivisionError.
    """
    numerator = sum(values)
    denominator = float(len(values))
    return numerator / denominator
 
# Calculate the sum of products of deviations of x and y from their means (the covariance numerator; it is not divided by the count)
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of deviations of ``x`` and ``y``.

    For each position ``i`` in ``x`` the product
    ``(x[i] - mean_x) * (y[i] - mean_y)`` is added to a running total. The
    total is not divided by the number of points.

    Args:
        x: Sequence of numbers.
        mean_x: Value that deviations of ``x`` are measured from.
        y: Sequence of numbers, indexed in step with ``x``.
        mean_y: Value that deviations of ``y`` are measured from.
    """
    total = 0.0
    for position, x_value in enumerate(x):
        # y is indexed rather than zipped so a too-short y raises IndexError.
        total += (x_value - mean_x) * (y[position] - mean_y)
    return total

In [14]:
# # calculate covariance
# dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
# x = [row[0] for row in dataset]
# y = [row[1] for row in dataset]

In [17]:
# print(x)
# print(y)

In [18]:
# mean_x, mean_y = mean(x), mean(y)
# covar = covariance(x, mean_x, y, mean_y)
# print('Covariance: %.3f' % (covar))

## 3. Estimate Coefficients

In [19]:
# Calculate coefficients
def coefficients(dataset):
    """Estimate the intercept and slope of a simple linear regression.

    The first field of each row is used as the input ``x`` and the second
    field as the output ``y``.

    Args:
        dataset: Iterable of rows with at least two numeric fields.

    Returns:
        A two-element list ``[b0, b1]`` where ``b1`` is
        ``covariance(x, y) / variance(x)`` and ``b0`` is
        ``mean(y) - b1 * mean(x)``.
    """
    inputs = []
    outputs = []
    for row in dataset:
        inputs.append(row[0])
        outputs.append(row[1])

    input_mean = mean(inputs)
    output_mean = mean(outputs)

    # Neither helper divides by the number of points, so the missing factor
    # cancels in this ratio.
    slope = covariance(inputs, input_mean, outputs, output_mean) / variance(inputs, input_mean)
    intercept = output_mean - slope * input_mean
    return [intercept, slope]

In [20]:
#  # calculate coefficients
# dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
# b0, b1 = coefficients(dataset)
# print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

## 4. Make Predictions - Model Evaluation

In [21]:
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict an output for each row of ``test``.

    Args:
        train: Rows passed to ``coefficients`` to estimate the line.
        test: Rows whose first field is the input value to predict for.

    Returns:
        A list with one prediction per test row, in the same order.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * row[0] for row in test]

In [28]:
# Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error between two sequences.

    Args:
        actual: Sequence of observed values.
        predicted: Sequence of predicted values, indexed in step with
            ``actual``.

    Returns:
        The square root of the mean of the squared differences.

    Raises ZeroDivisionError when ``actual`` is empty.
    """
    squared_total = 0.0
    for position, target in enumerate(actual):
        residual = predicted[position] - target
        squared_total += (residual ** 2)
    return sqrt(squared_total / float(len(actual)))

# Evaluate a regression algorithm using a train/test split: collect the predictions and the actual values for the test rows and pass them to the RMSE function
def evaluate_algorithm(dataset, algorithm, split, *args):
    """Score an algorithm with RMSE on a random train/test split.

    Args:
        dataset: Rows whose last field is the value to predict.
        algorithm: Callable invoked as ``algorithm(train, test_set, *args)``;
            its result is compared position by position with the held-out
            values.
        split: Passed to ``train_test_split`` to size the training set.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        The RMSE between the held-out values and the predictions.
    """
    train, test = train_test_split(dataset, split)

    # Give the algorithm copies of the test rows with the last field blanked
    # out, so it cannot read the value it is supposed to predict.
    masked_rows = []
    for row in test:
        masked = list(row)
        masked[-1] = None
        masked_rows.append(masked)

    predicted = algorithm(train, masked_rows, *args)
    actual = [row[-1] for row in test]
    return rmse_metric(actual, predicted)

In [23]:
# # Test simple linear regression
# dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
# rmse = evaluate_algorithm(dataset, simple_linear_regression, 0.6)
# print('RMSE: %.3f' % (rmse))

In [24]:
# #testing the above flow
# test_set = list()
# print(test_set)
# for row in dataset:
#         row_copy = list(row)
#         row_copy[-1] = None
#         test_set.append(row_copy)
# print(test_set)

In [25]:
# #testing the above flow
# dataset

In [29]:
# Running the Linear Regression Model
seed(1)  # fixed seed so the random train/test split is repeatable

# Load the CSV rows, then convert every column from text to float in place.
filename = 'insurance.csv'
dataset = load_csv(filename)
column_count = len(dataset[0])
for i in range(column_count):
    str_column_to_float(dataset, i)

# Score simple linear regression on a 60% train / 40% test split.
split = 0.6
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
print('RMSE: %.3f' % rmse)

RMSE: 33.630
