In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
import math
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')

In [None]:
# Load merc.csv from the Kaggle used-car dataset into a DataFrame.
df = pd.read_csv('/kaggle/input/used-car-dataset-ford-and-mercedes/merc.csv')

In [None]:
# Preview the first few rows to see which columns are available.
df.head()

In [None]:
# Count how many listings there are for each fuel type.
df['fuelType'].value_counts()

# What are Linear Least Squares?

1. Linear least squares is a method for estimating the slope and the intercept of the straight line that best describes the relationship between two variables.
2. It is similar to finding the correlation. However, correlation only measures the strength and the sign of the relationship, whereas this method also gives us the slope of the line.
3. Using least squares, we first estimate the slope as "***slope = Cov(x, y) / Var(x)***" and then the intercept from the means of both variables, using the fact that the fitted line passes through the point of means: "***mean(y) = mean(x) * slope + inter***".
4. We then plug the sorted values of the variable 'x' into the line equation to find the corresponding fitted values of 'y'. The 'FitLine' function does the job.

In [None]:
# Keep only the petrol cars. The explicit .copy() makes `petrol` an independent
# DataFrame, so adding a column to it later does not trigger pandas'
# SettingWithCopyWarning and can never write through to `df`.
petrol = df[df.fuelType == 'Petrol'].copy()

In [None]:
def LeastSquares(xs, ys):
    """Fit the least-squares line ``ys ~ inter + slope * xs``.

    Parameters
    ----------
    xs, ys : array-like
        Paired observations of the predictor and the response.

    Returns
    -------
    tuple
        ``(inter, slope)``: intercept and slope of the fitted line.

    Raises
    ------
    ValueError
        If ``xs`` has exactly zero variance, in which case the slope is
        undefined.
    """
    meanx, varx = np.mean(xs), np.var(xs)
    meany = np.mean(ys)

    # A constant predictor would turn the division below into 0/0 and silently
    # return NaN for both coefficients, so fail loudly instead.
    if varx == 0:
        raise ValueError('xs has zero variance; the slope is undefined')

    # slope = Cov(x, y) / Var(x); the intercept follows from the fact that the
    # fitted line passes through the point (meanx, meany).
    slope = Cov(xs, ys, meanx, meany) / varx
    inter = meany - slope * meanx

    return inter, slope

## Cov denotes covariance
def Cov(xs, ys, meanx=None, meany=None):
    """Return the covariance of ``xs`` and ``ys``, normalised by ``len(xs)``.

    ``meanx`` / ``meany`` may be passed in when the caller has already
    computed them; any that is left as ``None`` is computed here.
    """
    x_arr, y_arr = np.asarray(xs), np.asarray(ys)

    mu_x = np.mean(x_arr) if meanx is None else meanx
    mu_y = np.mean(y_arr) if meany is None else meany

    # Sum of the products of the deviations, divided by the number of points.
    x_dev = x_arr - mu_x
    y_dev = y_arr - mu_y
    return np.dot(x_dev, y_dev) / len(x_arr)

In [None]:
def FitLine(xs, inter, slope):
    """Evaluate the line ``inter + slope * x`` at the sorted values of ``xs``.

    Returns the sorted x values together with the matching fitted y values,
    ready to be drawn as a line.
    """
    ordered = np.sort(xs)
    fitted = slope * ordered + inter
    return ordered, fitted

In [None]:
# Print a concise summary of the full DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Pull out the two columns used in the regression: the response (price)
# and the predictor (mileage).
price, mileage = petrol['price'], petrol['mileage']

In [None]:
# Fit price as a linear function of mileage, then evaluate the fitted line
# at the sorted mileage values so it can be plotted.
inter, slope = LeastSquares(xs=mileage, ys=price)
fit_xs, fit_ys = FitLine(xs=mileage, inter=inter, slope=slope)

In [None]:
# Report the fitted intercept and slope.
fit_summary = 'inter is {} and slope is {}'.format(inter, slope)
print(fit_summary)

In [None]:
# Scatter the observed (mileage, price) pairs and overlay the fitted line.
# The explicit fig/ax interface keeps all settings attached to this one figure.
fig, ax = plt.subplots(figsize=(15, 8))

ax.scatter(mileage, price, color='green', s=10, label='Petrol cars')
ax.plot(fit_xs, fit_ys, color='black', linewidth=2, label='Least-squares fit')

# A title and legend let the figure stand on its own when the notebook is skimmed.
ax.set_title('Price vs. mileage for petrol cars')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
ax.legend()

# plt.show() renders the figure and keeps the last artist's repr out of the cell output.
plt.show()

# What are residuals?

1. Looking at the graph above, it would be right to say that most of the points lie either above the line or below it, rather than on it. And it was supposed to be so. Why? Because the line is not drawn through the actual values of the variable 'ys'; instead we estimated some 'fit_ys' from the original values of 'xs' and drew the line through those.

***Why do we do that?***

1. Because the original values do not fall on a single straight line, so no straight line could be drawn through all of them. The purpose of the fitted line is to summarize the linear relationship between the two variables.

That said, if we were to connect all those points, the structure would be anything but a straight line. The vertical deviations of the actual points (the green points in the above figure) from the line, i.e. the actual 'ys' minus the fitted 'ys', are called residuals.

***Why are residuals important?***

1. Residuals give an overall picture of how good our line is.
2. The standard deviation of the residuals is the root mean square error (RMSE) we make when we use mileage to predict the price.
3. If the standard deviation of the residuals is not noticeably smaller than the standard deviation of the 'ys' variable, we say that using 'xs' as a feature to predict the values of 'ys' does not make a difference, and that we shouldn't use the feature. The smaller the residual standard deviation is relative to that of 'ys', the more useful the feature.
4. Lastly, one of the primary concerns of regression analysis is to ***minimize the value of*** `sum(res ** 2)`, the sum of the squared residuals.

In [None]:
def Residuals(xs, ys, inter, slope):
    """Return the residuals of the line ``inter + slope * xs`` against ``ys``.

    Each residual is the actual y value minus the value the line predicts
    for the matching x.
    """
    x_arr, y_arr = np.asarray(xs), np.asarray(ys)
    predicted = inter + slope * x_arr
    return y_arr - predicted

In [None]:
# Compute each car's residual (actual price minus fitted price) and store it
# alongside the petrol data.
res = Residuals(xs=mileage, ys=price, inter=inter, slope=slope)
petrol['residual'] = res

In [None]:
# Sum of squared residuals. np.sum is vectorized, whereas the builtin sum()
# iterates over the array element by element in Python.
np.sum(res ** 2)

In [None]:
# Summary statistics of the residuals.
petrol['residual'].describe()

In [None]:
# Summary statistics of the raw prices, for comparison with the residuals.
petrol['price'].describe()

In [None]:
# Spread of the residuals (error when mileage is used) versus spread of the
# raw prices (error when mileage is ignored).
rmse_with_mileage = np.std(petrol.residual)
rmse_without_mileage = np.std(petrol.price)

print('RMSE if we use mileage to predict the price: {}'.format(rmse_with_mileage))
print('RMSE if we do not use mileage to predict the price: {}'.format(rmse_without_mileage))
print('\n')
print('difference in both RMSEs: {}'.format(rmse_without_mileage - rmse_with_mileage))

***What does the output tell us?***

The RMSE is lower when we use 'mileage' to predict the price of a vehicle than when we do not, and the difference between the two is sizeable. Therefore, we will use mileage to predict the prices.

# Goodness of a fit:

1. Now that we have plotted a line by fitting the values of mileage and price, we need to find out how good our line is in predicting the price using the mileage.
2. To demonstrate the goodness of our fit, we will use the "coefficient of determination" or 'rho square'.
3. The formula for 'rho square' is demonstrated in the code cell below.

In [None]:
# Coefficient of determination: 1 - Var(residuals) / Var(price).
residual_var = np.var(petrol.residual)
price_var = np.var(petrol.price)
r_squared = 1 - residual_var / price_var
print(r_squared)

***What does the output suggest?***

The coefficient of determination is almost 0.17, which suggests that mileage explains almost 17% of the variance in prices.