## Importing necessary libraries

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

## Linear Regression Implementation

In [52]:
# Initialises the weight matrix and bias value
# Weight matrix will be of the shape (n,1) where 'n' is the number of features

def initialize_weights(dim):
    """Create the starting parameters for the linear model.

    Parameters
    ----------
    dim : int
        Number of features.

    Returns
    -------
    tuple
        A zero-filled weight column of shape (dim, 1) and the bias 0.
    """
    weights = np.zeros(shape=(dim, 1))
    bias = 0
    return weights, bias


In [53]:
# Implements forward and backward propagation

# 'n' - Number of features
# 'm' - Number of samples/records

# W - Weight matrix (n,1)
# b - Bias value
# X - Input matrix - (n,m)
# Y_act - Actual target values - (1,m)
# Y_pred - Predicted target value - (1,m)

def propogate(W, b, X, Y_act):
    """Run one forward/backward pass of the linear model.

    Parameters
    ----------
    W : weight matrix, (n, 1)
    b : bias value
    X : input matrix, (n, m) - one column per sample
    Y_act : actual target values, (1, m)

    Returns
    -------
    grads : dict
        'w_grad' (gradient of the cost w.r.t. W) and 'b_grad' (w.r.t. b).
    cost : float
        Mean squared error of the current prediction.
    """
    num_samples = X.shape[1]

    # Forward pass: linear prediction for every sample
    Y_pred = np.dot(W.T, X) + b

    # Residuals are shared by the cost and by both gradients
    residual = Y_act - Y_pred

    # Cost: mean of the squared residuals
    cost = (1/num_samples)*np.sum(residual*residual)

    # Backward pass: derivatives of the cost w.r.t. W and b
    w_grad = (-2/num_samples)*np.matmul(X, residual.T)
    b_grad = (-2/num_samples)*np.sum(residual)

    return {'w_grad': w_grad, 'b_grad': b_grad}, cost

In [54]:
# Gradient Descent

# 'n' - Number of features
# 'm' - Number of samples/records

# X - (n,m)
# Y - (1,m)

def optimize(X, Y, num_iterations, learning_rate, print_cost=False):
    """Fit the linear model with batch gradient descent.

    Parameters
    ----------
    X : input matrix, (n, m) - 'n' features, 'm' samples
    Y : target values, (1, m)
    num_iterations : int
        Number of gradient-descent steps to run.
    learning_rate : float
        Step size applied to each gradient.
    print_cost : bool
        When True, print the cost every 100 iterations.

    Returns
    -------
    params : dict
        Final weights ('w') and bias ('b').
    grads : dict
        Gradients ('w_grad', 'b_grad') used in the last update; zeros when
        no iteration was run.
    cost : float or None
        Cost computed at the start of the last iteration (i.e. before the
        final update); None when no iteration was run.
    costs : list
        Cost recorded every 100 iterations.
    """
    n = X.shape[0]

    # Initializing the weight and cost matrix
    W, b = initialize_weights(n)
    costs = []

    # Defaults keep the return value well-defined when num_iterations <= 0
    # (previously this path raised UnboundLocalError).
    cost = None
    w_grad = np.zeros((n, 1))
    b_grad = 0.0

    for i in range(num_iterations):

        # Computing the gradient and cost function for ith iteration
        step_grads, cost = propogate(W, b, X, Y)
        w_grad = step_grads['w_grad']
        b_grad = step_grads['b_grad']

        # Updating the Weight and bias
        W = W - learning_rate * w_grad
        b = b - learning_rate * b_grad

        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print("Cost after iteration {}: {}".format(i, cost))

    # Built once after the loop instead of on every iteration
    params = {"w": W, "b": b}
    grads = {'w_grad': w_grad, 'b_grad': b_grad}

    return params, grads, cost, costs

### Simple Linear Regression using the implemented model for a dummy dataset

In [55]:
# Load the training data for the simple (single-feature) regression
df = pd.read_csv("../input/random-linear-regression/train.csv")
print("Shape of the dataframe: {}".format(df.shape))
print("Dataframe Info:")
# info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df.info()

In [56]:
# Keep only the rows that have a target value
has_target = df['y'].notnull()
df = df[has_target]
print("Shape of the dataframe: {}".format(df.shape))
df.isnull().sum()

In [57]:
# optimize() expects features as (n, m) and targets as (1, m); here n == 1
sl_x = df['x'].values.reshape((1, -1))
sl_y = df['y'].values.reshape((1, -1))
sl_params, sl_grads, sl_cost, sl_costs = optimize(sl_x, sl_y, 1000, 0.000001, True)

In [58]:
# Load the held-out data for the simple regression
test_df = pd.read_csv("../input/random-linear-regression/test.csv")
print("Shape of the dataframe: {}".format(test_df.shape))
print("Dataframe Info:")
# info() prints its report itself and returns None; print() around it
# only emitted an extra "None".
test_df.info()

In [59]:
# Slope and intercept learned by the simple model
a_1 = sl_params['w'][0][0]
a_0 = sl_params['b']
x_test = test_df['x']
y_test = test_df['y']

y_prediction = a_0 + a_1 * x_test
print('R2 Score:',r2_score(y_test,y_prediction))

# A straight line is fully determined by two points, so draw it between the
# smallest and largest observed x instead of over a hard-coded 0..99 range.
x_line = [x_test.min(), x_test.max()]
y_line = [a_0 + a_1 * x_value for x_value in x_line]

plt.figure(figsize=(10,10))
plt.scatter(x_test,y_test,color='red',label='GT')
plt.plot(x_line,y_line,color='black',label = 'pred')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

## Multiple Linear Regression using Car dataset

### Reading the dataset

In [60]:
# Load the car price dataset
cars = pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")
print("Shape of the dataframe: {}".format(cars.shape))
print("Dataframe Info:")
# info() prints its report itself and returns None; print() around it
# only emitted an extra "None".
cars.info()

### Data Cleaning

#### Removing 'car_ID' field
Since the 'car_ID' field is unique, it will not have impact on the price prediction. Hence removing the field

In [61]:
# Report how many distinct IDs exist, then discard the identifier column
unique_ids = cars['car_ID'].nunique()
print("Number of unique car_ID's: {}".format(unique_ids))
cars = cars.drop(columns=['car_ID'])
print("Shape (after removing 'car_ID'): {}".format(cars.shape))

#### Null value check

In [62]:
# Count the missing values in each column
cars.isnull().sum()

#### Datatype conversion

In [63]:
# Names of the columns stored with the 'object' datatype
object_cols = cars.select_dtypes(include='object').columns
object_cols

In [64]:
# Show the frequency of every category, one 'object' column at a time
separator = "-"*100
categorical_cols = cars.select_dtypes(include='object').columns
for col in categorical_cols:
    print(cars[col].value_counts())
    print(separator)

From the above list, the fields '**doornumber**' and '**cylindernumber**' can be converted to integer values.

In [65]:
# Spelled-out counts and the integers they stand for
door_words = {'four': 4, 'two': 2}
cylinder_words = {'four': 4, 'six':6, 'five':5, 'eight':8, 'three':3, 'twelve':12, 'two': 2}

cars['doornumber'] = cars['doornumber'].map(door_words)
print("After converting to integer")
print(cars['doornumber'].value_counts())
print("-"*50)

cars['cylindernumber'] = cars['cylindernumber'].map(cylinder_words)
print("After converting to integer")
print(cars['cylindernumber'].value_counts())

'**drivewheel**' has two values ('**4wd**' and '**fwd**') representing 'forward drive'.

In [66]:
#  Making 'drivewheel' values consistent
# NOTE(review): this relabels every '4wd' row as 'fwd', merging the two categories.
# In automotive usage '4wd' normally denotes four-wheel drive and 'fwd' front-wheel
# drive, so these may be distinct drivetrains - confirm the merge is intended.
cars.loc[cars['drivewheel'] == '4wd', 'drivewheel'] = 'fwd'
cars['drivewheel'].value_counts()

Extracting the '**carBrand**' info from the '**CarName**'

In [67]:
# The brand is the first space-separated token of the lower-cased car name
lowered_names = cars['CarName'].str.lower()
cars['carBrand'] = lowered_names.str.split(" ").str[0]
cars['carBrand'].value_counts().sort_index()

In [68]:
# (pattern, replacement, is_regex) for each misspelled brand, applied in order
brand_fixes = [
    ('maxda', 'mazda', False),
    ('porcshce', 'porsche', False),
    ('toyouta', 'toyota', False),
    ('vokswagen|vw', 'volkswagen', True),
]
for pattern, corrected, is_regex in brand_fixes:
    cars.loc[:, 'carBrand'] = cars['carBrand'].str.replace(pattern, corrected, regex=is_regex)
cars['carBrand'].value_counts().sort_index()

In [69]:
# Car names that occur more than once
name_counts = cars['CarName'].value_counts()
name_counts[name_counts > 1]

In [70]:
# 'CarName' is no longer needed once 'carBrand' has been derived from it
cars = cars.drop(columns=['CarName'])
cars.shape

### Target variable analysis

#### Checking for outliers

In [71]:
# Horizontal box plot of the target to expose extreme prices
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(cars['price'], vert=False)
ax.set_title('price')
plt.show()

#### Removing outliers

In [72]:
# Keep only the rows priced strictly below the 90th percentile
price_cutoff = cars['price'].quantile(.90)
below_cutoff = cars['price'] < price_cutoff
cars = cars[below_cutoff]
cars.shape

In [73]:
# Same box plot as before, now on the trimmed data
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(cars['price'], vert=False)
ax.set_title('price')
plt.show()

### Univariate Analysis

In [74]:
# Number of distinct values in each column
cars.nunique()

#### Removing 'enginelocation'

In [75]:
# 'enginelocation' has a single distinct value ('front'), so it carries no
# information for predicting the car price and the feature can be removed
location_counts = cars['enginelocation'].value_counts()
print(location_counts)
cars = cars.drop(columns=['enginelocation'])
cars.shape

#### Box plot

In [76]:
# One box plot per non-object column, laid out three to a row
num_cols = cars.select_dtypes(exclude='object').columns
grid_rows = math.ceil(len(num_cols)/3)

plt.figure(figsize=(24,24))
for position, col in enumerate(num_cols, start=1):
    plt.subplot(grid_rows, 3, position)
    plt.boxplot(cars[col])
    plt.title(col)

#### Correlation

In [77]:
# Displaying the correlation between the numeric columns.
# 'cars' still contains object (categorical) columns at this point, and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of silently
# skipping them, so the numeric columns are selected explicitly.
numeric_cars = cars.select_dtypes(include='number')
plt.figure(figsize=(24,12))
sns.heatmap(numeric_cars.corr(), annot=True)
plt.show()

#### Removing correlated fields

In [78]:
# Discard the 'highwaympg' column
cars = cars.drop(columns=['highwaympg'])
cars.shape

### Data Preparation

#### One hot encoding the categorical features

In [79]:
# Creating one hot encodings for categorical values.
# drop_first=True is passed so the first level of each encoded column is dropped.
cars = pd.get_dummies(cars, drop_first=True)
cars.shape

In [80]:
# Printing the column details of the encoded dataframe
cars.info()

In [81]:
# Columns that still contain at least one null value
null_counts = cars.isnull().sum()
null_counts[null_counts > 0]

#### Train test split

In [82]:
# Separate the target from the features without mutating 'cars'.
# pop() removed 'price' in place, so running this cell a second time raised
# KeyError; selecting and dropping keeps the cell re-runnable.
y = cars['price']
X = cars.drop(columns=['price'])

In [83]:
# 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

shape_report = "X: {}, y: {}"
print("Train shape: ")
print(shape_report.format(X_train.shape, y_train.shape))
print("-"*30)
print("Test shape: ")
print(shape_report.format(X_test.shape, y_test.shape))

#### Standardisation

In [84]:
# Standardizing the features: the scaler is fitted on the training split
# only and then applied to both splits

scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Input Data

In [85]:
print("Shape of the input dataframe: {}".format(X_train.shape))
print("-"*100)
print("Dataframe details:")
# info() prints its report itself and returns None; print() around it
# only emitted an extra "None".
X_train.info()

In [86]:
# Preview the first rows of the (unscaled) training features
X_train.head()

#### Output data

In [87]:
# Shape of the training target
print("Shape of the output: {}".format(y_train.shape))

In [88]:
# Preview the first training target values
y_train.head()

### Creating a Simple Linear Model using the most dependent features

In the section below, we use our implementation of the Linear Regression model to predict the target variable ('price') using just one independent variable, i.e. we assume the target variable depends on only a single independent variable in each of these subsections.
This is done solely for the purpose of plotting the predicted value against the actual value and see the line predicted by our implementation and how it fits the actual data

### 'price' as a function of 'curbweight'

In [89]:
# optimize() expects a (1, m) feature row and a (1, m) target row
curbweight_row = X_train['curbweight'].values.reshape((1, -1))
price_row = y_train.values.reshape((1, -1))
cb_params, cb_grads, cb_cost, cb_costs = optimize(curbweight_row, price_row, 500, 0.0000001, True)

In [90]:
# Training-set predictions from the fitted weight and bias
curbweight_row = X_train['curbweight'].values.reshape((1, -1))
y_train_pred = np.dot(cb_params['w'].T, curbweight_row) + cb_params['b']
y_train_pred.shape

In [91]:
# Actual prices (points) against the fitted line for 'curbweight'
plt.figure(figsize=(10,10))
plt.scatter(X_train['curbweight'], y_train, color='red', label='curbweight')
plt.plot(X_train['curbweight'], y_train_pred.T, color='blue', label='pred')
plt.xlabel('curbweight')
plt.ylabel('price')
# Without legend() the label= arguments above are never rendered
plt.legend()
plt.show()

### 'price' as a function of 'horsepower'

In [92]:
# optimize() expects a (1, m) feature row and a (1, m) target row
horsepower_row = X_train['horsepower'].values.reshape((1, -1))
price_row = y_train.values.reshape((1, -1))
hp_params, hp_grads, hp_cost, hp_costs = optimize(horsepower_row, price_row, 500, 0.00001, True)

In [93]:
# Training-set predictions from the fitted weight and bias
horsepower_row = X_train['horsepower'].values.reshape((1, -1))
y_train_pred = np.dot(hp_params['w'].T, horsepower_row) + hp_params['b']
y_train_pred.shape

In [94]:
# Actual prices (points) against the fitted line for 'horsepower'
plt.figure(figsize=(10,10))
plt.scatter(X_train['horsepower'], y_train, color='red', label='horsepower')
plt.plot(X_train['horsepower'], y_train_pred.T, color='blue', label='pred')
plt.xlabel('horsepower')
plt.ylabel('price')
# Without legend() the label= arguments above are never rendered
plt.legend()
plt.show()

### 'price' as a function of 'enginesize'

In [95]:
# optimize() expects a (1, m) feature row and a (1, m) target row
enginesize_row = X_train['enginesize'].values.reshape((1, -1))
price_row = y_train.values.reshape((1, -1))
es_params, es_grads, es_cost, es_costs = optimize(enginesize_row, price_row, 500, 0.00001, True)

In [96]:
# Training-set predictions from the fitted weight and bias
enginesize_row = X_train['enginesize'].values.reshape((1, -1))
y_train_pred = np.dot(es_params['w'].T, enginesize_row) + es_params['b']
y_train_pred.shape

In [97]:

# Actual prices (points) against the fitted line for 'enginesize'
plt.figure(figsize=(10,10))
plt.scatter(X_train['enginesize'], y_train, color='red', label='enginesize')
plt.plot(X_train['enginesize'], y_train_pred.T, color='blue', label='pred')
plt.xlabel('enginesize')
plt.ylabel('price')
# Without legend() the label= arguments above are never rendered
plt.legend()
plt.show()

### Creating the final model with all the independent variables

In [98]:
# Scaled features are (m, n); optimize() wants (n, m), hence the transpose
price_row = y_train.values.reshape((1, -1))
params, grads, cost, costs = optimize(
    X_train_scaled.T,
    price_row,
    num_iterations=2000,
    learning_rate=0.001,
    print_cost=True,
)

### Plotting the cost against iteration 

In [99]:
# optimize() records the cost once every 100 iterations, so the k-th entry of
# 'costs' belongs to iteration 100*k. Iterations go on the x axis and the cost
# on the y axis (the labels were previously swapped).
recorded_iterations = range(0, 100*len(costs), 100)
plt.figure(figsize=(12,6))
plt.plot(recorded_iterations, costs)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.show()