## Importing libraries and data

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('UNI.csv')
test = pd.read_csv('unitest.csv')

In [3]:
set(train.columns).difference(set(test.columns))

{'current price', 'km ', 'top speed'}

## Checking the names of the columns in the training and test dataset as there seems to be difference in the names of the columns

In [4]:
list(train.columns)

['v.id',
 'on road old',
 'on road now',
 'years',
 'km ',
 'rating',
 'condition',
 'economy',
 'top speed',
 'hp',
 'torque',
 'current price']

In [5]:
list(test.columns)

['v.id',
 'on road old',
 'on road now',
 'years',
 'km',
 'rating',
 'condition',
 'economy',
 'top speed ',
 'hp',
 'torque']

## Renaming the column names which are different in training and testing dataset

In [7]:
train.rename(columns={'km ':'km'}, inplace=True)
test.rename(columns={'top speed ':'top speed'}, inplace=True)

In [8]:
set(train.columns).difference(set(test.columns))

{'current price'}

In [9]:
test_id = test['v.id']

In [10]:
test.drop('v.id', axis=1, inplace=True)

In [11]:
train.drop('v.id', axis=1, inplace=True)

In [12]:
X = train.drop('current price', axis=1)
y = train['current price']

In [15]:
#Splitting the training dataset in train and test set to crossvalidate the model's accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Using GridSearchCV for hyperparameter tuning of LinearRegression, Lasso, Ridge regression

In [22]:
lasso_params = {'fit__alpha':[0.005, 0.02, 0.03, 0.05, 0.06]}
ridge_params = {'fit__alpha':[550, 580, 600, 620, 650]}

pipe1 = Pipeline([('poly', PolynomialFeatures()),
                 ('fit', linear_model.LinearRegression())])
pipe2 = Pipeline([('poly', PolynomialFeatures()),
                 ('fit', linear_model.Lasso())])
pipe3 = Pipeline([('poly', PolynomialFeatures()),
                 ('fit', linear_model.Ridge())])

models3 = {'OLS': pipe1,
           'Lasso': GridSearchCV(pipe2, 
                                 param_grid=lasso_params).fit(X, y).best_estimator_ ,
           'Ridge': GridSearchCV(pipe3, 
                                 param_grid=ridge_params).fit(X, y).best_estimator_,}

## Testing model's accuracy

In [26]:
from sklearn import metrics
def tests(models, data, iterations = 100):
    results = {}
    for i in models:
        r2_train = []
        r2_test = []
        for j in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(X, 
                                                                y, 
                                                                test_size= 0.2)
            r2_test.append(metrics.r2_score(y_test,
                                            models[i].fit(X_train, 
                                                         y_train).predict(X_test)))
            r2_train.append(metrics.r2_score(y_train, 
                                             models[i].fit(X_train, 
                                                          y_train).predict(X_train)))
        results[i] = [np.mean(r2_train), np.mean(r2_test)]
    return pd.DataFrame(results)

In [27]:
tests(models3, X)

Unnamed: 0,OLS,Lasso,Ridge
0,0.999275,0.999513,0.999465
1,0.999152,0.99941,0.99937


## Fitting pipe3 i.e Ridge Regression on the dataset

In [28]:
pipe3.fit(X, y)

Pipeline(memory=None,
     steps=[('poly', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('fit', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [29]:
p = pipe3.predict(test)

In [30]:
p = pd.DataFrame(p, columns={'current price'})

In [31]:
p = pd.concat([test_id, p], axis=1)

In [32]:
p.shape

(100, 2)

In [33]:
p.to_csv('Ridge.csv', index=False)