In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read cleaned_data.csv from the working directory into a DataFrame, then show
# the column labels as the cell's displayed result.
cars = pd.read_csv("cleaned_data.csv")
cars.columns

Index(['Name', 'style', 'Exterior color', 'interior color', 'Engine',
       'drive type', 'Fuel Type', 'Transmission', 'Mileage', 'mpg city',
       'mpg highway', 'price', 'Year', 'Engine V', 'Brand'],
      dtype='object')

In [4]:
# Candidate feature columns: everything listed here is carried forward,
# 'price' is kept aside as the regression target.
feature_columns = [
    'Name', 'style', 'Exterior color', 'interior color', 'Engine',
    'drive type', 'Fuel Type', 'Transmission', 'Mileage', 'mpg city',
    'mpg highway', 'Year', 'Engine V', 'Brand',
]
X = cars[feature_columns]

# Target vector: the raw values of the 'price' column.
Y = cars["price"].values

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Columns are addressed by name instead of by position so the cell does not
# silently pick the wrong features if the CSV column order changes.
# 'Name', 'Exterior color' and 'interior color' are deliberately left out of
# both lists: the original positional delete dropped them without encoding.
encoded_columns = ["style", "Engine", "drive type", "Fuel Type", "Transmission", "Brand"]
numeric_columns = ["Mileage", "mpg city", "mpg highway", "Year", "Engine V"]

onehot = OneHotEncoder(categories="auto", handle_unknown="ignore")

# Read from `cars` rather than from `X`: this cell rebinds `X` to an ndarray,
# so selecting from `X` would make the cell fail when it is run a second time.
categorical_features = onehot.fit_transform(cars[encoded_columns]).toarray()

# Cast to float up front so the feature matrix is a numeric array instead of
# the object-dtype array produced by DataFrame.values on mixed columns.
numeric_features = cars[numeric_columns].to_numpy(dtype=float)

# Final layout: numeric columns first, followed by the one-hot indicator columns.
X = np.concatenate((numeric_features, categorical_features), axis=1)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test split; the fixed random_state makes the
# partition repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits.
std_scaler = StandardScaler().fit(x_train)
x_train_std, x_test_std = (std_scaler.transform(split) for split in (x_train, x_test))

In [8]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Sweep the number of principal components (2..24) and record, for each
# setting, the score of a linear regression on the held-out split.
# Each entry of `results` is [n_components, score].
# NOTE(review): the sweep is scored on the test split, so the best entry is an
# optimistic estimate -- consider cross-validating on the training data instead.
results = list()

for i in range(2, 25):
    pca = PCA(n_components=i)
    # `normalize` is intentionally not passed: "deprecated" was only an internal
    # default sentinel, and the parameter no longer exists in scikit-learn >= 1.2,
    # where passing it raises TypeError.
    lr = LinearRegression(fit_intercept=True)

    # Fit the projection on the training split only and reuse it for the test split.
    x_train_pca = pca.fit_transform(x_train_std)
    x_test_pca  = pca.transform(x_test_std)

    lr.fit(x_train_pca, y_train)
    result = [i, lr.score(x_test_pca, y_test)]

    results.append(result)

In [9]:
def _best_score_first(entry):
    """Sort key for [n_components, score] pairs: highest score first, ties broken by fewer components."""
    n_components, score = entry
    return (-score, n_components)


# Rank the sweep results and show the three best settings as a table.
results = sorted(results, key=_best_score_first)
results_df = pd.DataFrame(results, columns=["n principle component", "score"])
results_df.head(3)

Unnamed: 0,n principle component,score
0,24,0.714404
1,23,0.710446
2,22,0.70114


In [10]:
from sklearn.pipeline import make_pipeline

# One estimator that standardizes the features, applies PCA with its default
# settings and fits a linear regression.
# `normalize` is not passed to LinearRegression: the parameter was removed in
# scikit-learn 1.2 and passing it raises TypeError there.
pip_lr = make_pipeline(
    StandardScaler(),
    PCA(),
    LinearRegression(fit_intercept=True),
)

# The pipeline owns its StandardScaler, so it is fitted and scored on the
# unscaled split; feeding it the pre-standardized arrays would scale twice.
pip_lr.fit(x_train, y_train)

# Score once and reuse the value. For a regressor, `score` is the coefficient
# of determination (R^2), not a classification accuracy; the printed label is
# kept unchanged.
test_score = pip_lr.score(x_test, y_test)

print("Test Accuracy : {:.3f}".format(test_score))

Test Accuracy : 0.856
