In [362]:
#importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
# from google.colab import drive
# drive.mount("/content/drive")

In [363]:
# Read the pre-processed Ford listings into a DataFrame and preview the first rows.
# NOTE(review): the path sits under /content/drive, so it presumably needs Google Drive
# mounted first — the drive.mount call in the import cell is commented out; confirm.
csv_path = "/content/drive/My Drive/Ford Car Price Prediction Pre-processed.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,model,price,transmission,mileage,fuelType,mpg,engineSize
0,0,10000,0,48141,0,61.4,1.0
1,1,11561,0,18803,0,56.5,1.0
2,2,13500,0,12065,0,54.3,1.0
3,1,11000,0,20978,0,65.7,1.0
4,3,17999,1,9002,1,54.3,2.0


In [364]:
# Separate the frame into features (X) and labels (y), both picked by column position.
# NOTE(review): position 1 is presumably the price column — confirm against the CSV header.
label_position = 1
feature_positions = [0, 2, 3, 4, 5, 6]
y = df.iloc[:, label_position].values
X = df.iloc[:, feature_positions].values

In [365]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [366]:
# Fit a linear regression on the training split, then predict the held-out rows.
# (Despite the name `clf`, this estimator is a regressor.)
clf = LinearRegression().fit(X_train, y_train)
y_prediction = clf.predict(X_test)
y_prediction

array([11483.36025787, 15964.70651329, 11136.90687761, ...,
       14785.18794052, 12473.31575788, 12109.27067098])

In [367]:
# Score the linear model on the test split: mean squared error and its square root.
mse = mean_squared_error(y_true=y_test, y_pred=y_prediction)
rmse = np.sqrt(mse)
report = f"MSE: {mse}\nRMSE: {rmse}"
print(report)

MSE: 9882725.490923107
RMSE: 3143.6802462914557


In [368]:
#Converting the continuous output into categorical
# This cell replaces columns of df with discretised versions, so it must not run twice on
# the same frame: a second pass would take the median of already-binarised prices and try
# to compute quantiles of categorical columns. The guard below makes a re-run a no-op, and
# the work is done on a copy so that a failure part-way never leaves df half-converted.
# NOTE(review): the quantile edges are computed on the full frame, i.e. before the
# train/test split in the next cell, so test rows influence the bin boundaries.
if not isinstance(df["mileage"].dtype, pd.CategoricalDtype):
  binned = df.copy()

  # Binary target: prices below the median become 0, everything else 1.
  price_threshold = df["price"].median()
  binned["price"] = np.where(df["price"]<price_threshold, 0, 1) # 1 means high price and 0 indicates low price

  # Quartile bins for the two continuous features. pd.cut raises a ValueError if two of
  # the quantile edges coincide (its default is duplicates="raise").
  for i in ["mileage", "mpg"]:
    quartile_edges = [-np.inf, df[i].quantile(0.25), df[i].median(), df[i].quantile(0.75), np.inf]
    binned[i] = pd.cut(df[i], bins=quartile_edges,
                       labels=[0, 1, 2, 3])  # labels 0 to 3 refers low to high mileage/mpg

  # Left-closed, unit-wide bins: [0,1) -> 0, [1,2) -> 1, ..., and 5 or above -> 5.
  binned["engineSize"] = pd.cut(df["engineSize"], bins=[0, 1, 2, 3, 4, 5, np.inf],
                                labels=[0, 1, 2, 3, 4, 5], right=False)

  # Rebind only after every column converted successfully.
  df = binned

In [369]:
# Rebuild features and labels from the current df and split them 80/20 with the same seed.
# This overwrites X, y and the train/test variables used by the linear-regression cells above.
feature_positions = [0, 2, 3, 4, 5, 6]
label_position = 1
X, y = df.iloc[:, feature_positions].values, df.iloc[:, label_position].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [370]:
# Train a logistic-regression classifier with default settings and label the test rows.
model = LogisticRegression().fit(X_train, y_train)
predictions = model.predict(X_test)
predictions

array([0, 1, 0, ..., 1, 1, 1])

In [371]:
# Evaluating Logistic Regression: share of test rows whose predicted class equals the label.
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.7792754844144903
