# Basic data exploration

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the used-car listings from disk into an in-memory DataFrame,
# then preview the first rows.
csv_path = "./datasets/used_cars_UK.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a summary of the DataFrame: number of entries, column names,
# non-null counts per column and column dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the DataFrame's columns.
df.describe()

In [None]:
# Show the column labels of the DataFrame.
df.columns

# Check for duplicates

In [None]:
# Two listings are treated as the same car when title, price and mileage all match.
dedup_keys = ['title', 'Price', 'Mileage(miles)']
is_repeat = df.duplicated(subset=dedup_keys)
duplicates = df[is_repeat]

if not duplicates.empty:
    print(f"{len(duplicates)} duplicates found.")
else:
    print("No duplicates found.")

# Keep only the first occurrence of every repeated listing.
df = df.drop_duplicates(subset=dedup_keys)

# Check for missing values

In [None]:
# Count the rows that have at least one missing value, leaving the
# 'Service history' column out of the check.
df_no_svc_history = df.drop(columns=['Service history'])
has_gap = df_no_svc_history.isna().any(axis='columns')
rows_with_missing_values = has_gap.sum()
print(rows_with_missing_values)

# Determine if data is normally distributed

## Mileage

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wide, short canvas; histogram of mileage with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 3))
sns.histplot(df['Mileage(miles)'], kde=True, ax=ax)
ax.set_title('Histogram for Mileage')
plt.show()

In [None]:
# Print mean and median mileage side by side for comparison.
mileage = df['Mileage(miles)']
mean, median = mileage.mean(), mileage.median()
print(mean, median)

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk tests the null hypothesis that the sample was drawn from a
# normal distribution. Any missing mileages are dropped first: NaNs in the
# input make the returned p-value unusable (NaN under SciPy's default
# nan_policy), and the comparison below would then report a verdict that
# says nothing about the data.
mileage = df['Mileage(miles)'].dropna()
stat, p = shapiro(mileage)
print(p)
# A p-value above the 5% significance level means normality cannot be rejected.
if p > 0.05:
    print("Mileage is likely normally distributed")
else:
    print("Mileage is not normally distributed")

## Registration Year

In [None]:
# Histogram of registration year with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 3))
sns.histplot(df['Registration_Year'], kde=True, ax=ax)
ax.set_title('Histogram for Registration Year')
plt.show()

In [None]:
# Print mean and median registration year side by side for comparison.
registration_year = df['Registration_Year']
mean, median = registration_year.mean(), registration_year.median()
print(mean, median)

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk tests the null hypothesis that the sample was drawn from a
# normal distribution. Any missing registration years are dropped first:
# NaNs in the input make the returned p-value unusable (NaN under SciPy's
# default nan_policy), and the comparison below would then report a verdict
# that says nothing about the data.
registration_year = df['Registration_Year'].dropna()
stat, p = shapiro(registration_year)
print(p)
# A p-value above the 5% significance level means normality cannot be rejected.
if p > 0.05:
    print("Registration Year is likely normally distributed")
else:
    print("Registration Year is not normally distributed")

## Engine

In [None]:
# Histogram of engine size with a KDE curve overlaid.
# NOTE(review): 'Engine' is only converted to float in a later cell (via
# .str.rstrip('L')), so at this point the column presumably still holds text
# values and is plotted as categories - confirm this is intended.
fig, ax = plt.subplots(figsize=(12, 3))
sns.histplot(df['Engine'], kde=True, ax=ax)
ax.set_title('Histogram for Engine')
plt.show()

## Owners

In [None]:
# Histogram of the number of previous owners with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 3))
sns.histplot(df['Previous Owners'], kde=True, ax=ax)
ax.set_title('Histogram for Previous Owners')
plt.show()

## Doors

In [None]:
# Histogram of door counts with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 3))
sns.histplot(df['Doors'], kde=True, ax=ax)
ax.set_title('Histogram for Doors')
plt.show()

## Seats

In [None]:
# Histogram of seat counts with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 3))
sns.histplot(df['Seats'], kde=True, ax=ax)
ax.set_title('Histogram for Seats')
plt.show()

# Handle missing values

In [None]:
# Price is the prediction target, so rows without it cannot be used.
df.dropna(subset=['Price'], inplace=True)

# Remove the columns that are not used for modelling:
#   "Unnamed: 0"      - unnamed column
#   "title"           - considered irrelevant
#   "Service history" - only ever "full" or NaN
df.drop(columns=["Unnamed: 0", "title", "Service history"], inplace=True)


# Filling missing values

In [None]:
# Turn the 'Engine' column into floats: strip any trailing 'L' characters
# from the text values, then cast the remainder to float.
engine_text = df['Engine'].str.rstrip('L')
df['Engine'] = engine_text.astype(float)

In [None]:
# For nominal and ordinal variables ("Fuel type", "Body type", "Emission Class"
# and "Gearbox") missing values are imputed with the mode, i.e. the most
# frequent category.
#
# The filled column is assigned back to the DataFrame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate Series, is deprecated in pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is active.
for column in ['Fuel type', 'Body type', 'Emission Class', 'Gearbox']:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# For interval/ratio variables ("Mileage", "Registration_Year",
# "Previous Owners", "Engine", "Doors" and "Seats") missing values can be
# imputed with the mean or the median: the mean if the data is normally
# distributed, the median otherwise. The median is used for every column here.
#
# The filled column is assigned back to the DataFrame instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate Series, is deprecated in pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is active.
numeric_columns = [
    'Mileage(miles)',
    'Registration_Year',
    'Previous Owners',
    'Engine',
    'Doors',
    'Seats',
]
for column in numeric_columns:
    df[column] = df[column].fillna(df[column].median())

# One-hot encoding for categorical/nominal values

In [None]:
# Replace each categorical column with one indicator column per category.
categorical_columns = ['Gearbox', 'Emission Class', 'Fuel type', 'Body type']
df = pd.get_dummies(data=df, columns=categorical_columns)

# Split the data into features (X) and the target (y)

In [None]:
# Features: every column except the target, as a NumPy array.
X = df.drop(columns=['Price']).to_numpy()
# Target: 'Price', kept two-dimensional (one column) by the double brackets.
y = df[['Price']].to_numpy()

## Training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test sets (scikit-learn's default
# is a 75% / 25% split). A fixed random_state makes the shuffle, and therefore
# the R2 score reported below, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create and train linear model

In [None]:
from sklearn.linear_model import LinearRegression

# Create a linear regression model and fit it on the training split,
# using all feature columns as independent variables.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# Model evaluation

In [None]:
from sklearn.metrics import r2_score

# Predict prices for the held-out test rows and score them with R2
# (coefficient of determination).
y_test_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print(f"R-squared (R2): {r2}")