<a href="https://colab.research.google.com/github/semih108/car-price-prediction/blob/main/pricePrediction_clean_train_predict3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [86]:
import pandas as pd
import os

# Step 1: Define the path where your CSV files are stored
csv_dir = '/content/drive/MyDrive/scraped_car_data'

# Step 2: List all CSV files in the directory
csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')]

# Step 3: Load each CSV file into a DataFrame and append it to a list
data_frames = []
for file in csv_files:
    file_path = os.path.join(csv_dir, file)
    df = pd.read_csv(file_path)
    data_frames.append(df)

# Step 4: Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(data_frames, ignore_index=True)

# Step 5: Save the combined DataFrame to a new CSV file (optional)
combined_df.to_csv('/content/drive/MyDrive/combined_car_data2.csv', index=False)


In [87]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import json

# Load Data
data = pd.read_csv('/content/drive/MyDrive/scraped_car_data/audi_car_prices.csv')
#data = pd.read_csv('/content/drive/MyDrive/combined_car_data2.csv')

# Function to clean data
def clean_data(df):
    df['Price'] = df['Price'].str.replace(r'[^\d]', '', regex=True)
    df['Mileage'] = df['Mileage'].str.replace(r'[^\d]', '', regex=True)
    df['Fuel'] = df['Fuel'].replace({'Elektro/Benzin': 'Hybrid'})
    power = df['Power'].str.extract(r'(\d+) kW \((\d+) PS\)')
    df['Power_PS'] = power[1]
    df.drop(columns=['Power'], inplace=True)
    df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
    df['Mileage'] = pd.to_numeric(df['Mileage'], errors='coerce')
    df['Power_PS'] = pd.to_numeric(df['Power_PS'], errors='coerce')
    df['Date'] = pd.to_datetime(df['Date'].apply(lambda x: x if re.match(r'\d{2}/\d{4}', x) else None), format='%m/%Y', errors='coerce')
    data['Car_Age'] = data['Date'].dt.year
    data.drop(columns=['Date'], inplace=True)
    # Extract subType using the API data
    #df['subType'] = df.apply(lambda x: find_subtype(x['Name'].lower(), x['Make'].lower(), x['Model'].lower(), all_model_details), axis=1)

    # Impute missing values for subType using a placeholder
    #df['subType'] = df['subType'].fillna('unknown')
    return df

data = clean_data(data)

# Save cleaned data to a CSV file
cleaned_data_path = '/content/drive/MyDrive/cleaned_car_data.csv'
data.to_csv(cleaned_data_path, index=False)
print(f'Cleaned data saved to {cleaned_data_path}')


Cleaned data saved to /content/drive/MyDrive/cleaned_car_data.csv


In [88]:

bmw_model = {
    '1': '1er',
    '2': '2er',
    '3': '3er',
    '4': '4er',
    '5': '5er',
    '6': '6er',
    '7': '7er',
    '8': '8er',
    'i3': 'i3',
    'i4': 'i4',
    'i7': 'i7',
    'i8': 'i8',
    'iX': 'iX',
    'iX1': 'iX1',
    'iX3': 'iX3',
    'X1': 'X1',
    'X2': 'X2',
    'X3': 'X3',
    'X4': 'X4',
    'X5': 'X5',
    'X6': 'X6',
    'X7': 'X7',
    'XM': 'XM',
    'Z1': 'Z1',
    'Z3': 'Z3',
    'Z4': 'Z4',
    'Z8': 'Z8'
}
mercedes_model = {
    '123': '123',
    '190': '190/190 E',
    'A': 'A-Klasse',
    'A-Limousine': 'A-Klasse Limousine',
    'AMG GT': 'AMG GT',
    'AMG GT 4-door': 'AMG GT 4-door',
    'B': 'B-Klasse',
    'C': 'C-Klasse',
    'C-All-Terrain': 'C-Klasse All-Terrain',
    'Citan': 'Citan',
    'CL': 'CL-Coupe',
    'CLA': 'CLA-Klasse',
    'CLC': 'CLC-Klasse',
    'CLK': 'CLK-Klasse',
    'CLS': 'CLS-Klasse',
    'E': 'E-Klasse',
    'E-All-Terrain': 'E-Klasse All-Terrain',
    'EQA': 'EQA',
    'EQB': 'EQB',
    'EQC': 'EQC',
    'EQE': 'EQE',
    'EQS': 'EQS',
    'EQV': 'EQV',
    'G': 'G-Klasse',
    'GL': 'GL-Klasse',
    'GLA': 'GLA-Klasse',
    'GLB': 'GLB-Klasse',
    'GLC': 'GLC-Klasse',
    'GLE': 'GLE-Klasse',
    'GLK': 'GLK-Klasse',
    'GLS': 'GLS-Klasse',
    'ML': 'M-Klasse',
    'MB': 'MB 100',
    'R': 'R-Klasse',
    'S': 'S-Klasse',
    'SL': 'SL-Klasse',
    'SLC': 'SLC-Klasse',
    'SLK': 'SLK-Klasse',
    'SLR': 'SLR McLaren',
    'SLS': 'SLS AMG',
    'Sprinter': 'Sprinter',
    'Strich Acht': 'Strich Acht',
    'T1': 'T1 Transporter',
    'T': 'T-Klasse',
    'V': 'V-Klasse',
    'V-Marco Polo': 'V-Klasse Marco Polo',
    'Vaneo': 'Vaneo',
    'Viano': 'Viano',
    'Vito': 'Vito',
    'Vito Tourer': 'Vito Tourer',
    'X': 'X-Klasse'
}
audi_model = {
    #'e-tron-gt': 'e-tron'
    'q4-e-tron': 'q4',
    'q8-e-tron': 'q8',
    'rs-q3': 'rs q3',
    'rs-q8': 'rs q8',
    'rs2': 'rs 2',
    'rs3': 'rs 3',
    'rs4': 'rs 4',
    'rs5': 'rs 5',
    'rs6': 'rs 6',
    'rs7': 'rs 7',
    'tt-rs': 'tt rs',
    'a6-allroad': 'a6 allroad',
    'a4-allroad': 'a4 allroad'
}

In [89]:
#für testzwecke
import re
import pandas as pd
import numpy as np

file_path = '/content/drive/MyDrive/DataFromApi.json'
with open(file_path, 'r') as f:
    all_model_details = json.load(f)
#print(all_model_details)

def convert_to_lowercase(data):
    if isinstance(data, dict):
        return {key.lower(): convert_to_lowercase(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [convert_to_lowercase(item) for item in data]
    elif isinstance(data, str):
        return data.lower()
    else:
        return data

def find_subtype2(name, make, model, all_model_details):
    name = name.lower().replace(" ", "").replace(",", ".")
    name = name.replace(make, "").replace(model, "")
    make = make.lower()
    model = model.lower()
    all_model_details = convert_to_lowercase(all_model_details)
    matching_subtype = None
    if make in all_model_details:
        if model in all_model_details[make]:
            subtypes = all_model_details[make][model]
            for subtype in subtypes:
                subtype_parts = subtype.lower().replace(",", ".").split()
                if all(part.replace(" ", "") in name for part in subtype_parts):
                    if matching_subtype is None or len(subtype) > len(matching_subtype):
                        matching_subtype = subtype
    return matching_subtype

print(find_subtype2('tdi 3.0 v6', 'audi', 'a5', all_model_details))

3.0 v6 tdi


In [90]:
import re
import pandas as pd
import numpy as np

def convert_to_lowercase(data):
    if isinstance(data, dict):
        return {key.lower(): convert_to_lowercase(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [convert_to_lowercase(item) for item in data]
    elif isinstance(data, str):
        return data.lower()
    else:
        return data

file_path = '/content/drive/MyDrive/DataFromApi.json'
with open(file_path, 'r') as f:
    all_model_details = json.load(f)
    all_model_details = convert_to_lowercase(all_model_details)



def find_subtype2(name, make, model):
    name = name.lower().replace(" ", "").replace(",", ".")
    name = name.replace(make, "").replace(model, "")
    make = make.lower()
    model = model.lower()
    matching_subtype = None
    if make in all_model_details:
        if model in all_model_details[make]:
            subtypes = all_model_details[make][model]
            if len(subtypes) == 1:
                return subtypes[0]
            for subtype in subtypes:
                subtype_parts = subtype.lower().replace(",", ".").split()
                 # Überprüfe, ob alle Teile des Subtypes im Namen vorkommen
                if all(part.replace(" ", "") in name for part in subtype_parts):
                    if matching_subtype is None or len(subtype) > len(matching_subtype):
                        matching_subtype = subtype

                # Berechne die Anzahl der passenden Teile, wenn nur ein Teil passt
                matching_parts = sum(1 for part in subtype_parts if part.replace(" ", "") in name)

                # Wenn mindestens 2 Teile übereinstimmen, den Subtyp in Erwägung ziehen
                if matching_parts >= 2:
                    if matching_subtype is None or len(subtype) > len(matching_subtype):
                        matching_subtype = subtype
    return matching_subtype

def update_golf_model(row):
    golf_years = [(1974, 1983, 'I'), (1983, 1992, 'II'), (1991, 1997, 'III'),
                  (1997, 2003, 'IV'), (2003, 2008, 'V'), (2008, 2012, 'VI'),
                  (2012, 2019, 'VII'), (2019, 9999, 'VIII')]
    if row['Make'].lower() == 'volkswagen' and 'golf' in row['Model'].lower():
        for start, end, roman in golf_years:
            if start <= row['Car_Age'] <= end:
                return f'Golf {roman}'
    return row['Model']

# Load the data from the CSV file
file_path = '/content/drive/MyDrive/cleaned_car_data.csv'
data = pd.read_csv(file_path)

def updateModelName(row):
    if row['Make'].lower() == 'bmw':
        first_char = row['Model'][0]
        return bmw_model.get(first_char, row['Model'])
    if row['Make'].lower() == 'mercedes-benz':
        first_char = row['Model'].split()[0]
        return mercedes_model.get(first_char, row['Model'])
    if row['Make'].lower() == 'audi':
        sportback_models = ['a5', 'e-tron', 'q3', 'q4','q5','q8','rs q3','sq5','sq8']  # Liste der Modelle, für die du die Überprüfung machen möchtest
        for model in sportback_models:
            if f'{model}sb' in row['Name'].lower() or f'{model}sportback' in row['Name'].lower():
                return f'{model} sportback'
        return audi_model.get(row['Model'], row['Model'])

# Update data
data['Model'] = data.apply(update_golf_model, axis=1)
data['Model'] = data.apply(updateModelName, axis=1) #BMW, Mercedes

# Apply the logic to set Detected_Subtype
data['Detected_Subtype'] = data.apply(lambda row: find_subtype2(row['Name'], row['Make'], row['Model']), axis=1)

data.drop(columns=['Name'], inplace=True)
data['Car_Age'] = 2024 - data['Car_Age'] # have to calculate it here because of golf model

# Save the cleaned data to a CSV file
improved_subtype = '/content/drive/MyDrive/cleaned_data_with_subtype.csv'
data.to_csv(improved_subtype, index=False)
print(f'Cleaned data saved to {improved_subtype}')

Cleaned data saved to /content/drive/MyDrive/cleaned_data_with_subtype.csv


In [91]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the cleaned data into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/cleaned_data_with_subtype.csv')

# Shuffle the data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

nan_sum = data.isna().sum()
print(nan_sum)

# Handle categorical data
le_transmission = LabelEncoder()
le_fuel = LabelEncoder()
le_make = LabelEncoder()
le_model = LabelEncoder()
le_subtype = LabelEncoder()

# Apply label encoding to categorical columns
data['Transmission'] = le_transmission.fit_transform(data['Transmission'].str.lower())
data['Fuel'] = le_fuel.fit_transform(data['Fuel'].str.lower())
data['Make'] = le_make.fit_transform(data['Make'].str.lower())
data['Model'] = le_model.fit_transform(data['Model'].str.lower())
data['Detected_Subtype'] = le_subtype.fit_transform(data['Detected_Subtype'].str.lower().fillna(''))

# Prepare features and target
X = data.drop(columns=['Price'])
y = data['Price']

imputer_x = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer_x.fit_transform(X), columns=X.columns)

imputer_y = SimpleImputer(strategy='median')
y = pd.Series(imputer_y.fit_transform(y.values.reshape(-1, 1)).ravel(), name='Price')

# Save the feature names
feature_names = X.columns.tolist()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate Model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Save the model and label
directory = '/content/drive/MyDrive/models'
joblib.dump(feature_names, os.path.join(directory, 'feature_names.pkl'))
joblib.dump(model, os.path.join(directory, 'car_price_predictor.pkl'))
joblib.dump(le_transmission, os.path.join(directory, 'le_transmission.pkl'))
joblib.dump(le_fuel, os.path.join(directory, 'le_fuel.pkl'))
joblib.dump(le_make, os.path.join(directory, 'le_make.pkl'))
joblib.dump(le_model, os.path.join(directory, 'le_model.pkl'))
joblib.dump(le_subtype, os.path.join(directory, 'le_subtype.pkl'))


Make                  0
Model                 0
Price               186
Mileage               1
Transmission          0
Fuel                  0
Power_PS              0
Car_Age              73
Detected_Subtype    707
dtype: int64
Mean Absolute Error: 4825.126652077606


['/content/drive/MyDrive/models/le_subtype.pkl']

In [96]:
import numpy as np
import joblib
import pandas as pd

# Load the model, label encoders, and feature names
model = joblib.load('/content/drive/MyDrive/models/car_price_predictor.pkl')
le_transmission = joblib.load('/content/drive/MyDrive/models/le_transmission.pkl')
le_fuel = joblib.load('/content/drive/MyDrive/models/le_fuel.pkl')
le_make = joblib.load('/content/drive/MyDrive/models/le_make.pkl')
le_model = joblib.load('/content/drive/MyDrive/models/le_model.pkl')
le_subtype = joblib.load('/content/drive/MyDrive/models/le_subtype.pkl')
feature_names = joblib.load('/content/drive/MyDrive/models/feature_names.pkl')

# Function to check for unseen labels
def check_label(encoder, label):
    label_lower = label.lower()
    if label_lower not in [cls.lower() for cls in encoder.classes_]:
        raise ValueError(f"Label '{label}' not found in encoder classes: {encoder.classes_}")
    return encoder.transform([label_lower])[0]

# Function to predict car price
def predict_car_price(make, model_name, mileage, transmission, fuel, power_ps, car_age, subtype):
    # Check and transform categorical inputs
    make_encoded = check_label(le_make, make)
    model_encoded = check_label(le_model, model_name)
    transmission_encoded = check_label(le_transmission, transmission)
    fuel_encoded = check_label(le_fuel, fuel)
    subtype_encoded = check_label(le_subtype, subtype)

    # Prepare the feature array with correct column names and order
    features = pd.DataFrame([[make_encoded, model_encoded, mileage, transmission_encoded, fuel_encoded, power_ps, car_age, subtype_encoded]], columns=feature_names)

    # Debug: Print the features to check correctness
    print("Features for prediction:")
    print(features)

    # Predict the price
    predicted_price = model.predict(features)[0]

    return predicted_price

# Example usage
try:
    make = 'audi'  # example brand
    model_name = 'a5 sportback'  # example model
    mileage = 161000  # example mileage (corrected unrealistic value)
    transmission = 'automatik'  # example transmission
    fuel = 'diesel'  # example fuel
    power_ps = 190  # example power in PS
    car_age = 7  # example car age
    subtype = '2.0 tdi'

    predicted_car_price = predict_car_price(make, model_name, mileage, transmission, fuel, power_ps, car_age, subtype)
    print(f'The predicted price of the car is: € {predicted_car_price:.2f}')
except ValueError as e:
    print(f'Error: {e}')


Features for prediction:
   Make  Model  Mileage  Transmission  Fuel  Power_PS  Car_Age  \
0     0     10   161000             1     2       190        7   

   Detected_Subtype  
0                19  
The predicted price of the car is: € 22561.95
