In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from gplearn.genetic import SymbolicRegressor, SymbolicTransformer
import matplotlib.pyplot as plt

In [None]:
# Load the raw car-sales table; the path is resolved relative to the
# current working directory.
csv_path = "car_sales_data.csv"
df = pd.read_csv(csv_path)

In [None]:
# Quick look at the raw data: print a caption, then let the notebook render
# the first rows (head() is the cell's last expression, so it is displayed).
print("First five rows: \n")
df.head()

In [None]:
# Column names, dtypes and non-null counts; info() prints its report
# directly rather than returning a value to display.
print("Data informations: \n")
df.info()

In [None]:
# Summary statistics of the raw frame, rendered as the cell output.
print("Data descriptions: \n")
df.describe()

In [None]:
# One-hot encode the categorical columns (dropping the first level of each to
# avoid redundant indicators). dtype=float keeps the frame purely numeric:
# pandas >= 2.0 emits bool dummies by default, and a bool/float mix degrades
# to an object-dtype matrix once the frame is converted to a NumPy array.
df_encoded = pd.get_dummies(
    df,
    columns=["Fuel type", "Manufacturer", "Model"],
    drop_first=True,
    dtype=float,
)

# Separate the predictors from the target.
X = df_encoded.drop(columns=["Price"])
y = df_encoded["Price"]

# Express the manufacture year as an age relative to 2025. Because the column
# is standardized right below, the choice of reference year does not affect
# the final values; it only makes the feature name meaningful.
X = X.rename(columns={'Year of manufacture': 'Age of car'})
X['Age of car'] = 2025 - X['Age of car']

# Z-score the continuous predictors and the target (pandas .std() is the
# sample standard deviation, ddof=1).
# NOTE(review): these statistics are computed on all rows, before the
# train/test split performed later in the notebook, so test-set information
# leaks into the scaling. Consider fitting the scaling on the training rows only.
numeric_cols = ["Engine size", "Mileage", "Age of car"]
X[numeric_cols] = (X[numeric_cols] - X[numeric_cols].mean()) / X[numeric_cols].std()
y = (y - y.mean()) / y.std()

In [None]:
# Genetic-programming feature construction: evolve symbolic combinations of
# the input columns and append the resulting components to the design matrix.
function_set = ['add', 'sub', 'mul', 'div',
                'sqrt', 'log', 'abs', 'neg', 'inv',
                'max', 'min']
transformer = SymbolicTransformer(generations=20,
                                  population_size=2000,
                                  hall_of_fame=100,
                                  n_components=10,
                                  function_set=function_set,
                                  parsimony_coefficient=0.0005,
                                  max_samples=0.9,
                                  verbose=1,
                                  random_state=0,
                                  n_jobs=3)

# fit_transform() fits the transformer itself; a separate fit() call beforehand
# would run the entire evolution a second time for the same result
# (random_state is fixed), so only the single call is kept.
# NOTE(review): the transformer sees every row, including those that later end
# up in the test split, so the evolved features are not independent of the
# test data.
X_new = transformer.fit_transform(X, y)

# Append the evolved components to the original predictors. The explicit float
# conversion guarantees a numeric array even if X holds bool dummy columns.
# NOTE: this rebinds X, so re-running the cell without re-running the
# preprocessing cell would stack a further set of components on top.
X = np.hstack((np.asarray(X, dtype=float), X_new))

In [None]:
# Wrap the stacked NumPy array back into a DataFrame. No column names are
# passed, so the columns are labelled by integer position from here on.
X = pd.DataFrame(X)

In [None]:
# Inspect the first rows of the augmented feature matrix.
X.head()

In [None]:
# Inspect the first values of the standardized target.
y.head()

In [None]:
# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [None]:
# Hyper-parameters of the symbolic regressor, gathered in one place so they
# are easy to scan and tweak. The four operation probabilities
# (crossover + subtree + hoist + point mutation) add up to 1.0.
gp_params = dict(
    population_size=5000,
    generations=30,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.1,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    tournament_size=20,
    parsimony_coefficient=0.001,
    stopping_criteria=0.01,
    metric='mse',
    n_jobs=-1,
    random_state=42,
)
est_gp = SymbolicRegressor(**gp_params)

# Evolve the regression program on the training split only.
est_gp.fit(X_train, y_train)

In [None]:
# Predict on the training rows and carry over the training index so the
# predictions stay aligned with y_train.
train_predictions = est_gp.predict(X_train)
y_pred_train = pd.Series(train_predictions, index=X_train.index)
y_pred_train.head()

In [None]:
# Actual training targets, for eyeballing against the predictions.
y_train.head()

In [None]:
# Predict on the held-out rows and carry over the test index so the
# predictions stay aligned with y_test.
test_predictions = est_gp.predict(X_test)
y_pred_test = pd.Series(test_predictions, index=X_test.index)
y_pred_test.head()

In [None]:
# Actual held-out targets, for eyeballing against the predictions.
y_test.head()

In [None]:
# Report the estimator's score() (R^2 for scikit-learn-style regressors) on
# both splits; a large train/test gap would point to over-fitting.
train_r2 = est_gp.score(X_train, y_train)
print("R2 Score of Train Samples: ", train_r2)
test_r2 = est_gp.score(X_test, y_test)
print("R2 Score of Test Samples: ", test_r2)

In [None]:
# Actual-vs-predicted scatter for the TRAINING split.

# Total number of training samples
n = len(y_train)

# Pick up to 5000 distinct positions. The size is clamped to n because
# sampling without replacement raises ValueError when more items are requested
# than exist; the generator is seeded so the figure is reproducible.
sample_size = min(5000, n)
idx = np.random.default_rng(42).choice(n, size=sample_size, replace=False)

# Build the matching subsets (positional indexing keeps actual/predicted paired)
y_train_sample = y_train.iloc[idx] if hasattr(y_train, "iloc") else y_train[idx]
y_pred_sample = y_pred_train.iloc[idx] if hasattr(y_pred_train, "iloc") else y_pred_train[idx]

# Scatter plot with the y = x reference line spanning the training target
# range (the test-set range does not belong on a training plot)
plt.scatter(y_train_sample, y_pred_sample, alpha=0.5)
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.show()

In [None]:
# Actual-vs-predicted scatter for the held-out TEST split.

# Total number of test samples
n = len(y_test)

# Pick up to 5000 distinct positions. The size is clamped to n because
# sampling without replacement raises ValueError when more items are requested
# than exist; the generator is seeded so the figure is reproducible.
sample_size = min(5000, n)
idx = np.random.default_rng(42).choice(n, size=sample_size, replace=False)

# Build the matching subsets (positional indexing keeps actual/predicted paired)
y_test_sample = y_test.iloc[idx] if hasattr(y_test, "iloc") else y_test[idx]
y_pred_sample = y_pred_test.iloc[idx] if hasattr(y_pred_test, "iloc") else y_pred_test[idx]

# Scatter plot with the y = x reference line spanning the test target range
plt.scatter(y_test_sample, y_pred_sample, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.show()