In [None]:
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import hdbscan
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import missingno as msno

In [None]:
# Read the pre-split train/test tables; every CSV is loaded as a DataFrame
# (the targets included, since pd.read_csv always returns a frame).
X_train, X_test = pd.read_csv('DataSets/X_train.csv'), pd.read_csv('DataSets/X_test.csv')
y_train, y_test = pd.read_csv('DataSets/y_train.csv'), pd.read_csv('DataSets/y_test.csv')

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the raw features. `score` is the R^2
# returned by LinearRegression.score on the held-out test split.
model = LinearRegression().fit(X_train, y_train)
score = model.score(X_test, y_test)
print(score)

In [None]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
#
# tsne = TSNE(n_components=2, random_state=42)
# X_embedded = tsne.fit_transform(X_train)
#
# plt.scatter(X_embedded[:,0], X_embedded[:,1], c=labels)
# plt.show()

In [None]:
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
#
# tsne = TSNE(n_components=2, random_state=42)
# X_embedded = tsne.fit_transform(X_test)
#
# plt.scatter(X_embedded[:,0], X_embedded[:,1], c=labels)
# plt.show()

In [None]:
# Draw missingno's nullity matrix for the training features to show where
# values are missing, then render the figure.
msno.matrix(X_train)
plt.show()

In [None]:
# Put features and target side by side (aligned on the index) so that rows
# can be subsampled together.
df = pd.concat((X_train, y_train), axis='columns')

In [None]:
# Keep a 10% subsample (df_new) and set the remaining 90% aside
# (df_insignificant). Stratifying on the target keeps the target's value
# proportions the same in both parts.
split_options = dict(
    test_size=0.9,
    random_state=42,
    shuffle=True,
    stratify=df['Permit Process Time'],
)
df_new, df_insignificant = train_test_split(df, **split_options)

In [None]:
# Draw missingno's nullity matrix for the subsample (features + target) to
# show where values are missing, then render the figure.
msno.matrix(df_new)
plt.show()

In [None]:
# Count how often each distinct target value occurs in the subsample.
df_new['Permit Process Time'].value_counts()

In [None]:
# From here on the training set is the subsample: the target column becomes
# y_train (a Series) and every other column becomes X_train.
target_column = 'Permit Process Time'
y_train = df_new[target_column]
X_train = df_new.drop(columns=[target_column])

In [None]:
# Cluster the training features once and reuse that fitted model for the test
# set. Calling fit_predict on X_test would discard the training fit and
# produce an unrelated clustering whose label ids do not correspond to the
# training ones, making a `cluster` feature meaningless across the two sets.
#
# prediction_data=True makes HDBSCAN cache the structures that
# approximate_predict needs in order to place unseen points.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True)

# Drop a previously attached `cluster` column (if any) so that re-running this
# cell clusters on the same feature set as the first run.
cluster_labels_train = clusterer.fit_predict(
    X_train.drop(columns=['cluster'], errors='ignore')
)

# approximate_predict returns (labels, membership strengths); points that fit
# no training cluster receive the noise label -1.
cluster_labels_test, cluster_strengths_test = hdbscan.approximate_predict(
    clusterer,
    X_test.drop(columns=['cluster'], errors='ignore'),
)

In [None]:
# Attach each row's cluster id as an extra `cluster` feature column on both
# the training and the test features.
for frame, frame_labels in ((X_train, cluster_labels_train),
                            (X_test, cluster_labels_test)):
    frame['cluster'] = frame_labels

In [None]:
# Project the training features to 2-D with t-SNE so the HDBSCAN clusters can
# be inspected visually.
tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X_train)

# Visualization: colour every embedded point by its training cluster label.
# (The original referenced an undefined `cluster_labels`; the labels computed
# for X_train are stored in `cluster_labels_train`.)
plt.figure(figsize=(10,7))
scatter = plt.scatter(X_embedded[:,0], X_embedded[:,1], c=cluster_labels_train, cmap='tab10', s=5)
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.title("t-SNE ile HDBSCAN Küme Görselleştirme")
plt.show()

In [None]:
# Operators the evolved programs are allowed to combine.
function_set = [
    'add', 'sub', 'mul', 'div',
    'sqrt', 'log', 'abs', 'neg', 'inv',
    'max', 'min',
]

# Genetic-programming feature generator: evolves 5000 programs for 20
# generations and keeps 10 components chosen from a hall of fame of 100.
transformer_params = dict(
    generations=20,
    population_size=5000,
    hall_of_fame=100,
    n_components=10,
    function_set=function_set,
    parsimony_coefficient=0.0001,
    max_samples=0.9,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
transformer = SymbolicTransformer(**transformer_params)

transformer.fit(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# After the SymbolicTransformer, X_train is already numeric.
X_train_transformed = transformer.transform(X_train)

# The generated features can now be used to train a regression model.
forest_options = dict(random_state=42, n_jobs=-1)
model = RandomForestRegressor(**forest_options)
model.fit(X_train_transformed, y_train)


In [None]:
# Symbolic regression directly on the training features: 5000 programs evolved
# for up to 20 generations, minimising MSE. The four genetic-operation
# probabilities below sum to 1.0; evolution stops early once the fitness
# reaches stopping_criteria.
regressor_params = dict(
    population_size=5000,
    generations=20,
    tournament_size=20,
    stopping_criteria=0.01,
    metric='mse',
    parsimony_coefficient=0.001,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.1,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
est_gp = SymbolicRegressor(**regressor_params)

est_gp.fit(X_train, y_train)

In [None]:
# Predict on the training features and keep X_train's index so predictions
# line up row-by-row with y_train.
train_predictions = est_gp.predict(X_train)
y_pred_train = pd.Series(train_predictions, index=X_train.index)
y_pred_train.head()

In [None]:
# First rows of the actual training target, for comparison with the predicted values.
y_train.head()

In [None]:
# Apply the transformer that was fitted on the training data to the test
# features. Refitting on (X_test, y_test) would leak the test target into the
# feature construction, replace the training-fitted programs, and repeat the
# whole genetic-programming run for a result that nothing else uses.
X_test_transformed = transformer.transform(X_test)

In [None]:
# Predict on the test features and keep X_test's index so predictions line up
# row-by-row with y_test.
test_predictions = est_gp.predict(X_test)
y_pred_test = pd.Series(test_predictions, index=X_test.index)
y_pred_test.head()

In [None]:
# First rows of the actual test target, for comparison with the predicted values.
y_test.head()

In [None]:
# R^2 of the symbolic regressor on the data it was trained on and on the
# held-out test split (fixes the "whit" typo in the printed messages).
train_r2 = est_gp.score(X_train, y_train)
test_r2 = est_gp.score(X_test, y_test)
print("R2 Score of train samples with the model: ", train_r2)
print("R2 Score of test samples with the model: ", test_r2)

In [None]:
# Actual vs. predicted target on the training set. The dashed red diagonal
# marks perfect predictions (predicted == actual).
# The axis labels and title previously said "Price", but the target being
# modelled is 'Permit Process Time'.
plt.scatter(y_train, y_pred_train, alpha=0.5)
plt.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--')
plt.xlabel('Actual Permit Process Time')
plt.ylabel('Predicted Permit Process Time')
plt.title('Train Actual vs Predicted Permit Process Time')
plt.show()

In [None]:
# Actual vs. predicted target on the test set. The dashed red diagonal marks
# perfect predictions (predicted == actual).
# The axis labels and title previously said "Price", but the target being
# modelled is 'Permit Process Time'.
# NOTE(review): y_test comes straight from pd.read_csv, so it is a DataFrame
# here; this presumably relies on it having a single column - confirm.
plt.scatter(y_test, y_pred_test, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Actual Permit Process Time')
plt.ylabel('Predicted Permit Process Time')
plt.title('Test Actual vs Predicted Permit Process Time')
plt.show()