In [1]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn import model_selection

In [2]:
# Excel dosyasını oku
df = pd.read_excel("machineLearning_excel.xlsx")

In [3]:
# "Unnamed: 0" sütununu sil, hata olursa görmezden gel
df.drop("Unnamed: 0", axis=1, inplace=True, errors='ignore')

In [4]:
categorical_columns = ['Name', 'Dealer_Name', 'Review_Count']

In [5]:
for col in categorical_columns:
    df[col] = df[col].astype('category').cat.codes


In [6]:
# DataFrame'i kopyala
df_2 = df.copy()
print(df_2.columns)

Index(['Name', 'Mileage', 'Dealer_Name', 'Rating', 'Review_Count', 'Price'], dtype='object')


In [7]:
X = df_2.drop(['Price'], axis = 1)
y = df_2["Price"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 144)

In [9]:
xgb = XGBRegressor(enable_categorical=True)

In [10]:
params = {"colsample_bytree":[0.4,0.5,0.6],
         "learning_rate":[0.01,0.02,0.09],
         "max_depth":[2,3,4,5,6],
         "n_estimators":[100,200,500,2000]}

In [11]:
grid = GridSearchCV(xgb, params, cv = 10, n_jobs = -1, verbose = 2)

In [12]:
grid.fit(X_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


In [13]:
grid.best_params_

{'colsample_bytree': 0.6,
 'learning_rate': 0.09,
 'max_depth': 2,
 'n_estimators': 2000}

In [14]:
#En uygun parametreler girilyor
xgb1 = XGBRegressor(colsample_bytree = 0.5, learning_rate = 0.09, max_depth = 4, n_estimators = 2000)

In [15]:
model_xgb = xgb1.fit(X_train, y_train) # Modelimizi eğitiyoruz

In [16]:
model_xgb.predict(X_test)[15:20]

array([77778.67 , 38070.355, 42841.89 , 71888.85 , 38515.785],
      dtype=float32)

In [17]:
y_test[15:20] # Tahmin edilen ve gerçek verileri karşılaştırabiliriz

889    69662
502    56998
844    47995
238    46977
231    38485
Name: Price, dtype: int64

In [18]:
model_xgb.score(X_test, y_test) # Modelin scorunu hesapladık

0.6178825165427566

In [19]:
model_xgb.score(X_train, y_train)

0.9993310275148605

In [20]:
# Doğrulanmış hata oranımız buluyoruz (valide edilmiş)
np.sqrt(-1*(cross_val_score(model_xgb, X_test, y_test, cv=10, scoring = "neg_mean_squared_error"))).mean()

33437.494189415636

In [21]:
# Modelde parametrelerin önemini görüyoruz
importance = pd.DataFrame({"Importance" : model_xgb.feature_importances_},
                         index = X_train.columns)

In [22]:
importance

Unnamed: 0,Importance
Name,0.472392
Mileage,0.23411
Dealer_Name,0.089843
Rating,0.093169
Review_Count,0.110486
