In [1]:
import pandas as pd
import logging

# Log fayl manzili
log_path = r"C:\Users\Rasulbek907\Desktop\Project_8\Log\training.log"

# Log sozlamalari
logging.basicConfig(
    filename=log_path,
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_8\Data\Preprocessing\Preprosessed.csv")
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    logging.error(f"CSV faylni o'qishda xatolik: {e}")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,dateCrawled,name,price,vehicleType,yearOfRegistration,powerPS,model,kilometer,...,seller_gewerblich,seller_privat,offerType_Angebot,offerType_Gesuch,abtest_control,abtest_test,gearbox_automatik,gearbox_manuell,notRepairedDamage_ja,notRepairedDamage_nein
0,0,0.0,0.586679,0.339173,2.235174e-07,0.857143,0.110346,0.0,0.472,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,1,3e-06,0.58572,0.017608,8.521602e-06,0.428571,0.112346,0.0095,0.472,0.827586,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,2,5e-06,0.272956,0.387805,4.563481e-06,1.0,0.111568,0.00815,0.476,0.827586,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,3,8e-06,0.378625,0.327975,6.984919e-07,0.571429,0.111235,0.00375,0.472,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,4,1.1e-05,0.824117,0.730069,1.676381e-06,0.571429,0.112012,0.00345,0.412,0.586207,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              371528 non-null  int64  
 1   index                   371528 non-null  float64
 2   dateCrawled             371528 non-null  float64
 3   name                    371528 non-null  float64
 4   price                   371528 non-null  float64
 5   vehicleType             371528 non-null  float64
 6   yearOfRegistration      371528 non-null  float64
 7   powerPS                 371528 non-null  float64
 8   model                   371528 non-null  float64
 9   kilometer               371528 non-null  float64
 10  monthOfRegistration     371528 non-null  float64
 11  fuelType                371528 non-null  float64
 12  brand                   371528 non-null  float64
 13  dateCreated             371528 non-null  float64
 14  nrOfPictures        

# TRAINING 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Regression models
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# from sklearn.svm import SVR
import xgboost as xgb 

# --------------------------
# 1️⃣ X va y ajratish
# --------------------------
target_col = 'kilometer'
X = df.drop(target_col, axis=1)
y = df[target_col]

# Train / Validation / Test (60/20/20)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# --------------------------
# 2️⃣ Modellarni yaratish
# --------------------------
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    # "SVR": SVR(),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# --------------------------
# 3️⃣ Model trening va baholash
# --------------------------
results = []

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    # Predict
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)
    
    # Baholash
    r2_val = r2_score(y_val, y_pred_val)
    mse_val = mean_squared_error(y_val, y_pred_val)
    r2_test = r2_score(y_test, y_pred_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
     
    results.append({
        "Model": name,
        "R2_Validation": r2_val,
        "MSE_Validation": mse_val,
        "R2_Test": r2_test,
        "MSE_Test": mse_test
    })

# --------------------------
# 4️⃣ Natijalarni jadval shaklida chiqarish
# --------------------------
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="R2_Validation", ascending=False))

               Model  R2_Validation  MSE_Validation   R2_Test  MSE_Test
0      Random Forest       0.551346        0.034233  0.543290  0.034657
4            XGBoost       0.541080        0.035016  0.530146  0.035655
1      Decision Tree       0.069070        0.071032  0.080864  0.069748
2  Linear Regression       0.036597        0.073509  0.035963  0.073155
3      KNN Regressor      -0.201831        0.091702 -0.201901  0.091205


In [9]:
import pandas as pd

# Birinchi natijalar (masalan diskret target)
df1 = pd.DataFrame({
    "Model": ["XGBoost", "Random Forest", "KNN Regressor", "Linear Regression", "Decision Tree"],
    "R2_Validation": [0.513549, 0.503607, 0.414279, 0.350926, 0.011635],
    "MSE_Validation": [5.404332e+06, 5.514777e+06, 6.507188e+06, 7.211020e+06, 1.098045e+07],
    "R2_Test": [0.514259, 0.505278, 0.415860, 0.353131, 0.015696],
    "MSE_Test": [5.386329e+06, 5.485914e+06, 6.477468e+06, 7.173060e+06, 1.091484e+07]
})

# Ikkinchi natijalar (masalan continuous target)
df2 = pd.DataFrame({
    "Model": ["Random Forest", "XGBoost", "Decision Tree", "Linear Regression", "KNN Regressor"],
    "R2_Validation": [0.551346, 0.541080, 0.069070, 0.036597, -0.201831],
    "MSE_Validation": [0.034233, 0.035016, 0.071032, 0.073509, 0.091702],
    "R2_Test": [0.543290, 0.530146, 0.080864, 0.035963, -0.201901],
    "MSE_Test": [0.034657, 0.035655, 0.069748, 0.073155, 0.091205]
})

# Har bir jadvalga izoh qo'shish
df1["Target_Type"] = "Continuous"
df2["Target_Type"] = "Diskret"

# Ikkalasini birlashtirish
comparison_df = pd.concat([df1, df2], ignore_index=True)

# Model bo‘yicha tartiblash
comparison_df = comparison_df.sort_values(by=["Target_Type", "R2_Validation"], ascending=[True, False])

# Natijani ko‘rsatish
print(comparison_df)


               Model  R2_Validation  MSE_Validation   R2_Test      MSE_Test  \
0            XGBoost       0.513549    5.404332e+06  0.514259  5.386329e+06   
1      Random Forest       0.503607    5.514777e+06  0.505278  5.485914e+06   
2      KNN Regressor       0.414279    6.507188e+06  0.415860  6.477468e+06   
3  Linear Regression       0.350926    7.211020e+06  0.353131  7.173060e+06   
4      Decision Tree       0.011635    1.098045e+07  0.015696  1.091484e+07   
5      Random Forest       0.551346    3.423300e-02  0.543290  3.465700e-02   
6            XGBoost       0.541080    3.501600e-02  0.530146  3.565500e-02   
7      Decision Tree       0.069070    7.103200e-02  0.080864  6.974800e-02   
8  Linear Regression       0.036597    7.350900e-02  0.035963  7.315500e-02   
9      KNN Regressor      -0.201831    9.170200e-02 -0.201901  9.120500e-02   

  Target_Type  
0  Continuous  
1  Continuous  
2  Continuous  
3  Continuous  
4  Continuous  
5     Diskret  
6     Diskret  
7 

In [10]:
import pandas as pd

# Misol uchun oldingi comparison_df DataFrame
# comparison_df tayyor deb olamiz

# R2_Validation va R2_Test uchun eng katta → yashil, eng kichik → qizil
# MSE_Validation va MSE_Test uchun eng kichik → yashil, eng katta → qizil

def highlight_best_worst(df):
    # R2 ustunlari (katta yaxshi)
    r2_cols = ['R2_Validation', 'R2_Test']
    # MSE ustunlari (kichik yaxshi)
    mse_cols = ['MSE_Validation', 'MSE_Test']

    def style_r2(val, col):
        if val == df[col].max():
            return 'background-color: lightgreen'
        elif val == df[col].min():
            return 'background-color: lightcoral'
        else:
            return ''

    def style_mse(val, col):
        if val == df[col].min():
            return 'background-color: lightgreen'
        elif val == df[col].max():
            return 'background-color: lightcoral'
        else:
            return ''

    # Har bir ustun uchun style qo‘llash
    styled = df.style
    for col in r2_cols:
        styled = styled.applymap(lambda val: style_r2(val, col), subset=[col])
    for col in mse_cols:
        styled = styled.applymap(lambda val: style_mse(val, col), subset=[col])

    return styled

# Jadvalni rang bilan ko‘rsatish
highlight_best_worst(comparison_df)


  styled = styled.applymap(lambda val: style_r2(val, col), subset=[col])
  styled = styled.applymap(lambda val: style_mse(val, col), subset=[col])


Unnamed: 0,Model,R2_Validation,MSE_Validation,R2_Test,MSE_Test,Target_Type
0,XGBoost,0.513549,5404332.0,0.514259,5386329.0,Continuous
1,Random Forest,0.503607,5514777.0,0.505278,5485914.0,Continuous
2,KNN Regressor,0.414279,6507188.0,0.41586,6477468.0,Continuous
3,Linear Regression,0.350926,7211020.0,0.353131,7173060.0,Continuous
4,Decision Tree,0.011635,10980450.0,0.015696,10914840.0,Continuous
5,Random Forest,0.551346,0.034233,0.54329,0.034657,Diskret
6,XGBoost,0.54108,0.035016,0.530146,0.035655,Diskret
7,Decision Tree,0.06907,0.071032,0.080864,0.069748,Diskret
8,Linear Regression,0.036597,0.073509,0.035963,0.073155,Diskret
9,KNN Regressor,-0.201831,0.091702,-0.201901,0.091205,Diskret
