In [None]:
import pandas as pd
import logging

# Location of the training log file.
log_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Log\training.log"
# Location of the feature-selected dataset read below.
data_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Feature_selection\Filtered_Features.csv"

# Logging setup. The file handler is built explicitly so that its encoding is
# pinned to UTF-8: one of the messages below contains a non-ASCII character,
# and the platform default encoding may not be able to represent it.
logging.basicConfig(
    handlers=[logging.FileHandler(log_path, mode='a', encoding='utf-8')],  # append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(data_path)
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    logging.error(f"CSV faylni o'qishda xatolik: {e}")
    # Re-raise after logging: without `df` every later cell would fail with an
    # unrelated NameError, hiding the real cause of the failure.
    raise

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

# SKEWNESSNI ANIQLASH

In [None]:
# Only numeric columns take part in the skewness computation.
numeric_columns = df.select_dtypes(include='number')
skewness = numeric_columns.skew()

# Show the per-column result.
print("📊 Skewness (egiklik) qiymatlari:\n")
print(skewness)


# Log transform faqat musbat qiymatlar uchun 

In [None]:
import numpy as np

# Store log(1 + life_span) as a new column next to the original one.
life_span_values = df['life_span']
df['life_span_log'] = np.log1p(life_span_values)

# Box-Cox transform

In [None]:
from scipy.stats import boxcox

# Shift life_span by 1 before transforming; boxcox returns the transformed
# values together with the lambda it fitted.
shifted_life_span = df['life_span'] + 1
boxcox_values, fitted_lambda = boxcox(shifted_life_span)
df['life_span_boxcox'] = boxcox_values
print(f"Optimal lambda for Box-Cox: {fitted_lambda}")


In [None]:
from scipy.stats import boxcox
import numpy as np

# Columns to transform (example selection).
skewed_cols = ['life_span', 'birth_year', 'death_year', 'alma_mater', 'children']

for col in skewed_cols:
    column = df[col]
    min_val = column.min()
    # When the minimum is zero or negative, shift the whole column so that its
    # smallest value becomes 1; otherwise transform the column as it is.
    positive = (column + abs(min_val) + 1) if min_val <= 0 else column
    # boxcox returns (transformed values, fitted lambda); only the values are kept.
    df[f'{col}_boxcox'] = boxcox(positive)[0]


# Models Training Process 

# RandomForestRegressor TRAINING 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Columns computed directly from the target earlier in this notebook. Leaving
# them in X lets the model read the answer off its inputs (target leakage),
# which inflates every validation score.
target_derived_cols = ['life_span_log', 'life_span_boxcox']
# NOTE(review): 'life_span_cluster' (if present in the CSV) looks target-derived
# as well, and birth/death year columns may jointly determine life_span —
# confirm and exclude them too if so.

# Split features and target. errors='ignore' keeps the cell runnable when the
# transform cells above were skipped; a missing target still fails on the next line.
X = df.drop(columns=['life_span'] + target_derived_cols, errors='ignore')
y = df['life_span']

# Train / validation / test = 60 / 20 / 20:
# 20% is held out as test, then 25% of the remaining 80% becomes validation.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the validation split only; the test split is not used in this cell.
y_pred = rf.predict(X_val)
print("📊 Random Forest Validation R2:", r2_score(y_val, y_pred))
print("📉 Validation MSE:", mean_squared_error(y_val, y_pred))


# Save RandomForestRegressor Model 

In [None]:
import joblib

# Persist the fitted random forest to disk.
rf_model_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Models\Simple_Models\RandomForestRegressor.joblib"
joblib.dump(rf, rf_model_path)
print("✅ Model muvaffaqiyatli saqlandi.")

# DecisionTreeRegressor TRAINING 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the same training split used for the random forest.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Score it on the validation split.
y_pred_dt = dt.predict(X_val)
dt_val_r2 = r2_score(y_val, y_pred_dt)
dt_val_mse = mean_squared_error(y_val, y_pred_dt)
print("📊 Decision Tree Validation R2:", dt_val_r2)
print("📉 Validation MSE:", dt_val_mse)

# Save DecisionTreeRegressor Model 

In [None]:
import joblib

# Persist the fitted decision tree to disk.
dt_model_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Models\Simple_Models\DecisionTreeRegressor.joblib"
joblib.dump(dt, dt_model_path)
print("✅ Model muvaffaqiyatli saqlandi.")

# Jami: Natijalarni Taqqoslash 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.graph_objects as go

# 1. Features and target.
# Columns computed directly from the target earlier in this notebook must not
# be used as features: they leak the answer and inflate every score below.
target_derived_cols = ['life_span_log', 'life_span_boxcox']
# NOTE(review): 'life_span_cluster' (if present in the CSV) looks target-derived
# as well — confirm and exclude it too if so.
X = df.drop(columns=['life_span'] + target_derived_cols, errors='ignore')
y = df['life_span']

# 2. Train / validation / test split (60 / 20 / 20):
# 20% is held out as test, then 25% of the remaining 80% becomes validation.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# 3. Candidate models, all seeded for reproducibility where supported.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Fit each model on the training split and score it on the validation split.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    r2 = r2_score(y_val, preds)
    mse = mean_squared_error(y_val, preds)
    results.append({'Model': name, 'R2 Score': r2, 'MSE': mse})

# 4. Collect the results into a DataFrame (one row per model).
results_df = pd.DataFrame(results)

# 5. Row of the best validation R2. results_df has a default integer index,
# so the label returned by idxmax doubles as a list position below.
best_model_index = results_df['R2 Score'].idxmax()

# 6. Render the table, highlighting the best model's row in crimson.
colors = ['lightgray'] * len(results_df)
colors[best_model_index] = 'crimson'

fig = go.Figure(data=[go.Table(
    header=dict(values=list(results_df.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[results_df[col] for col in results_df.columns],
               # One colour list per column, so the highlight spans the whole row
               # regardless of how many metric columns the table has.
               fill_color=[colors] * len(results_df.columns),
               align='left'))
])

fig.update_layout(title='Model Natijalari: R2 Score va MSE')
fig.show()

# Cross-Validation Tekshirish Natijalari

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Unfitted estimator; the cross-validation helpers below fit it per fold.
dt = DecisionTreeRegressor(random_state=42)

# Per-fold R2 from 5-fold cross-validation, followed by the mean across folds.
r2_scores = cross_val_score(dt, X, y, scoring='r2', cv=5)
r2_mean = np.mean(r2_scores)
print("📊 R2 (5-fold):", r2_scores)
print("📈 R2 O'rtacha:", r2_mean)

# Out-of-fold predictions for every row, scored with a single pooled MSE.
y_pred_cv = cross_val_predict(dt, X, y, cv=5)
mse = mean_squared_error(y, y_pred_cv)
print("📉 Cross-Validated MSE:", mse)

# SHAP Value Bo‘yicha Tahliliy Xulosalar

In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

# Fit a standalone tree on all of X and y (separate from the cross-validation run).
dt_model = DecisionTreeRegressor(random_state=42).fit(X, y)

# Build the SHAP explainer with X as background data, then explain every row of X.
explainer = shap.Explainer(dt_model, X)
shap_values = explainer(X)

# Summary plot of the resulting SHAP values (highlights the most influential features).
shap.summary_plot(shap_values, X)

### ✅ XULOSA

🔍 Yuqoridagi tahliliy natijalardan kelib chiqib, quyidagi xulosani chiqarishimiz mumkin:

📈 **Modelimiz samaradorligini oshirishda aynan o‘zimiz tomonidan yaratilgan quyidagi ustunlar muhim rol o‘ynamoqda**:

- `life_span_boxcox` ⚙️  
- `life_span_log` 📊  
- `life_span_cluster` 🔗  
- `occupation_cluster` 🧠  
- `birth_year_boxcox` 🕰️  
- `death_year_boxcox` ⚰️  

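🚀 Ushbu ustunlar **modelimizning eng muhim TOP-10 xususiyatlari** qatoriga kirgan va ularning mavjudligi **aniqlikni sezilarli darajada oshirgan**.

⚠️ **Eslatma:** `life_span_log` va `life_span_boxcox` ustunlari ushbu daftarda bevosita maqsadli o‘zgaruvchi `life_span` dan hisoblangan. Shu sababli ularning yuqori ahamiyati haqiqiy bashorat kuchini emas, balki maqsadli o‘zgaruvchining sizib chiqishini (target leakage) aks ettirishi mumkin.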

🏅 Shuningdek, `awards` ustuni ham **ahamiyatli omillardan biri** sifatida ajralib turibdi.

---

🎯 **Xulosa qilib aytganda**, **to‘g‘ri xususiyatlar muhandisligi (feature engineering)** model samaradorligining asosiy omillaridan biri bo‘lib xizmat qilmoqda. Bu — muvaffaqiyat sari muhim qadam! 💡📊
