In [None]:
import logging
from pathlib import Path

import pandas as pd
# Log file that logging.basicConfig is pointed at in the next cell.
# NOTE(review): absolute, machine-specific Windows path; the notebook will not
# run unchanged on another machine.
log_file=r"C:\Users\Rasulbek907\Desktop\Final_Project\Log\error_analysis.log"

In [None]:
# Make sure the log directory exists before configuring logging: the file
# handler that basicConfig creates for `filename` opens the file right away
# and fails with FileNotFoundError when the parent directory is missing.
Path(log_file).parent.mkdir(parents=True, exist_ok=True)

# Append INFO-and-above records to the log file as "time-level-message".
logging.basicConfig(
    filename=log_file,
    filemode='a',
    level=logging.INFO,
    format="%(asctime)s-%(levelname)s-%(message)s"
)
logging.info('Analysis started!')

In [None]:
import os
import sys
# Make the project's Source directory importable (data_loader is imported
# from it in the next cell). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
_source_dir = r"C:\Users\Rasulbek907\Desktop\Final_Project\Source"
if _source_dir not in sys.path:
    sys.path.append(_source_dir)

In [None]:
from data_loader import DataLoader

In [None]:
# Directory handed to the project's DataLoader.
# NOTE(review): absolute, machine-specific path; presumably this is where the
# feature-selected datasets live — confirm against DataLoader.
path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Feature_selection"
loader = DataLoader(path)

In [None]:

# Load the data through the project loader.
# NOTE(review): the cells below call df.info() and df.select_dtypes(), so
# load_datasets() is assumed to return a single pandas DataFrame — confirm.
df = loader.load_datasets()

In [None]:

# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() only added a stray "None".
df.info()

In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# Feature columns grouped by dtype: numeric vs. everything else.
# The target 'life_span' is filtered out of BOTH lists by name instead of
# being dropped from the numeric index, so this cell no longer raises a
# KeyError when the target is absent or was loaded with a non-numeric dtype,
# and the target can never end up among the categorical features.
num_col = [c for c in df.select_dtypes(include=[np.number]).columns if c != 'life_span']
cat_col = [c for c in df.select_dtypes(exclude=[np.number]).columns if c != 'life_span']

In [None]:
# Cast the non-numeric columns to str (presumably so the ordinal encoder sees
# one uniform type per column), but keep missing entries missing.
# A plain astype(str) can turn NaN into the literal string 'nan', which would
# leave the 'most_frequent' imputer of the categorical pipeline with nothing
# to impute; .where(notna) restores NaN for those rows.
for col in cat_col:
    df[col] = df[col].astype(str).where(df[col].notna())

In [None]:
# Fail fast when the target column is missing. Only printing a warning left
# X and y undefined, so the notebook crashed later with an unrelated
# NameError; raising here stops the run at the real cause.
if 'life_span' not in df.columns:
    logging.error("Column 'life_span' not found in the loaded data")
    raise KeyError(
        "Column 'life_span' not found. Available columns are: "
        f"{df.columns.tolist()}"
    )

# Features are every column except the target; the target is 'life_span'.
X = df.drop(columns='life_span')
y = df['life_span']

In [None]:
# Numeric branch: fill missing values with the column mean, then rescale
# with MinMaxScaler (default settings).
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

In [None]:

# Display the numeric pipeline (bare last expression -> notebook repr).
num_pipeline

In [None]:
# Categorical branch: fill missing values with the most frequent category,
# then map categories to ordinal codes. Categories not seen during fit are
# encoded as -1 instead of raising.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'encoder',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        ),
    ]
)

In [None]:
# Display the categorical pipeline (bare last expression -> notebook repr).
cat_pipeline

In [None]:
# Route each column group through its own branch: numeric columns through
# num_pipeline, non-numeric columns through cat_pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_col),
        ('cat', cat_pipeline, cat_col),
    ]
)

In [None]:

# Display the column transformer (bare last expression -> notebook repr).
preprocessor

In [None]:
# End-to-end estimator: preprocessing followed by a random-forest regressor.
# random_state is fixed at 42 so repeated fits are reproducible.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

In [None]:

# Display the full pipeline (bare last expression -> notebook repr).
pipeline

In [None]:
print("NaN soni y_train ichida:", y_train.isna().sum())

In [None]:
# NaN bo‘lgan qatorlarni olib tashlash
train_data = pd.concat([x_train, y_train], axis=1)
train_data.dropna(subset=[y_train.name], inplace=True)

In [None]:
# Yangi tozalangan versiyalar
x_train = train_data.drop(columns=[y_train.name])
y_train = train_data[y_train.name]

In [None]:
train_data = pd.concat([x_train, y_train], axis=1)
train_data.dropna(subset=[y_train.name if y_train.name else 'target'], inplace=True)

x_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

In [None]:

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

In [None]:
# Fit the preprocessing steps and the random-forest model on the training
# split. As the last expression, the fitted pipeline is also displayed.
pipeline.fit(x_train, y_train)

In [None]:
# Count the test rows whose target is missing.
print("NaN soni y_test ichida:", y_test.isna().sum())

In [None]:
# Drop only the test rows whose target is NaN. The same mask is applied to
# features and target so they stay aligned.
# (A preceding cell printed `test_data.shape`, but `test_data` is never
# defined anywhere in the notebook, so it raised NameError; the shape is
# reported below from the cleaned frame instead.)
mask = y_test.notna()

x_test_clean = x_test[mask]
y_test_clean = y_test[mask]

print("Tozalangandan keyin test_data hajmi:", x_test_clean.shape)

In [None]:
# Predict on the cleaned test rows.
y_pred = pipeline.predict(x_test_clean)

# Evaluate the predictions against the true targets:
# coefficient of determination and mean absolute error.
r2 = r2_score(y_true=y_test_clean, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test_clean, y_pred=y_pred)

print("✅ R2 score:", r2)
print("✅ MAE:", mae)

In [None]:
# Same two metrics again, rounded to two decimals.
print(f"r2 is {r2:.2f}")
print(f"mae is  {mae:.2f}")

# Error Analysis

In [None]:
import numpy as np

# Signed residuals (actual - predicted) and their magnitudes.
errors = y_test_clean - y_pred
abs_errors = errors.abs()

# Collect actuals, predictions and both error measures in one table;
# the row index is the one carried by y_test_clean.
errors_df = pd.DataFrame(
    {
        'Actual': y_test_clean,
        'Predicted': y_pred,
        'Error': errors,
        'Absolute Error': abs_errors,
    }
)

print(errors_df.head())

In [None]:
# Give both frames a fresh 0..n-1 index (old index discarded) so they can be
# joined side by side by row position in the next cell.
errors_df = errors_df.reset_index(drop=True)
x_test_reset = x_test_clean.reset_index(drop=True)

In [None]:
# Column-wise join: test features followed by the actual/predicted/error columns.
full_errors_df = pd.concat([x_test_reset, errors_df], axis=1)

In [None]:
# Preview the first rows of the combined features + errors table.
print(full_errors_df.head())

In [None]:

# List the columns available in the combined table.
print(full_errors_df.columns)

In [None]:
# Per-row error view keyed by the 'name' feature.
# NOTE(review): assumes the loaded features include a column called 'name';
# this raises KeyError otherwise — confirm against the dataset.
error_cols = full_errors_df[['name', 'Error', 'Absolute Error']]
print(error_cols.head(10))

# Worst Predictions

In [None]:
# Order the rows by absolute error, largest first, and keep the first ten.
worst_predictions = (
    full_errors_df
    .sort_values('Absolute Error', ascending=False)
    .iloc[:10]
)

print("🔎 Top 10 Worst Predictions:")
# Show only the target/prediction/error columns of those rows.
display(worst_predictions.loc[:, ['Actual', 'Predicted', 'Error', 'Absolute Error']])

# Error Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
# Histogram of the signed errors (30 bins) with a KDE curve overlaid,
# drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(errors, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Prediction Errors")
ax.set_xlabel("Error (Actual - Predicted)")
ax.set_ylabel("Frequency")
plt.show()

# Actual vs Predicted

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of actual vs. predicted values on an explicit, square Axes.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test_clean, y=y_pred, ax=ax)

# Ideal line: the y = x diagonal spanning the range of the actual values.
lowest = y_test_clean.min()
highest = y_test_clean.max()
ax.plot([lowest, highest], [lowest, highest], 'r--')

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs Predicted Scatterplot")
plt.show()