In [1]:
import pandas as pd
import logging

# Location of the log file used by this notebook.
log_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Log\feature_selection.log"

# Logging setup: append to the file above, record INFO and more severe messages.
logging.basicConfig(
    filename=log_path,
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Enginered_Data\feature_enginering.csv")
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    # Record the failure (with traceback) and re-raise: every later cell needs
    # `df`, so swallowing the error here would only surface as a confusing
    # NameError further down the notebook.
    logging.exception(f"CSV faylni o'qishda xatolik: {e}")
    raise

In [None]:
import pandas as pd
import plotly.express as px

# Select the numeric columns (float64 / int64).
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlation over the numeric columns only.
corr_matrix = df[numeric_cols].corr()

# Correlation of every other numeric column with life_span, strongest first.
# The life_span row itself is removed: its self-correlation is always 1.0 and
# the chart is titled as a comparison against the *other* columns.
life_span_corr = (
    corr_matrix[['life_span']]
    .drop(index='life_span')
    .sort_values(by='life_span', ascending=False)
)

# Reshape into a two-column frame (feature name, correlation) for plotting.
heatmap_data = life_span_corr.reset_index()
heatmap_data.columns = ['Feature', 'Correlation_with_life_span']

# Bar chart with Plotly Express.
fig = px.bar(
    heatmap_data,
    x='Feature',
    y='Correlation_with_life_span',
    title='life_span bilan boshqa raqamli ustunlar o\'rtasidagi korrelyatsiya',
    labels={'Correlation_with_life_span': 'Korrelyatsiya qiymati', 'Feature': 'Xususiyatlar'}
)

fig.show()


In [None]:
import numpy as np
import plotly.figure_factory as ff

# Cell annotations: the correlation values rounded to two decimals, as text.
annot_text = np.round(corr_matrix.to_numpy(), 2).astype(str)

# Annotated heatmap of the full correlation matrix.
heatmap_kwargs = dict(
    z=corr_matrix.to_numpy(),
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    annotation_text=annot_text,
    colorscale='Viridis',
    showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_kwargs)

fig.update_layout(title='Raqamli ustunlar o\'rtasidagi korrelyatsiya matritsasi')
fig.show()

# Feature Selection Filtering 

In [None]:
import seaborn as sns               
import matplotlib.pyplot as plt     


# Absolute pairwise correlations of all columns (computed once; the original
# cell recomputed the identical matrix a second time before printing).
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle so every pair is looked at once and the
# diagonal (self-correlation) is ignored.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with some earlier column exceeds 0.8.
to_drop = [col for col in upper.columns if (upper[col] > 0.8).any()]

print(corr_matrix)


In [None]:

import numpy as np   

# Absolute pairwise correlations of all columns.
corr_matrix = df.corr().abs()

# Strict upper triangle: each pair once, diagonal excluded.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns correlated above 0.95 with some earlier column. life_span is the
# modelling target used later in this notebook, so it is never dropped even
# when it is highly correlated with a feature.
to_drop = [
    col
    for col in upper.columns
    if col != 'life_span' and (upper[col] > 0.95).any()
]

df_filtered = df.drop(columns=to_drop)
print("\nOriginal shape:", df.shape)

print("Shape after dropping correlated features:", df_filtered.shape)


In [None]:
# Print a summary (columns, dtypes, non-null counts) of the correlation-filtered frame.
df_filtered.info()

In [None]:
# Collect every feature pair from the upper-triangle matrix whose absolute
# correlation exceeds 0.95, as [row feature, column feature, rounded value].
# Missing cells (lower triangle / diagonal) are skipped with pd.notna: an
# identity test against np.nan does not reliably detect NaN values read from
# a DataFrame.
high_corr_pairs = [
    [row, col, round(value, 2)]
    for col in upper.columns
    for row, value in upper[col].items()
    if pd.notna(value) and value > 0.95
]

corr_table = pd.DataFrame(high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])

print("Highly correlated feature pairs (correlation > 0.95):")
print(corr_table)

In [None]:
import plotly.express as px  

# Long-form copy of the correlation matrix: one row per (feature, feature) pair.
corr_long = (
    corr_matrix
    .reset_index()
    .melt(id_vars='index')
    .set_axis(['Feature 1', 'Feature 2', 'Correlation'], axis=1)
)

# Interactive heatmap of the matrix with two-decimal cell labels.
imshow_options = dict(
    text_auto='.2f',
    aspect="auto",
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
    title="Correlation Matrix Heatmap (Interactive)",
)
fig = px.imshow(corr_matrix, **imshow_options)

# Fixed canvas size and axis captions.
fig.update_layout(width=900, height=800, xaxis_title="Features", yaxis_title="Features")

fig.show()

# Low Variance

In [None]:
# Low Variance
from sklearn.feature_selection import VarianceThreshold  
# Fit a variance filter on the correlation-filtered frame.
threshold = 0.01
selector = VarianceThreshold(threshold=threshold)
selector.fit(df_filtered)

# get_support() marks the columns the selector keeps; invert it to get the rejected ones.
kept_mask = selector.get_support()
low_variance_features = df_filtered.columns[~kept_mask]
df_low_variance_filtered = df_filtered.drop(columns=low_variance_features)

print("\nOriginal shape:", df_filtered.shape)
print("Shape after dropping low-variance features:", df_low_variance_filtered.shape)

In [None]:
# Identify the low-variance columns rejected by the fitted selector.
rejected_mask = ~selector.get_support()
low_variance_features = df_filtered.columns[rejected_mask]
print("Features with low variance (to drop):", low_variance_features.tolist())

In [None]:
# Per-column variance of every numeric column in the original frame.
threshold = 0.01
numeric_cols = df.select_dtypes(include=[np.number]).columns
variances = df[numeric_cols].var()

# Names of the columns whose variance falls below the threshold.
low_variance_features = variances.index[(variances < threshold).to_numpy()].tolist()

# Plot table: feature name, variance, and a Yes/No low-variance flag.
is_low = variances.index.isin(low_variance_features)
var_df = pd.DataFrame({
    'Feature': variances.index,
    'Variance': variances.values,
    'LowVariance': np.where(is_low, 'Yes', 'No'),
})

# Bar chart of the variances, low-variance columns drawn in red.
bar_options = dict(
    x='Feature',
    y='Variance',
    color='LowVariance',
    color_discrete_map={'Yes': 'red', 'No': 'blue'},
    text='Variance',
    title='Feature Variance (Low-Variance Highlighted in Red)',
)
fig = px.bar(var_df, **bar_options)

fig.update_layout(xaxis_tickangle=-45, width=1000, height=600)

fig.show()

In [None]:
# Print the summary of df_filtered again before it is written to disk.
df_filtered.info()

In [None]:
# Write the correlation-filtered frame to CSV.
# - The target folder is created first, matching the other save cells in this
#   notebook, so the write does not fail on a fresh machine.
# - index=False keeps the row index out of the file; otherwise it comes back
#   as an extra unnamed column when the CSV is read again.
import os

filtered_csv_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Feature_selection\Filtered_Features.csv"
os.makedirs(os.path.dirname(filtered_csv_path), exist_ok=True)
df_filtered.to_csv(filtered_csv_path, index=False)

# Feature Selection Wrapper 

 # RFE + Linear Regression

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Target series and predictor matrix (every column except the target).
y = df['life_span']
X = df.drop(columns=['life_span'])

In [None]:
# Plain linear regression serves as the base estimator for RFE,
# which is asked to keep 20 features.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=20).fit(X, y)

# Column names where the fitted selector's support mask is True.
selected_features = X.columns[rfe.get_support()]
print("Selected features:", selected_features)

# RFE + RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

# RFE with a random forest as the base estimator, keeping 20 features.
# The forest is seeded (same seed as the other seeded models in this notebook)
# so the selected feature set is the same on every run.
model = RandomForestRegressor(random_state=42)
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X, y)

# Column names where the fitted selector's support mask is True.
selected_features = X.columns[rfe.support_]
print("Selected features (RandomForest):", selected_features)


# RFE + Decision Tree Regressor 

In [None]:
from sklearn.tree import DecisionTreeRegressor

# RFE with a decision tree as the base estimator, keeping 20 features.
# The tree is seeded so the selected feature set is the same on every run.
model = DecisionTreeRegressor(random_state=42)
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X, y)

# Column names where the fitted selector's support mask is True.
selected_features = X.columns[rfe.support_]
print("Selected features (DecisionTree):", selected_features)


In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Base estimators to run RFE with, keyed by display name. The tree-based
# models are seeded so repeated runs select the same features.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42)
}

results = []

# Relies on X and y defined by an earlier cell.
for name, model in models.items():
    rfe = RFE(model, n_features_to_select=20)  # keep 20 columns, as an example
    rfe.fit(X, y)
    # Count the True entries of the boolean support mask.
    selected_count = int(rfe.support_.sum())
    results.append({'Model': name, 'Selected Features Count': selected_count})

# Show the results as a DataFrame.
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import os

# Target series and predictor matrix.
y = df['life_span']
X = df.drop(columns=['life_span'])

# RFE over a linear regression, keeping 20 features.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=20).fit(X, y)

selected_features = X.columns[rfe.get_support()]

# One-column table holding the names of the selected features.
selected_df = pd.DataFrame({'Selected Features': selected_features})

# Output folder; created when it does not exist yet.
save_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Feature_selection"
os.makedirs(save_path, exist_ok=True)

# Full output file name.
file_path = os.path.join(save_path, "Wrapper.csv")

# Write the table without the row index.
selected_df.to_csv(file_path, index=False)

print(f"Tanlangan ustunlar {file_path} ga saqlandi.")


# Feature Selection Embedded + Lasso CV

In [None]:
# Xususiyat tanlash uchun Linear Model'larda ishlatiladigan usul (LASSO regression)

# LassoCV (L1-regularizatsiya) ni import qilamiz
# CV - cross-validation, u model uchun optimal alpha (regulyarizatsiya kuchi) tanlab beradi
from sklearn.linear_model import LassoCV
# Target series and predictor matrix.
y = df['life_span']
X = df.drop(columns=['life_span'])

# Cross-validated Lasso (10 folds, fixed seed).
lasso = LassoCV(cv=10, random_state=42)
lasso.fit(X, y)

# Importance is taken as the absolute value of each coefficient;
# a feature counts as selected when its coefficient is non-zero.
importance = np.abs(lasso.coef_)
nonzero_mask = importance > 0
selected_features = X.columns[nonzero_mask]

print("Selected features using Lasso (non-zero coefficients):")
print(selected_features.tolist())

In [None]:
# Keep only the most important features: those whose importance is at or
# above the 90th percentile, i.e. the top 10% (the printed label previously
# said 25%, which did not match the threshold).
percentile_threshold = np.percentile(importance, 90)
top_features = X.columns[importance >= percentile_threshold]
print("\nTop 10% important features based on Lasso coefficients:")
print(top_features.tolist())

In [None]:
# Importance table, ordered from the most to the least important feature.
feat_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)

# Flag the rows at or above the 85th percentile of importance.
percentile_threshold = np.percentile(importance, 85)
in_top = feat_df['Importance'].ge(percentile_threshold)
feat_df['Top15%'] = in_top.map({True: 'Yes', False: 'No'})

print("Feature Importance Table:")
print(feat_df)

In [None]:
# Bar chart of the importance table; rows flagged 'Yes' in Top15% are drawn in red.
bar_options = dict(
    x='Feature',
    y='Importance',
    color='Top15%',
    color_discrete_map={'Yes': 'red', 'No': 'blue'},
    text='Importance',
    title='Lasso Feature Importance',
)
fig = px.bar(feat_df, **bar_options)

# Slanted tick labels and a fixed canvas size.
fig.update_layout(xaxis_tickangle=-45, width=1000, height=600)

fig.show()

# Feature Selection Embedded + Random Forest Regression 

In [None]:
# --- Tree based (Random Forest) Feature Importance ---

# Random Forest Regressorni import qilamiz
from sklearn.ensemble import RandomForestRegressor

# Seeded forest of 500 trees fitted on the current X / y.
rf = RandomForestRegressor(n_estimators=500, random_state=42)
rf.fit(X, y)

# Importance table, ordered from the most to the least important feature.
importances = rf.feature_importances_
feat_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Flag the rows at or above the 85th percentile of importance.
percentile_threshold = np.percentile(importances, 85)
in_top = feat_df['Importance'].ge(percentile_threshold)
feat_df['Top15%'] = in_top.map({True: 'Yes', False: 'No'})

print("Random Forest Feature Importance Table:")
print(feat_df)

In [None]:
# Plot the Random Forest feature importances with Plotly Express;
# rows flagged 'Yes' in Top15% are drawn in red.
bar_options = dict(
    x='Feature',
    y='Importance',
    color='Top15%',
    color_discrete_map={'Yes': 'red', 'No': 'blue'},
    text='Importance',
    title='Random Forest Feature Importance',
)
fig = px.bar(feat_df, **bar_options)

# Slanted tick labels and a fixed canvas size.
fig.update_layout(xaxis_tickangle=-45, width=1000, height=600)

fig.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
import os

X = df.drop('life_span', axis=1)
y = df['life_span']

# Cross-validated Lasso (10 folds, fixed seed).
lasso = LassoCV(cv=10, random_state=42).fit(X, y)

# Importance values: absolute value of each coefficient.
importance = np.abs(lasso.coef_)

# Features whose coefficient is non-zero.
selected_features = X.columns[importance > 0]

print("Selected features using Lasso (non-zero coefficients):")
print(selected_features.tolist())

# Top 10% of features: importance at or above the 90th percentile.
percentile_threshold = np.percentile(importance, 90)
top_features = X.columns[importance >= percentile_threshold]

print("\nTop 10% important features based on Lasso coefficients:")
print(top_features.tolist())

# Build the feature-importance table.
feat_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
})

# Order from the most to the least important feature.
feat_df = feat_df.sort_values(by='Importance', ascending=False)

# 85th-percentile cut-off for the Top15% flag. The original computed the 90th
# percentile here, which contradicted both the variable name and the column
# name (and the 85 used by the earlier importance-table cell).
percentile_threshold_85 = np.percentile(importance, 85)
feat_df['Top15%'] = np.where(feat_df['Importance'] >= percentile_threshold_85, 'Yes', 'No')

print("Feature Importance Table:")
print(feat_df)

# Output folder; created when it does not exist yet.
save_path = r"C:\Users\Rasulbek907\Desktop\Final_Project\Data\Feature_selection"
os.makedirs(save_path, exist_ok=True)

# Output file names.
selected_file = os.path.join(save_path, "Lasso_Selected_Features.csv")
importance_file = os.path.join(save_path, "Lasso_Feature_Importance.csv")

# Save the selected feature names.
pd.DataFrame(selected_features, columns=['Selected Features']).to_csv(selected_file, index=False)

# Save the importance table.
feat_df.to_csv(importance_file, index=False)

print(f"\nTanlangan ustunlar {selected_file} ga saqlandi.")
print(f"Feature importance jadvali {importance_file} ga saqlandi.")
