In [1]:
import pandas as pd
import logging

# Log fayl manzili
log_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Log\feature_selection.log"

# Log sozlamalari
logging.basicConfig(
    filename=log_path,
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Preprosessed\Preprosessed.csv")
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    logging.error(f"CSV faylni o'qishda xatolik: {e}")

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4993 entries, 0 to 4992
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             4993 non-null   int64  
 1   name_length            4993 non-null   float64
 2   is_organic             4993 non-null   float64
 3   is_sugar_free          4993 non-null   float64
 4   quantity_value         4993 non-null   float64
 5   quantity_unit_encoded  4993 non-null   float64
 6   category_depth         4993 non-null   float64
 7   country_count          4993 non-null   float64
 8   product_age_days       4993 non-null   float64
 9   created_year           4993 non-null   float64
 10  created_month          4993 non-null   float64
 11  main_category_encoded  4993 non-null   float64
 12  nova_group             4993 non-null   float64
dtypes: float64(12), int64(1)
memory usage: 507.2 KB


In [3]:
import pandas as pd
import plotly.express as px

# Raqamli ustunlarni tanlash
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# faqat raqamli ustunlar bo'yicha korrelyatsiya
corr_matrix = df[numeric_cols].corr()

# nova_group ustunining boshqa ustunlar bilan korrelyatsiyasi
nova_group_corr = corr_matrix[['nova_group']].sort_values(by='nova_group', ascending=False)

# Heatmap uchun DataFrame ni transformatsiya qilish
heatmap_data = nova_group_corr.reset_index()
heatmap_data.columns = ['Feature', 'Correlation_with_nova_group']

# Plotly Express yordamida bar grafik
fig = px.bar(
    heatmap_data,
    x='Feature',
    y='Correlation_with_nova_group',
    title='nova_group bilan boshqa raqamli ustunlar o\'rtasidagi korrelyatsiya',
    labels={'Correlation_with_nova_group': 'Korrelyatsiya qiymati', 'Feature': 'Xususiyatlar'}
)

fig.show()


In [4]:
import numpy as np
import plotly.figure_factory as ff

# Qiymatlarni 2 xonagacha qisqartirish
annot_text = np.around(corr_matrix.values, decimals=2).astype(str)

fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=annot_text,
    colorscale='Viridis',
    showscale=True
)

fig.update_layout(title='Raqamli ustunlar o\'rtasidagi korrelyatsiya matritsasi')
fig.show()

# Feature Selection Filtering 

In [7]:
import seaborn as sns               
import matplotlib.pyplot as plt     


corr_matrix = df.corr().abs()  

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  


to_drop = [col for col in upper.columns if any(upper[col] > 0.8)]  
corr_matrix = df.corr().abs() 
print(corr_matrix)  


                       Unnamed: 0  name_length  is_organic  is_sugar_free  \
Unnamed: 0               1.000000     0.054962    0.026883            NaN   
name_length              0.054962     1.000000    0.050684            NaN   
is_organic               0.026883     0.050684    1.000000            NaN   
is_sugar_free                 NaN          NaN         NaN            NaN   
quantity_value           0.009135     0.029352    0.030572            NaN   
quantity_unit_encoded    0.007060     0.035211    0.007833            NaN   
category_depth           0.006788     0.123338    0.036659            NaN   
country_count            0.184384     0.056088    0.019306            NaN   
product_age_days         0.075281     0.130750    0.001548            NaN   
created_year             0.058906     0.126628    0.005676            NaN   
created_month            0.015908     0.015911    0.018174            NaN   
main_category_encoded    0.014981     0.065067    0.022490            NaN   

In [13]:
import numpy as np   

# 1️⃣ Korrelyatsiya matritsasini hisoblash
corr_matrix = df.corr().abs()  

# 2️⃣ Yuqori uchburchak (upper triangle) ni olish — o‘zaro takrorlanishni oldini oladi
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  

# 3️⃣ 0.95 dan yuqori korrelyatsiyaga ega ustunlarni aniqlash
to_drop = [col for col in upper.columns if any(upper[col] > 0.95)]  

# 4️⃣ Ustunlarni olib tashlash
df_filtered = df.drop(columns=to_drop)  

# 5️⃣ Natijalarni chiqarish
print("\n🧩 Original shape:", df.shape)  
print("✅ Shape after dropping correlated features:", df_filtered.shape)  

# 6️⃣ O‘chirilgan ustunlar ro‘yxatini chiqarish
if to_drop:
    print("\n❌ Dropped columns due to high correlation (>0.95):")
    for col in to_drop:
        print(f" - {col}")
else:
    print("\n✅ No highly correlated columns found.")



🧩 Original shape: (4993, 13)
✅ Shape after dropping correlated features: (4993, 12)

❌ Dropped columns due to high correlation (>0.95):
 - created_year


In [14]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4993 entries, 0 to 4992
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             4993 non-null   int64  
 1   name_length            4993 non-null   float64
 2   is_organic             4993 non-null   float64
 3   is_sugar_free          4993 non-null   float64
 4   quantity_value         4993 non-null   float64
 5   quantity_unit_encoded  4993 non-null   float64
 6   category_depth         4993 non-null   float64
 7   country_count          4993 non-null   float64
 8   product_age_days       4993 non-null   float64
 9   created_month          4993 non-null   float64
 10  main_category_encoded  4993 non-null   float64
 11  nova_group             4993 non-null   float64
dtypes: float64(11), int64(1)
memory usage: 468.2 KB


In [15]:
# Yuqori korrelyatsiyaga ega juftliklarni saqlash uchun bo‘sh ro‘yxat yaratamiz
high_corr_pairs = []


for col in upper.columns:
    
    for row in upper.index:
        
        if upper.loc[row, col] is not np.nan and upper.loc[row, col] > 0.95:
            
            high_corr_pairs.append([row, col, round(upper.loc[row, col], 2)])


corr_table = pd.DataFrame(high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])

print("Highly correlated feature pairs (correlation > 0.95):")
print(corr_table)

Highly correlated feature pairs (correlation > 0.95):
          Feature 1     Feature 2  Correlation
0  product_age_days  created_year         0.98


In [16]:
import plotly.express as px  

corr_long = corr_matrix.reset_index().melt(id_vars='index')  

corr_long.columns = ['Feature 1', 'Feature 2', 'Correlation']  

fig = px.imshow(
    corr_matrix,                  
    text_auto='.2f',              
    aspect="auto",                
    color_continuous_scale='RdBu_r',  
    zmin=-1, zmax=1,             
    title="Correlation Matrix Heatmap (Interactive)"  
)


fig.update_layout(
    width=900,          
    height=800,          
    xaxis_title="Features",  
    yaxis_title="Features"   
)

fig.show()

# Low Variance

In [18]:
# 📉 Low Variance Feature Removal
from sklearn.feature_selection import VarianceThreshold  

# 1️⃣ Threshold qiymati (kerakli minimal varians)
threshold = 0.01  

# 2️⃣ VarianceThreshold obyektini yaratamiz
selector = VarianceThreshold(threshold=threshold)  

# 3️⃣ Modelni o‘qitamiz
selector.fit(df_filtered)  

# 4️⃣ Qaysi ustunlar olib tashlanganini aniqlaymiz
low_variance_features = df_filtered.columns[~selector.get_support()] 

# 5️⃣ Ushbu ustunlarni olib tashlaymiz
df_low_variance_filtered = df_filtered.drop(columns=low_variance_features)  

# 6️⃣ Natijalarni chiqaramiz
print("\n🧩 Original shape:", df_filtered.shape)  
print("✅ Shape after dropping low-variance features:", df_low_variance_filtered.shape)  

# 7️⃣ O‘chirilgan ustunlarni chiqarish
if len(low_variance_features) > 0:
    print("\n❌ Dropped low-variance features (variance < {:.2f}):".format(threshold))
    for col in low_variance_features:
        print(f" - {col}")
else:
    print("\n✅ No low-variance features found.")


🧩 Original shape: (4993, 12)
✅ Shape after dropping low-variance features: (4993, 11)

❌ Dropped low-variance features (variance < 0.01):
 - is_sugar_free


In [23]:
numeric_cols = df_low_variance_filtered.select_dtypes(include=[np.number]).columns
variances = df_low_variance_filtered[numeric_cols].var()
threshold = 0.01
low_variance_features = variances[variances < threshold].index.tolist()
var_df_low_variance_filtered = pd.DataFrame({
    'Feature': variances.index,
    'Variance': variances.values,
    'LowVariance': ['Yes' if f in low_variance_features else 'No' for f in variances.index]
})


fig = px.bar(
    var_df_low_variance_filtered,
    x='Feature',        
    y='Variance',       
    color='LowVariance',  
    color_discrete_map={'Yes': 'red', 'No': 'blue'},  
    text='Variance',    
    title='Feature Variance (Low-Variance Highlighted in Red)'  
)


fig.update_layout(
    xaxis_tickangle=-45,
    width=1000,
    height=600
)

fig.show()

In [22]:
df_low_variance_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4993 entries, 0 to 4992
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             4993 non-null   int64  
 1   name_length            4993 non-null   float64
 2   is_organic             4993 non-null   float64
 3   quantity_value         4993 non-null   float64
 4   quantity_unit_encoded  4993 non-null   float64
 5   category_depth         4993 non-null   float64
 6   country_count          4993 non-null   float64
 7   product_age_days       4993 non-null   float64
 8   created_month          4993 non-null   float64
 9   main_category_encoded  4993 non-null   float64
 10  nova_group             4993 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 429.2 KB


In [24]:
df_low_variance_filtered.to_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection\Filtered_Features.csv")

# Feature Selection Wrapper 

 # RFE + Linear Regression

In [30]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X = df.drop('nova_group', axis=1)
y = df['nova_group']

In [31]:
# Oddiy chiziqli regressiya modelini yaratamiz (bu model RFE uchun asos bo'ladi)
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X, y)
selected_features = X.columns[rfe.support_]
print("Selected features:", selected_features)

Selected features: Index(['Unnamed: 0', 'name_length', 'is_organic', 'is_sugar_free',
       'quantity_value', 'quantity_unit_encoded', 'category_depth',
       'country_count', 'product_age_days', 'created_year', 'created_month',
       'main_category_encoded'],
      dtype='object')



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.


lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



# RFE + RandomForestClassifier

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

model = RandomForestClassifier()
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X, y)

selected_features = X.columns[rfe.support_]
print("Selected features (RandomForest):", selected_features)



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.



Selected features (RandomForest): Index(['Unnamed: 0', 'name_length', 'is_organic', 'is_sugar_free',
       'quantity_value', 'quantity_unit_encoded', 'category_depth',
       'country_count', 'product_age_days', 'created_year', 'created_month',
       'main_category_encoded'],
      dtype='object')


# RFE + DecisionTreeClassifier

In [33]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
rfe = RFE(model, n_features_to_select=20)
rfe.fit(X, y)

selected_features = X.columns[rfe.support_]
print("Selected features (DecisionTree):", selected_features)


Selected features (DecisionTree): Index(['Unnamed: 0', 'name_length', 'is_organic', 'is_sugar_free',
       'quantity_value', 'quantity_unit_encoded', 'category_depth',
       'country_count', 'product_age_days', 'created_year', 'created_month',
       'main_category_encoded'],
      dtype='object')



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.



In [35]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR

# Model va ustun sonlarini saqlash uchun bo'sh ro'yxat
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier()    
}

results = []

# X va y aniq ekanligini faraz qilamiz
for name, model in models.items():
    rfe = RFE(model, n_features_to_select=20)  # 20 ta ustun tanlaymiz misol uchun
    rfe.fit(X, y)
    selected_count = sum(rfe.support_)
    results.append({'Model': name, 'Selected Features Count': selected_count})

# Natijalarni DataFrame ko'rinishida chiqaramiz
results_df = pd.DataFrame(results)
print(results_df)



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.


lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.



                 Model  Selected Features Count
0  Logistic Regression                       12
1        Random Forest                       12
2        Decision Tree                       12



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.



In [36]:
import os

save_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection"
os.makedirs(save_path, exist_ok=True)

for name, model in models.items():
    rfe = RFE(model, n_features_to_select=20)
    rfe.fit(X, y)

    selected_features = X.columns[rfe.support_]
    selected_df = X[selected_features]

    # 🔹 Har bir model uchun CSV saqlash
    file_path = os.path.join(save_path, f"{name}_RFE_Selected_Features.csv")
    selected_df.to_csv(file_path, index=False)

    print(f"✅ {name} features saved to: {file_path}")



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.


lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.



✅ Logistic Regression features saved to: C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection\Logistic Regression_RFE_Selected_Features.csv
✅ Random Forest features saved to: C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection\Random Forest_RFE_Selected_Features.csv
✅ Decision Tree features saved to: C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection\Decision Tree_RFE_Selected_Features.csv



Found n_features_to_select=20 > n_features=12. There will be no feature selection and all features will be kept.



# Feature Selection Embedded + Lasso CV

In [44]:
from sklearn.linear_model import LassoCV
X = df.drop('nova_group', axis=1)
y = df['nova_group']
lasso = LassoCV(cv=10, random_state=42).fit(X, y)
importance = np.abs(lasso.coef_)
selected_features = X.columns[importance > 0]
print("Selected features using Lasso (non-zero coefficients):")
print(selected_features.tolist())

Selected features using Lasso (non-zero coefficients):
['Unnamed: 0', 'is_organic', 'quantity_value', 'quantity_unit_encoded', 'category_depth', 'country_count', 'product_age_days', 'created_year', 'created_month', 'main_category_encoded']


In [45]:
# Faqat muhim featurelarni saqlab qolish
percentile_threshold = np.percentile(importance, 90) 
top_features = X.columns[importance >= percentile_threshold]
print("\nTop 25% important features based on Lasso coefficients:")
print(top_features.tolist())


Top 25% important features based on Lasso coefficients:
['category_depth', 'created_year']


In [46]:
# Feature importance'ni jadval shaklida tayyorlash

lasso_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
})

# Importance qiymatlariga qarab kamayish tartibida (eng muhimdan kam muhimga qarab) saralaymiz
lasso_df = lasso_df.sort_values(by='Importance', ascending=False)
percentile_threshold = np.percentile(importance, 85)
lasso_df['Top15%'] = np.where(lasso_df['Importance'] >= percentile_threshold, 'Yes', 'No')
print("Feature Importance Table:")
print(lasso_df)

Feature Importance Table:
                  Feature  Importance Top15%
9            created_year    0.133114    Yes
6          category_depth    0.097554    Yes
2              is_organic    0.086377     No
5   quantity_unit_encoded    0.057092     No
7           country_count    0.039631     No
11  main_category_encoded    0.037960     No
4          quantity_value    0.009899     No
8        product_age_days    0.008308     No
10          created_month    0.004170     No
0              Unnamed: 0    0.000002     No
3           is_sugar_free    0.000000     No
1             name_length    0.000000     No


In [40]:
fig = px.bar(
    feat_df,                
    x='Feature',            
    y='Importance',         
    color='Top15%',         
    color_discrete_map={'Yes': 'red', 'No': 'blue'},  
    text='Importance',      
    title='Lasso Feature Importance'  
)


fig.update_layout(
    xaxis_tickangle=-45,  
    width=1000,           
    height=600            
)

fig.show()

# Feature Selection Embedded + Random Forest Regression 

In [41]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=42)

rf.fit(X, y)

importances = rf.feature_importances_
feat_df = pd.DataFrame({
    'Feature': X.columns,       
    'Importance': importances   
})
feat_df = feat_df.sort_values(by='Importance', ascending=False)
percentile_threshold = np.percentile(importances, 85)
feat_df['Top15%'] = np.where(feat_df['Importance'] >= percentile_threshold, 'Yes', 'No')
print("Random Forest Feature Importance Table:")
print(feat_df)

Random Forest Feature Importance Table:
                  Feature  Importance Top15%
8        product_age_days    0.153223    Yes
0              Unnamed: 0    0.149039    Yes
4          quantity_value    0.124060     No
11  main_category_encoded    0.123867     No
6          category_depth    0.100275     No
10          created_month    0.096256     No
1             name_length    0.077753     No
9            created_year    0.076834     No
7           country_count    0.057912     No
5   quantity_unit_encoded    0.034719     No
2              is_organic    0.006062     No
3           is_sugar_free    0.000000     No


In [42]:
# Plotly Express yordamida Random Forest feature importance'ni grafik ko'rinishida chiqaramiz
fig = px.bar(
    feat_df,                
    x='Feature',            
    y='Importance',         
    color='Top15%',         
    color_discrete_map={'Yes': 'red', 'No': 'blue'},  
    text='Importance',      
    title='Random Forest Feature Importance'  
)


fig.update_layout(
    xaxis_tickangle=-45,  
    width=1000,           
    height=600            
)

fig.show()

In [47]:
# 📁 CSV fayllarni saqlash
import os

save_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Feature_Selection"
os.makedirs(save_path, exist_ok=True)

# Lasso natijalarini saqlash
lasso_df.to_csv(os.path.join(save_path, "Lasso_Feature_Importance.csv"), index=False)

# Random Forest natijalarini saqlash
feat_df.to_csv(os.path.join(save_path, "RandomForest_Feature_Importance.csv"), index=False)

print("✅ Barcha natijalar CSV fayllarga saqlandi.")


✅ Barcha natijalar CSV fayllarga saqlandi.
