In [1]:
import pandas as pd
import logging

# Location of the log file for this notebook.
log_path = r"C:\Users\Rasulbek907\Desktop\Project_8\Log\feature_selection.log"

# Logging configuration: timestamped records at INFO level and above are
# appended to the log file.
logging.basicConfig(
    filename=log_path,
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# Load the feature-engineered dataset and record the outcome in the log.
try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_8\Data\Preprocessing\feature_enginered.csv")
    logging.info(f"Fayl muvaffaqiyatli o'qildi. SatÄ±rlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    logging.error(f"CSV faylni o'qishda xatolik: {e}")
    # Re-raise after logging: swallowing the error would leave `df` undefined
    # and make every later cell fail with an unrelated NameError.
    raise

In [2]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322278 entries, 0 to 322277
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0              322278 non-null  int64  
 1   index                   322278 non-null  float64
 2   dateCrawled             322278 non-null  float64
 3   name                    322278 non-null  float64
 4   price                   322278 non-null  float64
 5   vehicleType             322278 non-null  float64
 6   yearOfRegistration      322278 non-null  float64
 7   powerPS                 322278 non-null  float64
 8   model                   322278 non-null  float64
 9   monthOfRegistration     322278 non-null  float64
 10  fuelType                322278 non-null  float64
 11  brand                   322278 non-null  float64
 12  dateCreated             322278 non-null  float64
 13  nrOfPictures            322278 non-null  float64
 14  postalCode          

In [3]:
# Remove the 'Unnamed: 0' column listed by df.info().
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
import pandas as pd
import plotly.express as px

# Select every numeric column (any integer/float width, not only 64-bit ones).
numeric_cols = df.select_dtypes(include='number').columns

# Pairwise correlation restricted to the numeric columns.
corr_matrix = df[numeric_cols].corr()

# Correlation of kilometer_cont with every *other* numeric column, highest
# first. The target's own row (always 1.0) is removed so it does not dominate
# the chart, matching the title's "other columns" wording.
kilometer_corr = (
    corr_matrix[['kilometer_cont']]
    .drop(index='kilometer_cont')
    .sort_values(by='kilometer_cont', ascending=False)
)

# Reshape into a two-column frame (feature name, correlation) for plotting.
bar_data = kilometer_corr.reset_index()
bar_data.columns = ['Feature', 'Correlation_with_kilometer_cont']

# Bar chart with Plotly Express; `labels` supplies the displayed axis titles.
fig = px.bar(
    bar_data,
    x='Feature',
    y='Correlation_with_kilometer_cont',
    title='kilometer_cont bilan boshqa raqamli ustunlar o\'rtasidagi korrelyatsiya',
    labels={'Correlation_with_kilometer_cont': 'Korrelyatsiya qiymati', 'Feature': 'Xususiyatlar'}
)

fig.show()

In [6]:
import numpy as np
import plotly.figure_factory as ff

# Cell labels: the correlation values rounded to two decimals, as strings.
annot_text = np.round(corr_matrix.to_numpy(), 2).astype(str)

# Arguments for the annotated heatmap: raw values for the colours, feature
# names on both axes, rounded strings as the per-cell annotations.
heatmap_kwargs = dict(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    annotation_text=annot_text,
    colorscale='Viridis',
    showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_kwargs)

fig.update_layout(title="Raqamli ustunlar o'rtasidagi korrelyatsiya matritsasi")
fig.show()

# Feature Selection Filtering 

In [5]:
import numpy as np 
import seaborn as sns               
import matplotlib.pyplot as plt     


# Absolute pairwise correlations, computed once (the matrix used to be
# recomputed a second time right before printing, doubling the cost).
corr_matrix = df.corr().abs()

# Keep only the strictly-upper triangle so each pair appears once and the
# diagonal (self-correlation) is excluded; everything else becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Columns whose absolute correlation with some earlier column exceeds 0.8.
# NaN entries compare as False, so the masked cells never trigger a drop.
to_drop = upper.columns[(upper > 0.8).any()].tolist()

print(corr_matrix)

                           index  dateCrawled      name     price  \
index                   1.000000     0.003611  0.001367  0.000938   
dateCrawled             0.003611     1.000000  0.001410  0.001902   
name                    0.001367     0.001410  1.000000  0.000720   
price                   0.000938     0.001902  0.000720  1.000000   
vehicleType             0.001191     0.008896  0.118520  0.004674   
yearOfRegistration      0.001851     0.002765  0.015939  0.001436   
powerPS                 0.000290     0.006655  0.234178  0.020742   
model                   0.000435     0.001997  0.462298  0.000158   
monthOfRegistration     0.000543     0.003434  0.010171  0.002531   
fuelType                0.001149     0.008592  0.042324  0.002455   
brand                   0.001527     0.000273  0.837860  0.001013   
dateCreated             0.003293     0.988831  0.001266  0.001795   
nrOfPictures                 NaN          NaN       NaN       NaN   
postalCode              0.000414  

In [6]:

import numpy as np   

# Absolute correlation between every pair of columns.
corr_matrix = df.corr().abs()

# Boolean mask that is True only strictly above the diagonal; applying it
# leaves each pair once and turns the rest of the matrix into NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)

# A column is marked for removal when it correlates above 0.95 with any
# column that precedes it.
exceeds_limit = (upper > 0.95).any()
to_drop = list(upper.columns[exceeds_limit])

df_filtered = df.drop(columns=to_drop)

# Report the effect of the filter.
print("\nOriginal shape:", df.shape)
print("Shape after dropping correlated features:", df_filtered.shape)
print("Columns dropped : ", to_drop)


Original shape: (322278, 26)
Shape after dropping correlated features: (322278, 20)
Columns dropped :  ['dateCreated', 'seller_privat', 'offerType_Gesuch', 'abtest_test', 'gearbox_manuell', 'notRepairedDamage_nein']


In [7]:
# Print the columns, non-null counts and dtypes that remain after the correlation filter.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322278 entries, 0 to 322277
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   index                 322278 non-null  float64
 1   dateCrawled           322278 non-null  float64
 2   name                  322278 non-null  float64
 3   price                 322278 non-null  float64
 4   vehicleType           322278 non-null  float64
 5   yearOfRegistration    322278 non-null  float64
 6   powerPS               322278 non-null  float64
 7   model                 322278 non-null  float64
 8   monthOfRegistration   322278 non-null  float64
 9   fuelType              322278 non-null  float64
 10  brand                 322278 non-null  float64
 11  nrOfPictures          322278 non-null  float64
 12  postalCode            322278 non-null  float64
 13  lastSeen              322278 non-null  float64
 14  seller_gewerblich     322278 non-null  float64
 15  

In [8]:
# Collect the highly correlated pairs as
# [row feature, column feature, correlation rounded to 2 decimals].
high_corr_pairs = []

for col in upper.columns:
    # Walk the column once instead of doing repeated .loc lookups per cell.
    for row, value in upper[col].items():
        # pd.notna is the reliable NaN test. The previous `is not np.nan`
        # identity check does not detect NaN values read from a DataFrame
        # and only appeared to work because `NaN > 0.95` is False.
        if pd.notna(value) and value > 0.95:
            high_corr_pairs.append([row, col, round(value, 2)])


corr_table = pd.DataFrame(high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])

print("Highly correlated feature pairs (correlation > 0.95):")
print(corr_table)

Highly correlated feature pairs (correlation > 0.95):
              Feature 1               Feature 2  Correlation
0           dateCrawled             dateCreated         0.99
1     seller_gewerblich           seller_privat         1.00
2     offerType_Angebot        offerType_Gesuch         1.00
3        abtest_control             abtest_test         1.00
4     gearbox_automatik         gearbox_manuell         1.00
5  notRepairedDamage_ja  notRepairedDamage_nein         1.00


In [9]:
import plotly.express as px  

# Long-format copy of the matrix: one row per (Feature 1, Feature 2) pair.
# The index axis is named before reset_index() because the data contains a
# column literally called 'index'; without the rename, reset_index() emits
# 'level_0' and melt(id_vars='index') would pivot on that data column instead
# of on the feature names.
corr_long = (
    corr_matrix
    .rename_axis('Feature 1')
    .reset_index()
    .melt(id_vars='Feature 1', var_name='Feature 2', value_name='Correlation')
)

# Interactive heatmap of the full matrix with two-decimal cell labels.
# NOTE(review): corr_matrix is built with .abs() in the filtering cell, so its
# values lie in [0, 1] and the negative half of the zmin/zmax range is unused.
# Confirm whether signed correlations were intended here.
fig = px.imshow(
    corr_matrix,
    text_auto='.2f',
    aspect="auto",
    color_continuous_scale='RdBu_r',
    zmin=-1, zmax=1,
    title="Correlation Matrix Heatmap (Interactive)"
)


fig.update_layout(
    width=900,
    height=800,
    xaxis_title="Features",
    yaxis_title="Features"
)

fig.show()

# Low Variance

In [10]:
from sklearn.feature_selection import VarianceThreshold  
# Variance cut-off passed to VarianceThreshold.
threshold = 0.01
selector = VarianceThreshold(threshold=threshold).fit(df_filtered)

# get_support() is True for the columns the selector keeps; invert it to list
# the low-variance columns, then drop them.
kept_mask = selector.get_support()
low_variance_features = df_filtered.columns[~kept_mask]
df_low_variance_filtered = df_filtered.drop(columns=low_variance_features)

# Report the effect of the filter.
print("\nOriginal shape:", df_filtered.shape)
print("Shape after dropping low-variance features:", df_low_variance_filtered.shape)
print("Columns low varianced: ", low_variance_features)


Original shape: (322278, 20)
Shape after dropping low-variance features: (322278, 15)
Columns low varianced:  Index(['price', 'powerPS', 'nrOfPictures', 'seller_gewerblich',
       'offerType_Angebot'],
      dtype='object')


In [11]:
# Print the columns, non-null counts and dtypes that remain after the low-variance filter.
df_low_variance_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322278 entries, 0 to 322277
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   index                 322278 non-null  float64
 1   dateCrawled           322278 non-null  float64
 2   name                  322278 non-null  float64
 3   vehicleType           322278 non-null  float64
 4   yearOfRegistration    322278 non-null  float64
 5   model                 322278 non-null  float64
 6   monthOfRegistration   322278 non-null  float64
 7   fuelType              322278 non-null  float64
 8   brand                 322278 non-null  float64
 9   postalCode            322278 non-null  float64
 10  lastSeen              322278 non-null  float64
 11  abtest_control        322278 non-null  float64
 12  gearbox_automatik     322278 non-null  float64
 13  notRepairedDamage_ja  322278 non-null  float64
 14  kilometer_cont        322278 non-null  float64
dtype

In [12]:
# Persist the selected features. index=False keeps the RangeIndex out of the
# file; writing it is what produces the stray 'Unnamed: 0' column that has to
# be dropped again after the next read_csv.
# NOTE(review): any downstream notebook that unconditionally drops
# 'Unnamed: 0' from this file must tolerate the column being absent.
df_low_variance_filtered.to_csv(
    r"C:\Users\Rasulbek907\Desktop\Project_8\Data\Feature_Selection\Filtered_Features.csv",
    index=False,
)

In [2]:
from IPython.display import display, HTML

# Markup for a styled link to the exported CSV on the local file system.
csv_link_html = """
<a href="file:///C:/Users/Rasulbek907/Desktop/Project_8/Data/Feature_Selection/Filtered_Features.csv" target="_blank" style="
    display: inline-block;
    padding: 10px 20px;
    font-size: 16px;
    color: white;
    background-color: #FF5722;
    text-align: center;
    text-decoration: none;
    border-radius: 5px;
">
Open Filtered Features CSV
</a>
"""

# Render the link in the notebook output.
display(HTML(csv_link_html))
