In [1]:
import pandas as pd
import logging

# Location of the log file this notebook appends to
log_path = r"C:\Users\Rasulbek907\Desktop\Project_8\Log\feature_selection.log"

# Logging configuration: append to the file, record timestamp, level and message,
# and emit INFO and above.
logging.basicConfig(
    filename=log_path,
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# Load the input CSV into `df`, logging the start, the resulting row/column
# counts, and any failure.
try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_8\Data\Preprocessing\feature_km.csv")
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    logging.error(f"CSV faylni o'qishda xatolik: {e}")
    # Re-raise after logging: without `df` every following cell would fail
    # with a NameError that hides the real cause.
    raise

In [2]:
# Print a summary of the loaded frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Unnamed: 0.1            371528 non-null  int64  
 1   Unnamed: 0              371528 non-null  int64  
 2   index                   371528 non-null  float64
 3   dateCrawled             371528 non-null  float64
 4   name                    371528 non-null  float64
 5   price                   371528 non-null  float64
 6   vehicleType             371528 non-null  float64
 7   yearOfRegistration      371528 non-null  float64
 8   powerPS                 371528 non-null  float64
 9   model                   371528 non-null  float64
 10  monthOfRegistration     371528 non-null  float64
 11  fuelType                371528 non-null  float64
 12  brand                   371528 non-null  float64
 13  dateCreated             371528 non-null  float64
 14  nrOfPictures        

In [4]:
# Remove the two leftover 'Unnamed' index columns.
# The drop is non-mutating and uses errors='ignore' so that re-running this cell
# after the columns are already gone is a no-op instead of a KeyError.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [5]:
import pandas as pd
import plotly.express as px

# Column whose correlation with every other numeric column is charted
target_col = 'kilometer_cont'

# Select the numeric columns (float64 / int64)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlation restricted to the numeric columns
corr_matrix = df[numeric_cols].corr()

# Correlation of each numeric column with the target, largest value first.
# The target's own row is removed: its self-correlation is always 1.0, carries
# no information and stretches the y-axis of the chart.
target_corr = (
    corr_matrix[[target_col]]
    .drop(index=target_col)
    .sort_values(by=target_col, ascending=False)
)

# Reshape into a two-column frame (feature name, correlation) for plotting
bar_data = target_corr.reset_index()
bar_data.columns = ['Feature', 'Correlation']

# Bar chart built with Plotly Express
fig = px.bar(
    bar_data,
    x='Feature',
    y='Correlation',
    title='kilometer_cont bilan boshqa raqamli ustunlar o\'rtasidagi korrelyatsiya',
    labels={'Correlation': 'Korrelyatsiya qiymati', 'Feature': 'Xususiyatlar'}
)

fig.show()

In [6]:
import numpy as np
import plotly.figure_factory as ff

# Build the heatmap inputs from the correlation matrix computed earlier:
# the raw coefficients, their axis labels, and the text shown in each cell
# (coefficients rounded to two decimals and converted to strings).
corr_values = corr_matrix.values
column_labels = list(corr_matrix.columns)
row_labels = list(corr_matrix.index)
annot_text = np.around(corr_values, decimals=2).astype(str)

# Annotated heatmap of the whole matrix with a visible colour bar
heatmap_options = dict(
    z=corr_values,
    x=column_labels,
    y=row_labels,
    annotation_text=annot_text,
    colorscale='Viridis',
    showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_options)

fig.update_layout(title='Raqamli ustunlar o\'rtasidagi korrelyatsiya matritsasi')
fig.show()

# Feature Selection Filtering 

In [7]:
import seaborn as sns               
import matplotlib.pyplot as plt     


# Absolute pairwise correlations of all columns. Computed once: nothing between
# here and the print below modifies `df`, so a second pass would only repeat
# the same work.
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair of columns is represented exactly once; every other cell becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose absolute correlation with some earlier column exceeds 0.8.
# NaN cells compare as False, so masked entries never trigger a drop.
to_drop = upper.columns[(upper > 0.8).any()].tolist()

print(corr_matrix)

                           index  dateCrawled      name     price  \
index                   1.000000     0.002510  0.000960  0.000696   
dateCrawled             0.002510     1.000000  0.000127  0.001613   
name                    0.000960     0.000127  1.000000  0.000527   
price                   0.000696     0.001613  0.000527  1.000000   
vehicleType             0.001183     0.009478  0.110849  0.004497   
yearOfRegistration      0.000060     0.000173  0.002521  0.000182   
powerPS                 0.000769     0.003991  0.080771  0.006850   
model                   0.000234     0.000951  0.444069  0.000203   
monthOfRegistration     0.000914     0.003678  0.017171  0.002826   
fuelType                0.001629     0.010140  0.043767  0.002527   
brand                   0.001542     0.001530  0.810001  0.001440   
dateCreated             0.002240     0.988862  0.000200  0.001509   
nrOfPictures                 NaN          NaN       NaN       NaN   
postalCode              0.000038  

In [8]:

import numpy as np   

# Absolute pairwise correlations of all columns
corr_matrix = df.corr().abs()

# Strict upper triangle (diagonal excluded): each pair appears once, the rest is NaN
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Decide which columns to drop using feature-vs-feature correlations only.
# The target 'kilometer_cont' is excluded from the decision so that it can
# neither be dropped itself nor cause a feature to be dropped merely because
# that feature correlates strongly with the target.
feature_upper = upper.drop(index='kilometer_cont', columns='kilometer_cont', errors='ignore')

# A column is dropped when its absolute correlation with an earlier column
# exceeds 0.95; NaN cells compare as False.
to_drop = feature_upper.columns[(feature_upper > 0.95).any()].tolist()

df_filtered = df.drop(columns=to_drop)
print("\nOriginal shape:", df.shape)

print("Shape after dropping correlated features:", df_filtered.shape)
print("Columns dropped : ", to_drop)


Original shape: (371528, 26)
Shape after dropping correlated features: (371528, 20)
Columns dropped :  ['dateCreated', 'seller_privat', 'offerType_Gesuch', 'abtest_test', 'gearbox_manuell', 'notRepairedDamage_nein']


In [9]:
# Print a summary of the frame left after the correlation filter.
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   index                 371528 non-null  float64
 1   dateCrawled           371528 non-null  float64
 2   name                  371528 non-null  float64
 3   price                 371528 non-null  float64
 4   vehicleType           371528 non-null  float64
 5   yearOfRegistration    371528 non-null  float64
 6   powerPS               371528 non-null  float64
 7   model                 371528 non-null  float64
 8   monthOfRegistration   371528 non-null  float64
 9   fuelType              371528 non-null  float64
 10  brand                 371528 non-null  float64
 11  nrOfPictures          371528 non-null  float64
 12  postalCode            371528 non-null  float64
 13  lastSeen              371528 non-null  float64
 14  seller_gewerblich     371528 non-null  float64
 15  

In [10]:
# Collect the feature pairs whose absolute correlation exceeds 0.95.
#
# `upper` holds NaN everywhere except the strict upper triangle. NaN is
# excluded by the comparison itself (NaN > 0.95 is False); an identity check
# such as `value is not np.nan` is not a reliable NaN test for values read
# out of a DataFrame.
#
# Transposing before stacking yields (column label, row label) keys ordered by
# column first and row second, i.e. the same order as iterating columns in the
# outer loop and rows in the inner loop.
stacked = upper.T.stack()
strong = stacked[stacked > 0.95]

# One [row label, column label, rounded correlation] entry per qualifying pair
high_corr_pairs = [
    [row, col, round(value, 2)]
    for (col, row), value in strong.items()
]

corr_table = pd.DataFrame(high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])

print("Highly correlated feature pairs (correlation > 0.95):")
print(corr_table)

Highly correlated feature pairs (correlation > 0.95):
              Feature 1               Feature 2  Correlation
0           dateCrawled             dateCreated         0.99
1     seller_gewerblich           seller_privat         1.00
2     offerType_Angebot        offerType_Gesuch         1.00
3        abtest_control             abtest_test         1.00
4     gearbox_automatik         gearbox_manuell         1.00
5  notRepairedDamage_ja  notRepairedDamage_nein         1.00


In [11]:
import plotly.express as px  

# Long-form copy of the correlation matrix: one row per (Feature 1, Feature 2) pair.
# The matrix index is given an explicit name before reset_index() because the
# data contains a column literally called 'index'; with an unnamed index pandas
# would emit the labels as 'level_0' and melting on 'index' would use that
# feature's correlation values as identifiers.
corr_long = (
    corr_matrix
    .rename_axis(index='Feature 1')
    .reset_index()
    .melt(id_vars='Feature 1', var_name='Feature 2', value_name='Correlation')
)

# Interactive heatmap. `corr_matrix` holds absolute correlations, so the colour
# range starts at 0 and a sequential scale is used; a diverging -1..1 scale
# would leave its negative half unused.
fig = px.imshow(
    corr_matrix,
    text_auto='.2f',
    aspect="auto",
    color_continuous_scale='Blues',
    zmin=0, zmax=1,
    title="Absolute Correlation Matrix Heatmap (Interactive)"
)


fig.update_layout(
    width=900,
    height=800,
    xaxis_title="Features",
    yaxis_title="Features"
)

fig.show()

# Low Variance

In [12]:
from sklearn.feature_selection import VarianceThreshold  
# Minimum variance a feature must have to be kept
threshold = 0.01

# Fit the selector on the features only. The target 'kilometer_cont' is left
# out so that it can never be removed as a "low-variance feature".
feature_frame = df_filtered.drop(columns=['kilometer_cont'], errors='ignore')

selector = VarianceThreshold(threshold=threshold)
selector.fit(feature_frame)

# NOTE(review): the threshold is compared against raw column variances, so the
# outcome depends on how the columns were scaled upstream — confirm that the
# columns reported below are meant to be removed.
low_variance_features = feature_frame.columns[~selector.get_support()]

# Drop the flagged features from the full frame (the target stays in place)
df_low_variance_filtered = df_filtered.drop(columns=low_variance_features)
print("\nOriginal shape:", df_filtered.shape)
print("Shape after dropping low-variance features:", df_low_variance_filtered.shape)
print("Columns low varianced: ",low_variance_features )


Original shape: (371528, 20)
Shape after dropping low-variance features: (371528, 14)
Columns low varianced:  Index(['price', 'yearOfRegistration', 'powerPS', 'nrOfPictures',
       'seller_gewerblich', 'offerType_Angebot'],
      dtype='object')


In [13]:
# Print a summary of the frame left after the low-variance filter.
df_low_variance_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   index                 371528 non-null  float64
 1   dateCrawled           371528 non-null  float64
 2   name                  371528 non-null  float64
 3   vehicleType           371528 non-null  float64
 4   model                 371528 non-null  float64
 5   monthOfRegistration   371528 non-null  float64
 6   fuelType              371528 non-null  float64
 7   brand                 371528 non-null  float64
 8   postalCode            371528 non-null  float64
 9   lastSeen              371528 non-null  float64
 10  abtest_control        371528 non-null  float64
 11  gearbox_automatik     371528 non-null  float64
 12  notRepairedDamage_ja  371528 non-null  float64
 13  kilometer_cont        371528 non-null  float64
dtypes: float64(14)
memory usage: 39.7 MB


In [14]:
# Save the selected features. index=False keeps the RangeIndex out of the file;
# otherwise it is written as an unnamed first column and comes back as
# 'Unnamed: 0' when the CSV is read again.
df_low_variance_filtered.to_csv(
    r"C:\Users\Rasulbek907\Desktop\Project_8\Data\Feature_Selection\Filtered_Features.csv",
    index=False,
)

In [2]:
from IPython.display import display, HTML

# Markup for a button-styled anchor that points at the exported CSV on disk
open_csv_link = HTML("""
<a href="file:///C:/Users/Rasulbek907/Desktop/Project_8/Data/Feature_Selection/Filtered_Features.csv" target="_blank" style="
    display: inline-block;
    padding: 10px 20px;
    font-size: 16px;
    color: white;
    background-color: #FF5722;
    text-align: center;
    text-decoration: none;
    border-radius: 5px;
">
Open Filtered Features CSV
</a>
""")

# Render the link in the cell output
display(open_csv_link)
