In [1]:
import pandas as pd
import logging

# Log file location
log_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Log\feature_enginering.log"

# Logging configuration.
# The file handler is built explicitly so the log is always written as UTF-8.
# With the platform default encoding (e.g. cp1251/cp1252 on Windows) the
# non-ASCII characters used in this notebook's log messages cannot be encoded,
# and the affected records are dropped with a "--- Logging error ---" on stderr.
log_handler = logging.FileHandler(
    log_path,
    mode='a',  # Append mode
    encoding='utf-8',
)
logging.basicConfig(
    handlers=[log_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# Load the pre-processed CSV into `df`, logging progress and failures.
try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Preprosessed\missing_value.csv")
except Exception as e:
    # Record the failure with its traceback, then re-raise: if the error were
    # swallowed here, `df` would stay undefined and the following cells would
    # fail with a NameError that hides the real cause.
    logging.exception(f"CSV faylni o'qishda xatolik: {e}")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")

In [2]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       5000 non-null   int64  
 1   code             5000 non-null   int64  
 2   product_name     5000 non-null   object 
 3   brands           5000 non-null   object 
 4   categories       5000 non-null   object 
 5   countries        5000 non-null   object 
 6   quantity         5000 non-null   object 
 7   packaging        5000 non-null   object 
 8   nova_group       5000 non-null   float64
 9   created_t        5000 non-null   int64  
 10  last_modified_t  5000 non-null   int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 429.8+ KB


In [None]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

# 🧹 Cleaning, expressed as a single chained pass over the frame.
placeholder_values = ["unknown", "n/a", "None"]  # strings that stand in for "no value"
required_columns = ["product_name", "categories", "brands", "nova_group"]
df = (
    df
    # Drop the columns that are not needed downstream (skipped silently if absent).
    .drop(columns=["Unnamed: 0", "code"], errors="ignore")
    # Turn the placeholder strings into real missing values.
    .replace(placeholder_values, pd.NA)
    # Discard rows that lack any of the required fields.
    .dropna(subset=required_columns)
)
 
# 🧠 product_name features
# Number of whitespace-separated words in the product name.
df["name_length"] = df["product_name"].astype(str).str.split().str.len()
# 1 when the name contains "organic" (case-insensitive), otherwise 0.
df["is_organic"] = df["product_name"].str.contains("organic", case=False, na=False).astype(int)
# 1 when the name declares the product sugar free, otherwise 0.
# The pattern accepts "sugar free", "sugar-free" and "sugarfree" so that the
# hyphenated and unspaced spellings are not missed.
df["is_sugar_free"] = (
    df["product_name"]
    .str.contains(r"sugar[\s-]?free", case=False, na=False, regex=True)
    .astype(int)
)
 
# 📦 quantity (number + unit)
def extract_quantity(q):
    """Split a free-text quantity into its first number and first unit.

    Examples: "250g" -> (250.0, "g"); "1,5 L" -> (1.5, "l").

    Parameters
    ----------
    q : object
        Raw quantity cell. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    tuple
        ``(value, unit)`` where ``value`` is a float (or None when the text has
        no digits) and ``unit`` is the first run of ASCII letters, lower-cased
        (or None when the text has no letters).
    """
    if not isinstance(q, str):
        return None, None

    # Accept both "." and "," as decimal separator: with a dot-only pattern
    # "1,5 L" would be read as 1.0 instead of 1.5.
    number = re.search(r"\d+(?:[.,]\d+)?", q)
    unit = re.search(r"[a-zA-Z]+", q)

    value = float(number.group().replace(",", ".")) if number else None
    unit_name = unit.group().lower() if unit else None
    return value, unit_name

# Parse every quantity once and spread the (value, unit) tuples over two new
# columns: quantity_value → number (e.g. 250), quantity_unit → unit (e.g. "g").
# A single DataFrame is built from the list of tuples instead of creating one
# pd.Series per row, which is much slower on larger frames.
quantity_parts = pd.DataFrame(
    df["quantity"].map(extract_quantity).tolist(),
    index=df.index,  # keep row alignment with df
    columns=["quantity_value", "quantity_unit"],
)
# Force a numeric column so missing values are NaN even if nothing could be parsed.
df["quantity_value"] = pd.to_numeric(quantity_parts["quantity_value"], errors="coerce")
df["quantity_unit"] = quantity_parts["quantity_unit"]

# 🏷️ categories
def _count_listed(raw):
    """Return how many comma-separated entries `raw` holds; 0 when `raw` is missing."""
    if pd.isna(raw):
        # A missing cell would otherwise be stringified and counted as one entry.
        return 0
    return len(str(raw).split(","))


def _leading_category(raw):
    """Return the first comma-separated entry, stripped and lower-cased ("" when missing)."""
    if pd.isna(raw):
        return ""
    # strip() keeps " Snacks" and "Snacks" from becoming two different categories.
    return str(raw).split(",")[0].strip().lower()


# Main category, e.g. "Snacks, Sweet biscuits, Chocolate biscuits" → "snacks".
df["main_category"] = df["categories"].map(_leading_category)
# Number of categories listed, e.g. "Snacks, Sweet biscuits, Chocolate biscuits" → 3.
df["category_depth"] = df["categories"].map(_count_listed)

# 🌍 countries
# Number of countries listed for the product (0 when the countries cell is missing).
df["country_count"] = df["countries"].map(_count_listed)
 
# 🕓 time: convert both epoch-second columns to datetimes
# (values that cannot be parsed become NaT).
for timestamp_column in ("created_t", "last_modified_t"):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], unit="s", errors="coerce")

created_at = df["created_t"]
record_lifetime = df["last_modified_t"] - created_at

# Whole days between creation and last modification of the record.
df["product_age_days"] = record_lifetime.dt.days
# Calendar year and month in which the record was created.
df["created_year"] = created_at.dt.year
df["created_month"] = created_at.dt.month

# 🔢 Encode the categorical columns as integer labels.
# Units are cast to str first, so a missing unit is encoded as a label of its own.
unit_labels = df["quantity_unit"].astype(str)

# Fit one encoder per column, then map each text value to its integer code
# (e.g. "snacks" → 0, "drinks" → 1, ...).
le1 = LabelEncoder().fit(df["main_category"])
le2 = LabelEncoder().fit(unit_labels)
df["main_category_encoded"] = le1.transform(df["main_category"])
df["quantity_unit_encoded"] = le2.transform(unit_labels)

# 🔚 Keep only the columns the model needs: the engineered features plus the target.
# NOTE(review): the captured output shows nova_group values such as 3.28451,
# which looks like a mean-imputed label — confirm it should be kept as a class.
model_columns = [
    "name_length", "is_organic", "is_sugar_free",
    "quantity_value", "quantity_unit_encoded",
    "category_depth", "country_count", "product_age_days",
    "created_year", "created_month", "main_category_encoded",
    "nova_group",
]
# Rows with a missing value in any selected column are removed.
final_df = df.loc[:, model_columns].dropna()

# 🏁 Result: multi-class dataset — preview the first rows, then show how many
# rows fall into each nova_group class.
nova_group_counts = final_df["nova_group"].value_counts()
print(final_df.head(), nova_group_counts, sep="\n")

import os

# 🎯 Output directory and 📄 full path of the CSV written into it
save_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Enginered_Data"
file_path = os.path.join(save_path, "sml_multiclass_dataset.csv")

# 📁 Create the directory if it does not exist yet (no error when it already does)
os.makedirs(save_path, exist_ok=True)

# 💾 Write the dataset without the index column
final_df.to_csv(file_path, index=False)

print(f"✅ Fayl muvaffaqiyatli saqlandi: {file_path}")

   name_length  is_organic  is_sugar_free  quantity_value  \
0            2           0              0            33.0   
1            1           0              0           100.0   
2            2           0              0             2.0   
3            3           0              0             1.0   
4            1           0              0            33.0   

   quantity_unit_encoded  category_depth  country_count  product_age_days  \
0                      7               7              1              3671   
1                     14               1              2              3326   
2                     29               6              1              2590   
3                     29               7              1              4068   
4                      7               6              1              2365   

   created_year  created_month  main_category_encoded  nova_group  
0          2015              8                     53     3.28451  
1          2016              9    

In [4]:
# Read back the engineered dataset CSV. Note: this rebinds `df`, so from here on
# `df` is the saved feature table, not the working frame built in the cells above.
df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Enginered_Data\sml_multiclass_dataset.csv")

In [5]:
# Preview the first rows of the reloaded dataset.
df.head()

Unnamed: 0,name_length,is_organic,is_sugar_free,quantity_value,quantity_unit_encoded,category_depth,country_count,product_age_days,created_year,created_month,main_category_encoded,nova_group
0,2,0,0,33.0,7,7,1,3671,2015,8,53,3.28451
1,1,0,0,100.0,14,1,2,3326,2016,9,459,3.0
2,2,0,0,2.0,29,6,1,2590,2018,9,53,1.0
3,3,0,0,1.0,29,7,1,4068,2014,9,53,1.0
4,1,0,0,33.0,7,6,1,2365,2019,3,68,3.28451


In [6]:
# Check column dtypes and non-null counts of the reloaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4993 entries, 0 to 4992
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name_length            4993 non-null   int64  
 1   is_organic             4993 non-null   int64  
 2   is_sugar_free          4993 non-null   int64  
 3   quantity_value         4993 non-null   float64
 4   quantity_unit_encoded  4993 non-null   int64  
 5   category_depth         4993 non-null   int64  
 6   country_count          4993 non-null   int64  
 7   product_age_days       4993 non-null   int64  
 8   created_year           4993 non-null   int64  
 9   created_month          4993 non-null   int64  
 10  main_category_encoded  4993 non-null   int64  
 11  nova_group             4993 non-null   float64
dtypes: float64(2), int64(10)
memory usage: 468.2 KB
