# Food Health Classifier

# “Mahsulot haqida ma’lumotlardan foydalanib, u qanchalik qayta ishlangan ekanini oldindan aytadigan model” qurish.

# Target ustuni = nova_group. Qiymatlari 1–4 oralig‘ida: 1 — minimal qayta ishlangan (sog‘lom), 4 — yuqori darajada qayta ishlangan (kamroq sog‘lom).

In [1]:
import pandas as pd
import logging

# Log file location
log_path = r"C:\Users\Rasulbek907\Desktop\Project_MP\Log\data_loader.log"

# Logging configuration: append to the log file, record INFO and above
logging.basicConfig(
    filename=log_path,
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)

# Load the engineered dataset into `df`, logging the start, the resulting
# shape, and any failure.
try:
    logging.info("CSV fayl o'qilmoqda:...")
    df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Enginered_Data\sml_multiclass_dataset.csv")
    logging.info(f"Fayl muvaffaqiyatli o'qildi. Satırlar soni: {len(df)} ustunlar soni: {len(df.columns)}")
except Exception as e:
    # Log the failure together with its traceback, then re-raise: swallowing
    # the error would leave `df` undefined and make every later cell fail
    # with an unrelated NameError instead of the real cause.
    logging.exception(f"CSV faylni o'qishda xatolik: {e}")
    raise

In [2]:
# Preview the first rows of the loaded DataFrame (pandas default: 5 rows).
df.head()

Unnamed: 0,name_length,is_organic,is_sugar_free,quantity_value,quantity_unit_encoded,category_depth,country_count,product_age_days,created_year,created_month,main_category_encoded,nova_group
0,2,0,0,33.0,7,7,1,3671,2015,8,53,3.28451
1,1,0,0,100.0,14,1,2,3326,2016,9,459,3.0
2,2,0,0,2.0,29,6,1,2590,2018,9,53,1.0
3,3,0,0,1.0,29,7,1,4068,2014,9,53,1.0
4,1,0,0,33.0,7,6,1,2365,2019,3,68,3.28451


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4993 entries, 0 to 4992
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name_length            4993 non-null   int64  
 1   is_organic             4993 non-null   int64  
 2   is_sugar_free          4993 non-null   int64  
 3   quantity_value         4993 non-null   float64
 4   quantity_unit_encoded  4993 non-null   int64  
 5   category_depth         4993 non-null   int64  
 6   country_count          4993 non-null   int64  
 7   product_age_days       4993 non-null   int64  
 8   created_year           4993 non-null   int64  
 9   created_month          4993 non-null   int64  
 10  main_category_encoded  4993 non-null   int64  
 11  nova_group             4993 non-null   float64
dtypes: float64(2), int64(10)
memory usage: 468.2 KB


# 🧾 Jadval ustunlari tavsifi

| №    | Ustun nomi                | Tarjima                              | Qisqacha ma’nosi                                                                                   |
| ---- | ------------------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------- |
| 1️⃣  | **name_length**           | Nomi uzunligi                        | Mahsulot nomidagi so‘zlar soni (nom uzunligini bildiradi).                                         |
| 2️⃣  | **is_organic**            | Organik mahsulotmi                   | 1 — organik, 0 — oddiy mahsulot.                                                                   |
| 3️⃣  | **is_sugar_free**         | Shakarsizmi                          | 1 — shakarsiz, 0 — shakarlangan.                                                                   |
| 4️⃣  | **quantity_value**        | Miqdor qiymati                       | Mahsulot og‘irligi yoki hajmi (masalan, 100 ml, 250 g).                                            |
| 5️⃣  | **quantity_unit_encoded** | Miqdor birligi (kodlangan)           | Miqdor o‘lchov birligi (gramm, litr, dona va hokazo) raqam bilan kodlangan.                        |
| 6️⃣  | **category_depth**        | Kategoriya chuqurligi                | Mahsulot qaysi darajadagi kategoriya ichida joylashganini ko‘rsatadi (chuqurroq → aniqroq toifa).  |
| 7️⃣  | **country_count**         | Mamlakatlar soni                     | Mahsulot nechta mamlakatda mavjud yoki sotilayotganini bildiradi.                                  |
| 8️⃣  | **product_age_days**      | Mahsulot yoshi (kunlarda)            | Mahsulot bazaga kiritilgan kundan boshlab o‘tgan kunlar soni.                                      |
| 9️⃣  | **created_year**          | Yaratilgan yil                       | Mahsulot ma’lumotlari bazaga qo‘shilgan yil.                                                       |
| 🔟   | **created_month**         | Yaratilgan oy                        | Ma’lumot kiritilgan oy raqami (1–12).                                                              |
| 11️⃣ | **main_category_encoded** | Asosiy kategoriya (kodlangan)        | Mahsulotning asosiy turini ifodalovchi kod (masalan: ichimliklar, shirinliklar, sut mahsulotlari). |
| 12️⃣ | **nova_group**            | NOVA guruhi (qayta ishlash darajasi) | Mahsulot qayta ishlanganlik darajasi: <br> 1 — tabiiy, 4 — kuchli qayta ishlangan.                 |


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,name_length,is_organic,is_sugar_free,quantity_value,quantity_unit_encoded,category_depth,country_count,product_age_days,created_year,created_month,main_category_encoded,nova_group
count,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0,4993.0
mean,3.47126,0.01462,0.0,294.821727,19.069497,7.237332,2.443421,2494.868816,2018.152614,6.031444,262.622471,3.283936
std,2.12689,0.12004,0.0,230.069482,11.915155,3.399359,2.45228,1232.022279,3.275715,3.332654,174.267565,0.974557
min,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2012.0,1.0,0.0,1.0
25%,2.0,0.0,0.0,110.0,14.0,5.0,1.0,1543.0,2016.0,3.0,75.0,3.0
50%,3.0,0.0,0.0,250.0,14.0,7.0,2.0,2499.0,2018.0,6.0,344.0,4.0
75%,5.0,0.0,0.0,500.0,14.0,10.0,3.0,3452.0,2021.0,9.0,415.0,4.0
max,18.0,1.0,0.0,2041.165665,71.0,25.0,25.0,4991.0,2025.0,12.0,519.0,4.0


# Asosiy ustunlar bo‘yicha statistik ko‘rsatkichlar

| Ustun nomi                | O‘rtacha qiymat (mean) | Eng kichik (min) | Eng katta (max) | Izoh                                                                                            |
| ------------------------- | ---------------------- | ---------------- | --------------- | ----------------------------------------------------------------------------------------------- |
| **name_length**           | 3.47                   | 1                | 18              | O‘rtacha nom uzunligi 3–4 so‘z atrofida, lekin ayrim mahsulot nomlari juda uzun (18 so‘zgacha). |
| **is_organic**            | 0.0146                 | 0                | 1               | Faqat ~1.5% mahsulotlar **organik**, bu juda kam.                                               |
| **is_sugar_free**         | 0.0                    | 0                | 0               | Hech bir mahsulot **shakarsiz** deb belgilanmagan (yoki ma’lumot yo‘q).                         |
| **quantity_value**        | 294.8                  | 0                | 2041            | O‘rtacha mahsulot miqdori 300 birlik atrofida, lekin juda katta farq mavjud.                    |
| **quantity_unit_encoded** | 19.07                  | 1                | 71              | Miqdor birliklari (ml, g, kg, L va hok.) keng diapazonda kodlangan.                             |
| **category_depth**        | 7.24                   | 1                | 25              | Kategoriyalar 1 dan 25 gacha chuqurlikda, ya’ni murakkab ierarxik tuzilma bor.                  |
| **country_count**         | 2.44                   | 1                | 25              | Mahsulotlar o‘rtacha 2–3 ta mamlakatda mavjud, ayrimlari 25 ta davlatda.                        |
| **product_age_days**      | 2495                   | 0                | 4991            | Mahsulotlar o‘rtacha 6–7 yillik (ehtimol, platformaga qo‘shilgan vaqtdan beri).                 |
| **created_year**          | 2018.15                | 2012             | 2025            | Ko‘pchilik mahsulotlar 2016–2021 oralig‘ida yaratilgan.                                         |
| **created_month**         | 6.03                   | 1                | 12              | Yil bo‘yicha teng taqsimlangan, o‘rtacha 6-oy (iyun) atrofida.                                  |
| **main_category_encoded** | 262.6                  | 0                | 519             | 500 ga yaqin asosiy kategoriya mavjud, turli xil mahsulot turlari.                              |
| **nova_group**            | 3.28                   | 1                | 4               | Ko‘pchilik mahsulotlar **3 yoki 4-darajali qayta ishlangan oziq-ovqat** hisoblanadi.            |


# Kuzatishlar va xulosalar

🥦 Organik mahsulotlar juda kam (1–2%), bu ma’lumotlar bazasida sog‘lom oziq-ovqatlar ulushi pastligini ko‘rsatadi.

🍬 Shakarsiz mahsulotlar yo‘q, bu ustun ehtimol to‘ldirilmagan yoki barchasi “False”.

⚖️ Miqdor (quantity_value) juda farq qiladi (0 dan 2000+ gacha), bu normalizatsiya yoki log-transformatsiya talab qiladi.

🌍 Country count ko‘rsatadi: ko‘pchilik mahsulotlar faqat 1–3 ta mamlakatda sotiladi, ayrimlari esa juda ko‘p joyda.

📅 created_year va product_age_days ustunlari vaqt bo‘yicha o‘zgarishlarni (trend) tahlil qilish uchun juda qulay.

🍔 nova_group qiymatlarining yuqoriligi shuni ko‘rsatadiki, ko‘pchilik mahsulotlar ko‘p qayta ishlangan (sog‘lom emas).

🗂️ main_category_encoded da 500 dan ortiq kod mavjud (0–519) — bu yuqori kardinallikka ega kategorik belgi (feature); multiclass muammoning target ustuni esa nova_group.

In [5]:
# Relative frequency of each distinct nova_group value, scaled to percent.
nova_share = df["nova_group"].value_counts(normalize=True)
nova_share.mul(100)

nova_group
4.00000    52.553575
3.00000    22.010815
1.00000    11.355898
3.28451     9.833767
2.00000     4.245944
Name: proportion, dtype: float64

# 📊 NOVA guruhi tahlili

| NOVA guruhi | Tushuntirish                                                                             | Ulushi (%) |
| ----------- | ---------------------------------------------------------------------------------------- | ---------- |
| 4           | Juda ko‘p qayta ishlangan mahsulotlar *(ultra-processed foods)*                          | **52.55%** |
| 3           | Qayta ishlangan mahsulotlar *(processed foods)*                                          | **22.01%** |
| 1           | Qayta ishlanmagan yoki minimal ishlangan mahsulotlar *(unprocessed/minimally processed)* | **11.36%** |
| 3.28451     | Noaniq guruh — haqiqiy NOVA sinfi emas (ehtimol noto‘g‘ri qiymat yoki bo‘sh qiymatlar o‘rtacha bilan to‘ldirilgan) | **9.83%**  |
| 2           | Pishirishda ishlatiladigan ingredientlar *(culinary ingredients)*                        | **4.25%**  |


# Korrelatsiya jadvali

In [2]:
import plotly.express as px
import pandas as pd

# Correlation matrix across all columns of df, rendered as an annotated heatmap.
corr_matrix = df.corr()

heatmap_options = {
    "text_auto": ".2f",  # print each coefficient with two decimals
    "color_continuous_scale": "RdBu_r",
    "title": "🔗 Korrelatsiya matritsasi (Plotly)",
    "aspect": "auto",
}
fig = px.imshow(corr_matrix, **heatmap_options)

# Figure size, title font and colorbar caption.
fig.update_layout(
    coloraxis_colorbar={"title": "Korrelatsiya"},
    title_font={"size": 22, "color": "black"},
    height=600,
    width=900,
)

fig.show()


# Target taqsimoti (nova_group)

In [4]:
import plotly.express as px

# Histogram of the target column: one bar group per nova_group value.
histogram_options = {
    "data_frame": df,
    "x": "nova_group",
    "color": "nova_group",  # separate color for each class
    "text_auto": True,      # print the counts on the bars
    "title": "🎯 Nova Group sinflar soni",
}
fig = px.histogram(**histogram_options)

# Figure size, axis captions and title font; the legend is hidden.
layout_options = {
    "showlegend": False,
    "xaxis_title": "NOVA guruhi",
    "yaxis_title": "Mahsulotlar soni",
    "title_font": {"size": 22, "color": "black"},
    "height": 500,
    "width": 800,
}
fig.update_layout(**layout_options)

fig.show()


# Pairplot (feature–feature bog‘liqligi)

In [10]:
import plotly.express as px

# Draw a random sample of up to 500 rows for the pair plot.
# The size is clamped to len(df) because DataFrame.sample raises ValueError
# when asked for more rows than the frame holds (without replacement).
sample_size = min(500, len(df))
sample_df = df.sample(n=sample_size, random_state=42)

# Plotly scatter matrix (pairplot equivalent), colored by nova_group.
fig = px.scatter_matrix(
    sample_df,
    dimensions=sample_df.select_dtypes(include=['int64', 'float64']).columns,  # numeric columns only
    color="nova_group",
    title="🎯 Pairplot (Plotly) — Nova Group bo‘yicha",
    height=900,
    width=900,
    color_continuous_scale="RdBu_r"
)

# Hide the diagonal panels and use small, semi-transparent markers.
fig.update_traces(diagonal_visible=False, marker=dict(size=4, opacity=0.7))
fig.update_layout(title_font=dict(size=22, color="black"))

fig.show()


# Product trend — created_year vs nova_group (yillik tendensiyalar)

In [11]:
import pandas as pd
import plotly.express as px

# df — sizning final_df DataFrame'ingiz
# Agar CSV dan o'qilsa:
# df = pd.read_csv(r"C:\Users\Rasulbek907\Desktop\Project_MP\Data\Enginered_Data\sml_multiclass_dataset.csv")

# 1. Row count per (created_year, nova_group) pair
year_counts = (
    df.groupby(["created_year", "nova_group"])
    .size()
    .reset_index(name="count")
)

# Line chart: one line per nova_group, x axis ticked every year.
yearly_sorted = year_counts.sort_values("created_year")
fig = px.line(
    yearly_sorted,
    x="created_year",
    y="count",
    color="nova_group",
    markers=True,
    title="Yillik taqsimot: har bir Nova Group bo'yicha mahsulotlar soni",
)
fig.update_layout(xaxis={"dtick": 1})
fig.show()

# Alternative view: the same counts as stacked bars
fig2 = px.bar(
    year_counts,
    x="created_year",
    y="count",
    color="nova_group",
    title="Yillik taqsimot (stacked) — Nova Group",
)
fig2.update_layout(barmode="stack", xaxis={"dtick": 1})
fig2.show()


# Healthy vs Unhealthy — is_organic, is_sugar_free bilan nova_group taqqosi

In [12]:
# 2a: Share of organic products (in percent) within each nova_group
org = (
    df.groupby("nova_group")["is_organic"]
    .mean()
    .reset_index()
    .assign(pct_organic=lambda frame: frame["is_organic"] * 100)
)

fig = px.bar(
    org,
    x="nova_group",
    y="pct_organic",
    title="Nova Group bo'yicha organik mahsulotlar foizi (%)",
    labels={"pct_organic":"% Organik", "nova_group":"Nova Group"},
)
fig.show()

# 2b: is_sugar_free — if every value is 0 the chart comes out empty,
# but the per-group totals are plotted anyway
sugar = (
    df.groupby("nova_group")["is_sugar_free"]
    .sum()
    .reset_index()
)
fig = px.bar(
    sugar,
    x="nova_group",
    y="is_sugar_free",
    title="Nova Group bo'yicha 'sugar free' count",
    labels={"is_sugar_free":"Count sugar_free", "nova_group":"Nova Group"},
)
fig.show()

# 2c: Organic vs non-organic row counts per nova_group, as stacked bars
agg = (
    df.groupby(["nova_group", "is_organic"])
    .size()
    .reset_index(name="count")
)
agg["is_organic"] = agg["is_organic"].astype(str)  # string labels give a nicer legend
fig = px.bar(
    agg,
    x="nova_group",
    y="count",
    color="is_organic",
    title="Nova Group vs Organik (count)",
)
fig.update_layout(barmode="stack")
fig.show()


# Quantity-based segmentation — quantity_value & quantity_unit

In [13]:
import numpy as np

# 3a: Boxplot (log scale) — quantity_value by nova_group
df = df.copy()  # rebind df to a copy before adding the helper columns below
# Keep strictly positive quantities; everything else becomes NaN.
# Vectorized Series.where replaces the row-wise apply(lambda ...) and yields
# the same values.
df["quantity_value_pos"] = df["quantity_value"].where(df["quantity_value"] > 0)
df["log_q"] = np.log1p(df["quantity_value_pos"])  # log(1+x) transform

fig = px.box(df, x="nova_group", y="log_q",
             title="Log(1+quantity_value) bo'yicha Nova Group (Boxplot)",
             labels={"log_q":"log(1+quantity_value)", "nova_group":"Nova Group"})
fig.show()

# 3b: Scatter: quantity_value vs product_age_days, colored by nova_group,
# marker size taken from country_count
fig = px.scatter(df, x="quantity_value", y="product_age_days", color="nova_group",
                 size="country_count", hover_data=["main_category_encoded"],
                 title="Quantity vs Age (size=country_count) — rang=Nova Group",
                 log_x=True)  # a log x axis helps when quantity spans a very wide range
fig.show()

# 3c: Boxplot per unit — drawn only when a "quantity_unit" column exists
if "quantity_unit" in df.columns:
    # Restrict to the 10 most frequent units for clarity
    top_units = df["quantity_unit"].value_counts().nlargest(10).index
    sub = df[df["quantity_unit"].isin(top_units)]
    fig = px.box(sub, x="quantity_unit", y="log_q", color="nova_group",
                 title="Top units bo'yicha log(quantity) taqsimoti (top 10 units)")
    fig.show()


# Countries count — country_count vs nova_group

In [14]:
# 4a: Box plot of country_count for each nova_group
fig = px.box(
    df,
    x="nova_group",
    y="country_count",
    title="Nova Group bo'yicha country_count (Boxplot)",
    labels={"country_count":"Country Count", "nova_group":"Nova Group"},
)
fig.show()

# 4b: Violin plot with an inner box and every point drawn (shows the
# distribution in more detail)
fig = px.violin(
    df,
    x="nova_group",
    y="country_count",
    box=True,
    points="all",
    title="Nova Group bo'yicha country_count (Violin + box + points)",
)
fig.show()

# 4c: Mean country_count per nova_group, as a bar chart
avg_country = df.groupby("nova_group", as_index=False)["country_count"].mean()
fig = px.bar(
    avg_country,
    x="nova_group",
    y="country_count",
    title="Nova Group bo'yicha o'rtacha country_count",
)
fig.show()
