#Predikcija kategorije proizvoda na osnovu naslova 
#Cilj projekta:
#Cilj ovog projekta je izgradnja modela masinskog ucenja koji,na osnovu naziva proizvoda(Product Title),predvidja njegovu kategoriju(Category Label).
#U ovoj svesci radimo: analizu podataka, osnovni feature engineering, pripremu za treniranje modela.

In [19]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
plt.style.use("default")

#Ucitavnaje podataka 

In [20]:
df=pd.read_csv("../data/products.csv")
df.head()

Unnamed: 0,product ID,Product Title,Merchant ID,Category Label,_Product Code,Number_of_Views,Merchant Rating,Listing Date
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024
3,4,apple iphone 8 plus 64gb space grey,4,Mobile Phones,YI-0086-US,466.0,3.4,5/2/2022
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,Mobile Phones,NZ-3586-WP,4426.0,1.6,4/12/2023


In [21]:
print("Shape:", df.shape)
df.info()

Shape: (35311, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35311 entries, 0 to 35310
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product ID       35311 non-null  int64  
 1   Product Title    35139 non-null  object 
 2   Merchant ID      35311 non-null  int64  
 3    Category Label  35267 non-null  object 
 4   _Product Code    35216 non-null  object 
 5   Number_of_Views  35297 non-null  float64
 6   Merchant Rating  35141 non-null  float64
 7    Listing Date    35252 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 2.2+ MB


In [22]:
df.columns

Index(['product ID', 'Product Title', 'Merchant ID', ' Category Label',
       '_Product Code', 'Number_of_Views', 'Merchant Rating',
       ' Listing Date  '],
      dtype='object')

#Feature engineering

In [23]:
df["Product Title"]=df["Product Title"].fillna("").astype(str)

In [24]:
import re 
df["word_count"]=df["Product Title"].apply(lambda x: len(x.split()))
df["has_number"]= df["Product Title"].apply(lambda x:int(bool(re.search(r"\d",x))))
df["has_uppercase_word"]=df["Product Title"].apply(lambda x:int(any(word.isupper() and len(word)> 1 for word in x.split())))
df[["word_count","has_number","has_uppercase_word"]].head()

Unnamed: 0,word_count,has_number,has_uppercase_word
0,6,1,0
1,7,1,0
2,13,1,0
3,7,1,0
4,11,1,0


#Exploratory Data Analysis (EDA)


In [25]:
df.isnull().sum()

product ID              0
Product Title           0
Merchant ID             0
 Category Label        44
_Product Code          95
Number_of_Views        14
Merchant Rating       170
 Listing Date          59
word_count              0
has_number              0
has_uppercase_word      0
dtype: int64

#Koliko ima kategorija

In [26]:
df[" Category Label"].value_counts()

 Category Label
Fridge Freezers     5495
Washing Machines    4036
Mobile Phones       4020
CPUs                3771
TVs                 3564
Fridges             3457
Dishwashers         3418
Digital Cameras     2696
Microwaves          2338
Freezers            2210
fridge               123
CPU                   84
Mobile Phone          55
Name: count, dtype: int64

In [27]:
df[" Category Label"].value_counts().head(10)

 Category Label
Fridge Freezers     5495
Washing Machines    4036
Mobile Phones       4020
CPUs                3771
TVs                 3564
Fridges             3457
Dishwashers         3418
Digital Cameras     2696
Microwaves          2338
Freezers            2210
Name: count, dtype: int64

#Zadrzavanje samo bitnih kolona 

In [28]:
df_ml=df[["Product Title", " Category Label"]].dropna()
df_ml.head()

Unnamed: 0,Product Title,Category Label
0,apple iphone 8 plus 64gb silver,Mobile Phones
1,apple iphone 8 plus 64 gb spacegrau,Mobile Phones
2,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,Mobile Phones
3,apple iphone 8 plus 64gb space grey,Mobile Phones
4,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,Mobile Phones


In [29]:
df_ml.shape

(35267, 2)

#Razdvajanje X i y

In [30]:
X=df_ml["Product Title"]
y=df_ml[" Category Label"]

#Train i test split

In [31]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

#Model 1: Logistic Regression (Baseline model ili stari model)

In [32]:
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
model=Pipeline([("tfidf",TfidfVectorizer(lowercase=True,stop_words="english",max_features=5000)),("clf",LogisticRegression(max_iter=1000))])

In [33]:
model.fit(X_train,y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [34]:
from sklearn.metrics import accuracy_score
y_pred_log=model.predict(X_test)
acc_log=accuracy_score(y_test,y_pred_log)
acc_log

0.9428692940175787

#Model 2: Naive Bayes(MultinomialNB)

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
nb_model=Pipeline([("tfidf",TfidfVectorizer(lowercase=True,stop_words="english")),("nb",MultinomialNB())])
nb_model.fit(X_train, y_train)
y_pred_nb=nb_model.predict(X_test)
acc_nb=accuracy_score(y_test,y_pred_nb)
acc_nb

0.9233059257159059

#Model 3: Linear SVM(Linear SVC)

In [36]:
from sklearn.svm import LinearSVC
svm_model=Pipeline([("tfidf",TfidfVectorizer(lowercase=True,stop_words="english")),("svm",LinearSVC())])
svm_model.fit(X_train, y_train)
y_pred_svm=svm_model.predict(X_test)
acc_svm=accuracy_score(y_test, y_pred_svm)
acc_svm

0.9567621207825348

#Uporedni pregled

In [37]:
import pandas as pd 
results=pd.DataFrame({"Model":["Logistic Regression","Naive Bayes","Linear SVM"],"Accuracy":[acc_log,acc_nb,acc_svm]})
results.sort_values("Accuracy",ascending=False)

Unnamed: 0,Model,Accuracy
2,Linear SVM,0.956762
0,Logistic Regression,0.942869
1,Naive Bayes,0.923306


#Nakon provodjenja Logistic Regression,Naive Bayes i Linear SVM modela, Linear SVM je ostvario najvecu tacnost (~96,7%).
#Zbog toga je izabran kao finalni model za treniranje i produkciju.

#Finalni model (TF-IDF + engineered features)

Na osnovu predhodnih eksperimenata i uporedne analize, odabran je finalni model koji kombinuje tekstualne i numericke feature-e putem ColumnTransformer-a.

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

#Train/test split

In [39]:
df_final=df[["Product Title","word_count","has_number","has_uppercase_word"," Category Label"]].dropna()

In [40]:
X=df_final[["Product Title","word_count","has_number","has_uppercase_word"]]
y=df_final[" Category Label"]

In [41]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

#Preprocessing(tekst + numerika)

In [42]:
preprocessor=ColumnTransformer(transformers=[("text",TfidfVectorizer(lowercase=True,stop_words="english",max_features=5000),"Product Title"),("num",StandardScaler(),["word_count","has_number","has_uppercase_word"])])

#Finalni pipeline 

In [43]:
final_model=Pipeline(steps=[("preprocessor",preprocessor),("classifier",LogisticRegression(max_iter=1000))])

#Treniranje

In [44]:
final_model.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


#Evaluacija

In [45]:
y_pred=final_model.predict(X_test)
print("Accuarcy:",accuracy_score(y_test,y_pred))
print("\nClassification report: \n")
print(classification_report(y_test,y_pred))

Accuarcy: 0.9440034023249221

Classification report: 

                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      0.99      0.98       754
 Digital Cameras       0.99      0.99      0.99       539
     Dishwashers       0.94      0.95      0.94       684
        Freezers       0.98      0.88      0.93       442
 Fridge Freezers       0.92      0.93      0.92      1099
         Fridges       0.86      0.88      0.87       691
      Microwaves       1.00      0.96      0.98       468
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.92      1.00      0.96       804
             TVs       0.98      0.97      0.98       713
Washing Machines       0.94      0.95      0.95       807
          fridge       0.00      0.00      0.00        25

        accuracy                           0.94      7054
       macro avg       0.73      0.73      0.73      7054
    weighted av

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


#Model postize 94% tacnosti.
#Slabije klase imaju mali broj uzoraka,sto utice na macro prosjek, ali weighted prosjek potvrdjuje stabilnost modela.