In [27]:
import joblib
import spacy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm
import pandas as pd
from sklearn.metrics import accuracy_score
import re
from src.utils import *
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import json

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Data

In [3]:
# Full Bayer catalog
pickle_file_path = 'data/results/matched_values.pkl'
filtered_df_noun = pd.read_pickle(pickle_file_path)
filtered_df_noun.head(2)

Unnamed: 0,cleaned_noun,category_noun
0,cylinder,"Valves, Actuator, Fittings"
1,cylinder,"Valves, Actuator, Fittings"


In [85]:
# 1k parts Adam reviewed
df_mil=pd.read_csv('data/1k_Parts/BAYER_mat_group_flags_20241206_1k‑verified_20241210.csv')
df_mil = df_mil.dropna(subset=['noun'])  # Some NaN values at the end
df_mil['cleaned_noun'] = df_mil['noun'].apply(clean_category_text)

In [5]:
# Non matched dataset
df_non_matched=pd.read_pickle('data/results/non_matched_nouns.pkl')

### Embeddings. New model

In [82]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

In [7]:
X_train_embeddings = pd.read_csv('models/embeddings/sbert_multilingual_train_embeddings.csv').values
X_test_embeddings = pd.read_csv('models/embeddings/sbert_multilingual_test_embeddings.csv').values

encoder = joblib.load('models/embeddings/label_encoder.pkl')

In [107]:
X = filtered_df_noun['cleaned_noun'].astype(str)  # Ensure input is string
y = filtered_df_noun['category_noun']
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.15, random_state=42)

In [84]:
X_train

326985                          elbow
182828                           disc
367195    lager und getriebeanordnung
131705                           ball
206907                        bushing
                     ...             
205626                         spring
447800                         nipple
228221                       retainer
255454                       tornillo
209042                        adapter
Name: cleaned_noun, Length: 233539, dtype: object

In [11]:
# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

# Convert class weights to a dictionary
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Train a Random Forest model with class weights
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight=class_weights_dict  # Apply computed weights
)
rf.fit(X_train_embeddings, y_train)

# Evaluate the model
rf_pred = rf.predict(X_test_embeddings)
print("Random Forest Results:")
print(classification_report(y_test, rf_pred, target_names=encoder.classes_))


Random Forest Results:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.94      0.94      0.94      6834
                Equipment OEM Spare Parts       0.77      0.93      0.84       765
                     Facility consumables       0.93      0.91      0.92      1677
                                Fasteners       0.95      0.98      0.96     19434
       Gas, water and sewage installation       0.96      0.96      0.96      3893
                         Piping Materials       1.00      0.94      0.97      3033
               Valves, Actuator, Fittings       0.99      0.92      0.95      5577

                                 accuracy                           0.95     41213
                                macro avg       0.94      0.94      0.94     41213
                             weighted avg       0.96      0.95      0.95     41213



In [12]:
sbert_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Optimized version
new_embeddings = sbert_model.encode(df_mil['cleaned_noun'].tolist(), batch_size=16, show_progress_bar=True)
probabilities = rf.predict_proba(new_embeddings)
probabilities_df = pd.DataFrame(probabilities, columns=encoder.classes_)
probabilities_df = probabilities_df.round(2)  # Round for readability
df_mil['max_probability'] = probabilities_df.max(axis=1)
df_mil['predicted_category'] = probabilities_df.idxmax(axis=1)

# Check why the value retrieved in the last case is zero
df_mil=df_mil.dropna(subset=['max_probability'])

# Compute the metrics
df_mil['qa_label'] = df_mil['qa_label'].astype(str)
df_mil['predicted_category'] = df_mil['predicted_category'].astype(str)
accuracy = accuracy_score(df_mil['qa_label'], df_mil['predicted_category'])
report = classification_report(df_mil['qa_label'], df_mil['predicted_category'])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Batches:   0%|          | 0/62 [00:00<?, ?it/s]

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.59      0.59      0.59       190
                Equipment OEM Spare Parts       0.43      0.07      0.13       121
                     Facility consumables       0.33      0.41      0.37        27
                                Fasteners       0.64      0.89      0.74       329
       Gas, water and sewage installation       0.48      0.53      0.50        96
                         Piping Materials       0.69      0.54      0.61        67
               Valves, Actuator, Fittings       0.68      0.54      0.60       157

                                 accuracy                           0.60       987
                                macro avg       0.55      0.51      0.51       987
                             weighted avg       0.59      0.60      0.57       987



In [23]:
# For the unmatched nouns
# new_embeddings = pd.read_csv('models/embeddings/sbert_multilingual_train_embeddings_non_matched.csv').values
sbert_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Optimized version
new_embeddings = sbert_model.encode(df_non_matched['cleaned_noun'].tolist(), batch_size=16, show_progress_bar=True)

# Apply for the non-matched dataframe
probabilities = rf.predict_proba(new_embeddings)
probabilities_df = pd.DataFrame(probabilities, columns=encoder.classes_)
probabilities_df = probabilities_df.round(2)  # Round for readability
df_non_matched['max_probability'] = probabilities_df.max(axis=1)
df_non_matched['predicted_category'] = probabilities_df.idxmax(axis=1)
df_non_matched.max_probability.describe()

Batches:   0%|          | 0/6755 [00:00<?, ?it/s]

count    21938.000000
mean         0.274659
std          0.088121
min          0.160000
25%          0.220000
50%          0.250000
75%          0.290000
max          0.910000
Name: max_probability, dtype: float64

In [24]:
# cleaned_noun=df_non_matched['cleaned_noun'].tolist()
# probabilities = rf.predict_proba(new_embeddings)
# probabilities_df = pd.DataFrame(probabilities, columns=encoder.classes_)
# probabilities_df = probabilities_df.round(2)
# probabilities_df['max_probability'] = probabilities_df.max(axis=1)
# probabilities_df['predicted_category'] = probabilities_df.idxmax(axis=1)
# probabilities_df['cleaned_noun'] = cleaned_noun
# probabilities_df.head

Including Grid Search

In [25]:
# # Define parameters for Grid Search (Reduced Options)
# rf_params = {
#     'n_estimators': [100, 200],       # Reduced options for number of trees
#     'max_depth': [10, None],          # Simplified depth options
#     'min_samples_split': [2, 5],      # Fewer split options
#     'min_samples_leaf': [1, 2]        # Fewer leaf options
# }

# # Create the Random Forest model
# rf = RandomForestClassifier(random_state=42)

# # Perform Grid Search with 3-fold cross-validation (faster)
# rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
# rf_grid.fit(X_train_embeddings, y_train)

# # Retrieve the best model
# best_rf = rf_grid.best_estimator_

# # Evaluate the best model
# rf_pred = best_rf.predict(X_test_embeddings)
# print("Random Forest Results:")
# print(classification_report(y_test, rf_pred, target_names=encoder.classes_))


### Analyze previous df_mil

In [26]:
df_mil_review=pd.read_pickle('df_mil_embeddings.pkl')

In [32]:
df_mil.head()

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,script_helper,mfr_helper,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category
0,61959169,SPARROW_PROCESSING,,00886,,Agitator disc Polyurethan,Agitator disc Polyurethan; ; Agitator disc Pol...,Agitator disc Polyurethan ( 2 Nos.); Machnine ...,,,...,False,,False,Null result present,22104400,"Gas, water and sewage installation",agitator? Stirring maybe? A kind of turbine?,agitator disc,0.25,Fasteners
1,61999316,SPARROW_PROCESSING,,BCH16HF07330A5C,BCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C; BCH16HF07...,Canesten Bottel line,,,...,False,,False,Null result present,27140000,"Electrical Installation Materials, device",bad extraction,motor,0.98,"Valves, Actuator, Fittings"
2,61768659,SPARROW_PROCESSING,,BREITNER-ABFÜLLANLAGE,,Motor Antriebsband,Motor Antriebsband; BREITNER-ABFÜLLANLAGE;,,,,...,False,,False,Null result present,23070100,Fasteners,bad extraction,motor,0.98,"Valves, Actuator, Fittings"
3,61757206,SPARROW_PROCESSING,,,,TOOL RING SPANNER 8X9MM,TOOL RING SPANNER 8X9MM; ;,,,,...,False,,False,Null result present,99501430,Facility consumables,bad extraction,ring,0.83,Fasteners
4,62069531,SPARROW_PROCESSING,API Schmidt-Bretten,XZ-50114,XZ-50114,Dichtung SIGMA M 19 NBR f.Anfangsplatte,Dichtung SIGMA M 19 NBR f.Anfangsplatte; API ...,Benennung: Dichtung; Typ: SIGMA M 19; Art/Ausf...,,,...,False,False,,Null result present,23070100,Fasteners,bad string match,dichtung,0.62,Fasteners


Matching for the dictionaries

In [31]:
with open('data/dict/term_dict_de.json', 'r') as f:
    dict_de = json.load(f)

with open('data/dict/term_dict_en.json', 'r') as f:
    dict_en = json.load(f)

with open('data/dict/term_dict_es.json', 'r') as f:
    dict_es = json.load(f)

dict = dict_de.copy()  # Make a copy to avoid modifying the original
dict.update(dict_en)
dict.update(dict_es)

In [80]:
dict_en

{'accumulator': 'Electrical Installation Materials, device',
 'acoustic detector': 'Electrical Installation Materials, device',
 'ai': 'Electrical Installation Materials, device',
 'air duct sensor': 'Electrical Installation Materials, device',
 'alternating current motors': 'Electrical Installation Materials, device',
 'alternating current switch': 'Electrical Installation Materials, device',
 'alternator': 'Electrical Installation Materials, device',
 'ampli': 'Electrical Installation Materials, device',
 'amplif': 'Electrical Installation Materials, device',
 'amplificateur': 'Electrical Installation Materials, device',
 'amplifier': 'Electrical Installation Materials, device',
 'analog': 'Electrical Installation Materials, device',
 'analog input': 'Electrical Installation Materials, device',
 'analog input module': 'Electrical Installation Materials, device',
 'analog module': 'Electrical Installation Materials, device',
 'analog output': 'Electrical Installation Materials, device

In [33]:
pattern_dict = {term: re.compile(r'\b' + re.escape(term) + r'\b', flags=re.IGNORECASE) for term in dict}

# Function to find the matching term and assign the category
def match_terms_vectorized(noun, pattern_dict):
    for term, pattern in pattern_dict.items():
        if pattern.search(str(noun)):  
            return dict[term], term
    return None, None 

# Apply the function to the whole column
df_mil_review[['matched_category', 'matched_noun']] = df_mil_review['cleaned_noun'].apply(lambda noun: pd.Series(match_terms_vectorized(noun, pattern_dict)))

In [34]:
df_mil_review.head()

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category,matched_category,matched_noun
0,61959169,SPARROW_PROCESSING,,00886,,Agitator disc Polyurethan,Agitator disc Polyurethan; ; Agitator disc Pol...,Agitator disc Polyurethan ( 2 Nos.); Machnine ...,,,...,False,Null result present,22104400,"Gas, water and sewage installation",agitator? Stirring maybe? A kind of turbine?,agitator disc,0.94,Fasteners,Fasteners,disc
1,61999316,SPARROW_PROCESSING,,BCH16HF07330A5C,BCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C; BCH16HF07...,Canesten Bottel line,,,...,False,Null result present,27140000,"Electrical Installation Materials, device",bad extraction,motor,1.0,"Valves, Actuator, Fittings","Valves, Actuator, Fittings",motor
2,61768659,SPARROW_PROCESSING,,BREITNER-ABFÜLLANLAGE,,Motor Antriebsband,Motor Antriebsband; BREITNER-ABFÜLLANLAGE;,,,,...,False,Null result present,23070100,Fasteners,bad extraction,motor,1.0,"Valves, Actuator, Fittings","Valves, Actuator, Fittings",motor
3,61757206,SPARROW_PROCESSING,,,,TOOL RING SPANNER 8X9MM,TOOL RING SPANNER 8X9MM; ;,,,,...,False,Null result present,99501430,Facility consumables,bad extraction,ring,1.0,Fasteners,Fasteners,ring
4,62069531,SPARROW_PROCESSING,API Schmidt-Bretten,XZ-50114,XZ-50114,Dichtung SIGMA M 19 NBR f.Anfangsplatte,Dichtung SIGMA M 19 NBR f.Anfangsplatte; API ...,Benennung: Dichtung; Typ: SIGMA M 19; Art/Ausf...,,,...,,Null result present,23070100,Fasteners,bad string match,dichtung,0.98,Fasteners,Fasteners,dichtung


In [37]:
df_mil_review.shape

(987, 33)

In [35]:
df_mil_review.matched_category.value_counts()

matched_category
Fasteners                                    404
Electrical Installation Materials, device    172
Valves, Actuator, Fittings                   141
Gas, water and sewage installation           105
Piping Materials                              58
Facility consumables                          30
Equipment OEM Spare Parts                      9
Name: count, dtype: int64

In [40]:
# Number of NaN values
column_name = 'matched_category'  
nan_count = df_mil_review[column_name].isna().sum()
print(f"Number of NaN values in '{column_name}': {nan_count}")

Number of NaN values in 'matched_category': 68


Check comparing matched nouns and unmatched

In [58]:
predicted = df_mil_review['predicted_category']
matched = df_mil_review['matched_category']
qa_label = df_mil_review['qa_label']

valid_matched = ~matched.isna()

report = classification_report(matched[valid_matched], predicted[valid_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.66      0.64      0.65       172
                Equipment OEM Spare Parts       0.38      0.33      0.35         9
                     Facility consumables       0.50      0.43      0.46        30
                                Fasteners       0.78      0.84      0.81       404
       Gas, water and sewage installation       0.63      0.58      0.60       105
                         Piping Materials       0.72      0.62      0.67        58
               Valves, Actuator, Fittings       0.66      0.65      0.65       141

                                 accuracy                           0.71       919
                                macro avg       0.62      0.58      0.60       919
                             weighted avg       0.71      0.71      0.71       919



In [81]:
df_mil_review[df_mil_review['matched_noun'] == 'anillo']

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category,matched_category,matched_noun
15,61782339,SPARROW_PROCESSING,Kilian,213813.0,,ANILLO OPRESOR;;213813,ANILLO OPRESOR;;213813; KILIAN_213813; Anillo ...,Anillo opresor para disco,,,...,False,Null result present,99501340,Equipment OEM Spare Parts,Engine device - piston clamp,anillo,0.95,Fasteners,Fasteners,anillo
84,61935909,SPARROW_PROCESSING,,,,ANILLO UNICONO 4MM,ANILLO UNICONO 4MM; ;,,,,...,False,Null result present,23070100,Fasteners,,anillo,0.72,"Electrical Installation Materials, device",Fasteners,anillo


In [88]:
df_mil_review.to_csv('df_mil_review_embeddings.csv')

In [73]:
df_mismatch = df_mil_review[df_mil_review['predicted_category'] != df_mil_review['matched_category']]
# df_mismatch = df_mismatch.dropna(subset=['predicted_category'])
# Remove rows where 'predicted_category' is None or NaN
df_mismatch = df_mismatch[df_mismatch['matched_category'].notna()]
df_mismatch.shape

(266, 33)

Check with Adams categories

In [59]:
report = classification_report(qa_label[valid_matched], predicted[valid_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.62      0.59      0.60       174
                Equipment OEM Spare Parts       0.62      0.05      0.09        99
                     Facility consumables       0.42      0.46      0.44        24
                                Fasteners       0.65      0.88      0.75       319
       Gas, water and sewage installation       0.49      0.55      0.52        87
                         Piping Materials       0.72      0.55      0.63        65
               Valves, Actuator, Fittings       0.62      0.56      0.59       151

                                 accuracy                           0.62       919
                                macro avg       0.59      0.52      0.52       919
                             weighted avg       0.62      0.62      0.59       919



Check for the unmatched nouns

In [60]:
nan_matched = matched.isna()
report = classification_report(qa_label[nan_matched], predicted[nan_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.31      0.31      0.31        16
                Equipment OEM Spare Parts       0.00      0.00      0.00        22
                     Facility consumables       0.00      0.00      0.00         3
                                Fasteners       0.11      0.20      0.14        10
       Gas, water and sewage installation       0.25      0.44      0.32         9
                         Piping Materials       0.00      0.00      0.00         2
               Valves, Actuator, Fittings       0.25      0.33      0.29         6

                                 accuracy                           0.19        68
                                macro avg       0.13      0.18      0.15        68
                             weighted avg       0.15      0.19      0.16        68



Adam reviews and matched

In [77]:
report = classification_report(qa_label[valid_matched], matched[valid_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.88      0.87      0.88       174
                Equipment OEM Spare Parts       0.89      0.08      0.15        99
                     Facility consumables       0.60      0.75      0.67        24
                                Fasteners       0.78      0.98      0.87       319
       Gas, water and sewage installation       0.68      0.82      0.74        87
                         Piping Materials       0.95      0.85      0.89        65
               Valves, Actuator, Fittings       0.89      0.83      0.86       151

                                 accuracy                           0.81       919
                                macro avg       0.81      0.74      0.72       919
                             weighted avg       0.83      0.81      0.78       919



In [None]:
df_mil_review=pd.read_pickle('df_mil_embeddings.pkl')

In [None]:
df_mil.head()

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,script_helper,mfr_helper,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category
0,61959169,SPARROW_PROCESSING,,00886,,Agitator disc Polyurethan,Agitator disc Polyurethan; ; Agitator disc Pol...,Agitator disc Polyurethan ( 2 Nos.); Machnine ...,,,...,False,,False,Null result present,22104400,"Gas, water and sewage installation",agitator? Stirring maybe? A kind of turbine?,agitator disc,0.25,Fasteners
1,61999316,SPARROW_PROCESSING,,BCH16HF07330A5C,BCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C; BCH16HF07...,Canesten Bottel line,,,...,False,,False,Null result present,27140000,"Electrical Installation Materials, device",bad extraction,motor,0.98,"Valves, Actuator, Fittings"
2,61768659,SPARROW_PROCESSING,,BREITNER-ABFÜLLANLAGE,,Motor Antriebsband,Motor Antriebsband; BREITNER-ABFÜLLANLAGE;,,,,...,False,,False,Null result present,23070100,Fasteners,bad extraction,motor,0.98,"Valves, Actuator, Fittings"
3,61757206,SPARROW_PROCESSING,,,,TOOL RING SPANNER 8X9MM,TOOL RING SPANNER 8X9MM; ;,,,,...,False,,False,Null result present,99501430,Facility consumables,bad extraction,ring,0.83,Fasteners
4,62069531,SPARROW_PROCESSING,API Schmidt-Bretten,XZ-50114,XZ-50114,Dichtung SIGMA M 19 NBR f.Anfangsplatte,Dichtung SIGMA M 19 NBR f.Anfangsplatte; API ...,Benennung: Dichtung; Typ: SIGMA M 19; Art/Ausf...,,,...,False,False,,Null result present,23070100,Fasteners,bad string match,dichtung,0.62,Fasteners


Matching for the dictionaries

In [89]:
with open('data/dict/term_dict_de.json', 'r') as f:
    dict_de = json.load(f)

with open('data/dict/term_dict_en.json', 'r') as f:
    dict_en = json.load(f)

with open('data/dict/term_dict_es.json', 'r') as f:
    dict_es = json.load(f)

dict = dict_de.copy()  # Make a copy to avoid modifying the original
dict.update(dict_en)
dict.update(dict_es)

In [90]:
dict_en

{'accumulator': 'Electrical Installation Materials, device',
 'acoustic detector': 'Electrical Installation Materials, device',
 'ai': 'Electrical Installation Materials, device',
 'air duct sensor': 'Electrical Installation Materials, device',
 'alternating current motors': 'Electrical Installation Materials, device',
 'alternating current switch': 'Electrical Installation Materials, device',
 'alternator': 'Electrical Installation Materials, device',
 'ampli': 'Electrical Installation Materials, device',
 'amplif': 'Electrical Installation Materials, device',
 'amplificateur': 'Electrical Installation Materials, device',
 'amplifier': 'Electrical Installation Materials, device',
 'analog': 'Electrical Installation Materials, device',
 'analog input': 'Electrical Installation Materials, device',
 'analog input module': 'Electrical Installation Materials, device',
 'analog module': 'Electrical Installation Materials, device',
 'analog output': 'Electrical Installation Materials, device

In [91]:
pattern_dict = {term: re.compile(r'\b' + re.escape(term) + r'\b', flags=re.IGNORECASE) for term in dict}

# Function to find the matching term and assign the category
def match_terms_vectorized(noun, pattern_dict):
    for term, pattern in pattern_dict.items():
        if pattern.search(str(noun)):  
            return dict[term], term
    return None, None 

# Apply the function to the whole column
df_mil_review[['matched_category', 'matched_noun']] = df_mil_review['cleaned_noun'].apply(lambda noun: pd.Series(match_terms_vectorized(noun, pattern_dict)))

In [92]:
df_mil_review.head()

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category,matched_category,matched_noun
0,61959169,SPARROW_PROCESSING,,00886,,Agitator disc Polyurethan,Agitator disc Polyurethan; ; Agitator disc Pol...,Agitator disc Polyurethan ( 2 Nos.); Machnine ...,,,...,False,Null result present,22104400,"Gas, water and sewage installation",agitator? Stirring maybe? A kind of turbine?,agitator disc,0.94,Fasteners,Fasteners,disc
1,61999316,SPARROW_PROCESSING,,BCH16HF07330A5C,BCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C; BCH16HF07...,Canesten Bottel line,,,...,False,Null result present,27140000,"Electrical Installation Materials, device",bad extraction,motor,1.0,"Valves, Actuator, Fittings","Valves, Actuator, Fittings",motor
2,61768659,SPARROW_PROCESSING,,BREITNER-ABFÜLLANLAGE,,Motor Antriebsband,Motor Antriebsband; BREITNER-ABFÜLLANLAGE;,,,,...,False,Null result present,23070100,Fasteners,bad extraction,motor,1.0,"Valves, Actuator, Fittings","Valves, Actuator, Fittings",motor
3,61757206,SPARROW_PROCESSING,,,,TOOL RING SPANNER 8X9MM,TOOL RING SPANNER 8X9MM; ;,,,,...,False,Null result present,99501430,Facility consumables,bad extraction,ring,1.0,Fasteners,Fasteners,ring
4,62069531,SPARROW_PROCESSING,API Schmidt-Bretten,XZ-50114,XZ-50114,Dichtung SIGMA M 19 NBR f.Anfangsplatte,Dichtung SIGMA M 19 NBR f.Anfangsplatte; API ...,Benennung: Dichtung; Typ: SIGMA M 19; Art/Ausf...,,,...,,Null result present,23070100,Fasteners,bad string match,dichtung,0.98,Fasteners,Fasteners,dichtung


In [93]:
df_mil_review.shape

(987, 33)

In [94]:
df_mil_review.matched_category.value_counts()

matched_category
Fasteners                                    404
Electrical Installation Materials, device    172
Valves, Actuator, Fittings                   141
Gas, water and sewage installation           105
Piping Materials                              58
Facility consumables                          30
Equipment OEM Spare Parts                      9
Name: count, dtype: int64

In [95]:
# Number of NaN values
column_name = 'matched_category'  
nan_count = df_mil_review[column_name].isna().sum()
print(f"Number of NaN values in '{column_name}': {nan_count}")

Number of NaN values in 'matched_category': 68


In [97]:
import openai

In [101]:
openai.api_key = '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'

In [104]:
X_test.shape

(41213,)

In [108]:
y_test

347020           Gas, water and sewage installation
29901                                     Fasteners
405840                                    Fasteners
308164                                    Fasteners
217051                             Piping Materials
                            ...                    
212495                                    Fasteners
335522    Electrical Installation Materials, device
371074                                    Fasteners
193726    Electrical Installation Materials, device
476410                   Valves, Actuator, Fittings
Name: category_noun, Length: 41213, dtype: object

In [110]:
# Prepare inputs and actual labels from the dataset
inputs = X_test[0:1000].tolist()
actual_labels =y_test[0:1000].tolist()

# Generate predictions using the fine-tuned model
predictions = []

for text in inputs:
    try:
        response = openai.ChatCompletion.create(
            model="ft:gpt-3.5-turbo-0125:sparrow::AnNDUonF",
            messages=[
                {"role": "system", "content": "You are a helpful assistant for product classification."},
                {"role": "user", "content": text}
            ]
        )
        predictions.append(response['choices'][0]['message']['content'].strip())
    except Exception as e:
        predictions.append("Error")  # Handle any errors during API call

# Compute accuracy and classification report
y_test_pred = predictions
accuracy = accuracy_score(actual_labels, predictions)
report = classification_report(actual_labels, predictions, zero_division=0)

# Display results
report_df = pd.DataFrame(classification_report(actual_labels, predictions, output_dict=True)).transpose()
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96


In [111]:
df_non_matched.head()

Unnamed: 0,product_id,quality_status,manufacturer_name,manufacturer_pid,model,short_description,long_description,additional_description,noun,norm,scope,site,cleaned_noun,category_noun,matched_noun,word_count,max_probability,predicted_category
13,101050474,SPARROW_APPROVED,CKD,STR2-B-10-10-6,STR2-B-10-10-6,ツインロッドair cylinder,,,Höinrodd,,In,YA2Z Shiga,höinrodd,,,1,0.24,Fasteners
14,101050473,SPARROW_APPROVED,CKD,SSD-KL-32C-60-T3H3-D-N-8,SSD-KL-32C-60-T3H3-D-N-8,スーパーコンパクトair cylinder,,,スüßbürz,,In,YA2Z Shiga,スüßbürz,,,1,0.3,"Electrical Installation Materials, device"
15,101050472,SPARROW_APPROVED,CKD,SSD-KL-40-250-T3H3-D-N-8,SSD-KL-40-250-T3H3-D-N-8,スーパーコンパクトair cylinder,,,スüßbürz,,In,YA2Z Shiga,スüßbürz,,,1,0.23,Fasteners
34,101050454,SPARROW_PROCESSING,CKD,SMG-L-20-25-K3V3-D,SMG-L-20-25-K3V3-D,スーパーマウントair cylinder,,,スüßbema,,In,YA2Z Shiga,スüßbema,,,1,0.24,Fasteners
35,101050453,SPARROW_PROCESSING,Habasit,H-4EMDT,,ベルト（450×1860）,,,Белрт,,In,YA2Z Shiga,белрт,,,1,0.24,Fasteners


In [114]:
# Prepare inputs and actual labels from the dataset
inputs = df_non_matched['cleaned_noun'][0:1000].tolist()
# actual_labels =y_test[0:1000].tolist()

# Generate predictions using the fine-tuned model
predictions_non_match = []

for text in inputs:
    try:
        response = openai.ChatCompletion.create(
            model="ft:gpt-3.5-turbo-0125:sparrow::AnNDUonF",
            messages=[
                {"role": "system", "content": "You are a helpful assistant for product classification."},
                {"role": "user", "content": text}
            ]
        )
        predictions_non_match.append(response['choices'][0]['message']['content'].strip())
    except Exception as e:
        predictions_non_match.append("Error")  # Handle any errors during API call

# Compute accuracy and classification report
y_test_pred_non_matched = predictions_non_match
# accuracy = accuracy_score(actual_labels, predictions)
# report = classification_report(actual_labels, predictions, zero_division=0)

# # Display results
# report_df = pd.DataFrame(classification_report(actual_labels, predictions, output_dict=True)).transpose()
# print(f"Accuracy: {accuracy:.2f}")

Check comparing matched nouns and unmatched

In [96]:
predicted = df_mil_review['predicted_category']
matched = df_mil_review['matched_category']
qa_label = df_mil_review['qa_label']

valid_matched = ~matched.isna()

report = classification_report(matched[valid_matched], predicted[valid_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.66      0.64      0.65       172
                Equipment OEM Spare Parts       0.38      0.33      0.35         9
                     Facility consumables       0.50      0.43      0.46        30
                                Fasteners       0.78      0.84      0.81       404
       Gas, water and sewage installation       0.63      0.58      0.60       105
                         Piping Materials       0.72      0.62      0.67        58
               Valves, Actuator, Fittings       0.66      0.65      0.65       141

                                 accuracy                           0.71       919
                                macro avg       0.62      0.58      0.60       919
                             weighted avg       0.71      0.71      0.71       919



In [None]:
df_mil_review[df_mil_review['matched_noun'] == 'anillo']

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category,matched_category,matched_noun
15,61782339,SPARROW_PROCESSING,Kilian,213813.0,,ANILLO OPRESOR;;213813,ANILLO OPRESOR;;213813; KILIAN_213813; Anillo ...,Anillo opresor para disco,,,...,False,Null result present,99501340,Equipment OEM Spare Parts,Engine device - piston clamp,anillo,0.95,Fasteners,Fasteners,anillo
84,61935909,SPARROW_PROCESSING,,,,ANILLO UNICONO 4MM,ANILLO UNICONO 4MM; ;,,,,...,False,Null result present,23070100,Fasteners,,anillo,0.72,"Electrical Installation Materials, device",Fasteners,anillo


In [None]:
df_mismatch = df_mil_review[df_mil_review['predicted_category'] != df_mil_review['matched_category']]
# df_mismatch = df_mismatch.dropna(subset=['predicted_category'])
# Remove rows where 'predicted_category' is None or NaN
df_mismatch = df_mismatch[df_mismatch['matched_category'].notna()]
df_mismatch.shape

(266, 33)

Check with Adams categories

In [None]:
report = classification_report(qa_label[valid_matched], predicted[valid_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.62      0.59      0.60       174
                Equipment OEM Spare Parts       0.62      0.05      0.09        99
                     Facility consumables       0.42      0.46      0.44        24
                                Fasteners       0.65      0.88      0.75       319
       Gas, water and sewage installation       0.49      0.55      0.52        87
                         Piping Materials       0.72      0.55      0.63        65
               Valves, Actuator, Fittings       0.62      0.56      0.59       151

                                 accuracy                           0.62       919
                                macro avg       0.59      0.52      0.52       919
                             weighted avg       0.62      0.62      0.59       919



Check for the unmatched nouns

In [None]:
nan_matched = matched.isna()
report = classification_report(qa_label[nan_matched], predicted[nan_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.31      0.31      0.31        16
                Equipment OEM Spare Parts       0.00      0.00      0.00        22
                     Facility consumables       0.00      0.00      0.00         3
                                Fasteners       0.11      0.20      0.14        10
       Gas, water and sewage installation       0.25      0.44      0.32         9
                         Piping Materials       0.00      0.00      0.00         2
               Valves, Actuator, Fittings       0.25      0.33      0.29         6

                                 accuracy                           0.19        68
                                macro avg       0.13      0.18      0.15        68
                             weighted avg       0.15      0.19      0.16        68



Adam reviews and matched

In [None]:
report = classification_report(qa_label[valid_matched], matched[valid_matched])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.60

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.88      0.87      0.88       174
                Equipment OEM Spare Parts       0.89      0.08      0.15        99
                     Facility consumables       0.60      0.75      0.67        24
                                Fasteners       0.78      0.98      0.87       319
       Gas, water and sewage installation       0.68      0.82      0.74        87
                         Piping Materials       0.95      0.85      0.89        65
               Valves, Actuator, Fittings       0.89      0.83      0.86       151

                                 accuracy                           0.81       919
                                macro avg       0.81      0.74      0.72       919
                             weighted avg       0.83      0.81      0.78       919

