In [26]:
import pickle
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import numpy as np
import joblib
import re
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # Important for combining SMOTE with other steps
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from sklearn.preprocessing import LabelEncoder

import spacy
# from tqdm import tq

from src.utils import *

## 1) Data

In [3]:
df_mil=pd.read_csv('data/1k_Parts/BAYER_mat_group_flags_20241206_1k‑verified_20241210.csv')
df_mil.head(3)
df_mil['cleaned_noun'] = df_mil['noun'].apply(clean_category_text)

#### Training data with the matched values

In [6]:
matched_nouns=pd.read_pickle('data/results/matched_values.pkl')

In [7]:
matched_nouns.head(120)

Unnamed: 0,cleaned_noun,category_noun
0,cylinder,"Valves, Actuator, Fittings"
1,cylinder,"Valves, Actuator, Fittings"
2,cylinder,"Valves, Actuator, Fittings"
3,cylinder,"Valves, Actuator, Fittings"
4,cylinder,"Valves, Actuator, Fittings"
...,...,...
2630,pressure gauge,"Gas, water and sewage installation"
2633,pressure safety valve,"Gas, water and sewage installation"
2634,repair kit,Equipment OEM Spare Parts
2636,ring,Fasteners


In [8]:
matched_nouns.shape

(274752, 2)

In [9]:
non_matched_df= pd.read_pickle('data/results/non_matched_nouns.pkl')

## 2) Models

#### 2.1) Naive Bayes model

In [10]:
# filtered_df['cleaned_noun'] = filtered_df['cleaned_noun'].str.lower()
X = matched_nouns['cleaned_noun']  # Features (cleaned nouns)
y = matched_nouns['category_noun']  # Labels (categories)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [11]:
model_nb = make_pipeline(TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95), MultinomialNB())
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)
print(classification_report(y_test, y_pred_nb))

                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.99      0.98      0.99      6834
                Equipment OEM Spare Parts       0.98      0.94      0.96       765
                     Facility consumables       1.00      0.97      0.98      1677
                                Fasteners       0.99      1.00      0.99     19434
       Gas, water and sewage installation       0.99      0.98      0.99      3893
                         Piping Materials       1.00      0.99      0.99      3033
               Valves, Actuator, Fittings       0.99      0.99      0.99      5577

                                 accuracy                           0.99     41213
                                macro avg       0.99      0.98      0.98     41213
                             weighted avg       0.99      0.99      0.99     41213



In [12]:
# For the 1k Adams parts
new_data = df_mil['cleaned_noun'].tolist()
probabilities = model_nb.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_nb.classes_)
probabilities_df = probabilities_df.round(2)
df_mil['max_probability'] = probabilities_df.max(axis=1)
df_mil['predicted_category'] = probabilities_df.idxmax(axis=1)

# Performance metrics
df_mil['qa_label'] = df_mil['qa_label'].astype(str)  # or 'int' depending on your data type
df_mil['predicted_category'] = df_mil['predicted_category'].astype(str)  # or 'int'
accuracy = accuracy_score(df_mil['qa_label'], df_mil['predicted_category'])
report = classification_report(df_mil['qa_label'], df_mil['predicted_category'])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.76

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.89      0.77      0.83       195
                Equipment OEM Spare Parts       1.00      0.07      0.14       121
                     Facility consumables       0.63      0.63      0.63        27
                                Fasteners       0.69      0.99      0.81       335
       Gas, water and sewage installation       0.65      0.75      0.70        97
                         Piping Materials       0.95      0.81      0.87        67
               Valves, Actuator, Fittings       0.88      0.78      0.83       158

                                 accuracy                           0.76      1000
                                macro avg       0.81      0.69      0.69      1000
                             weighted avg       0.81      0.76      0.72      1000



In [15]:
# Apply it to the unmatched values
# loaded_model = joblib.load('models/text_classification_model_full_catalog.joblib')
new_data = non_matched_df['cleaned_noun'].tolist()  # Replace with your actual new data
probabilities = model_nb.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_nb.classes_)
probabilities_df = probabilities_df.round(2)
probabilities_df['max_probability'] = probabilities_df.max(axis=1)
probabilities_df['predicted_category'] = probabilities_df.idxmax(axis=1)
probabilities_df['cleaned_noun'] = new_data

In [16]:
probabilities_df.max_probability.describe()  # Very low values for the maximum probability

count    108067.000000
mean          0.469675
std           0.096240
min           0.170000
25%           0.470000
50%           0.470000
75%           0.470000
max           0.980000
Name: max_probability, dtype: float64

In [18]:
probabilities_df   # Again biased to the biggest category

Unnamed: 0,"Electrical Installation Materials, device",Equipment OEM Spare Parts,Facility consumables,Fasteners,"Gas, water and sewage installation",Piping Materials,"Valves, Actuator, Fittings",max_probability,predicted_category,cleaned_noun
0,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,höinrodd
1,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,スüßbürz
2,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,スüßbürz
3,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,スüßbema
4,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,белрт
...,...,...,...,...,...,...,...,...,...,...
108062,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,carrilera
108063,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,tapas
108064,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,columna
108065,0.16,0.02,0.04,0.47,0.09,0.07,0.13,0.47,Fasteners,rollo


#### 2.2) Support Vector Machine

In [19]:
model_svm = make_pipeline(TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95),LinearSVC())
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.97      1.00      0.98      6834
                Equipment OEM Spare Parts       0.99      0.99      0.99       765
                     Facility consumables       1.00      0.98      0.99      1677
                                Fasteners       1.00      1.00      1.00     19434
       Gas, water and sewage installation       1.00      0.99      0.99      3893
                         Piping Materials       1.00      1.00      1.00      3033
               Valves, Actuator, Fittings       1.00      0.99      1.00      5577

                                 accuracy                           0.99     41213
                                macro avg       0.99      0.99      0.99     41213
                             weighted avg       0.99      0.99      0.99     41213



In [20]:
# For the 1k parts
new_data = df_mil['cleaned_noun'].tolist()
probabilities = model_svm.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_svm.classes_)
probabilities_df = probabilities_df.round(2)
df_mil['max_probability'] = probabilities_df.max(axis=1)
df_mil['predicted_category'] = probabilities_df.idxmax(axis=1)

df_mil['qa_label'] = df_mil['qa_label'].astype(str)  # or 'int' depending on your data type
df_mil['predicted_category'] = df_mil['predicted_category'].astype(str)  # or 'int'
accuracy = accuracy_score(df_mil['qa_label'], df_mil['predicted_category'])
report = classification_report(df_mil['qa_label'], df_mil['predicted_category'])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")

AttributeError: This 'Pipeline' has no attribute 'predict_proba'

In [21]:
# For the unmatched values
new_data = non_matched_df['cleaned_noun'].tolist()  # Replace with your actual new data
probabilities = model_svm.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_svm.classes_)
probabilities_df = probabilities_df.round(2)
probabilities_df['max_probability'] = probabilities_df.max(axis=1)
probabilities_df['predicted_category'] = probabilities_df.idxmax(axis=1)
probabilities_df['cleaned_noun'] = new_data

AttributeError: This 'Pipeline' has no attribute 'predict_proba'

#### 2.3) Random Forest

In [22]:
model_rf = make_pipeline(TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95),
                         RandomForestClassifier(n_estimators=100, random_state=42))
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.97      0.99      0.98      6834
                Equipment OEM Spare Parts       0.98      0.98      0.98       765
                     Facility consumables       1.00      0.97      0.98      1677
                                Fasteners       1.00      1.00      1.00     19434
       Gas, water and sewage installation       1.00      0.99      0.99      3893
                         Piping Materials       1.00      1.00      1.00      3033
               Valves, Actuator, Fittings       1.00      0.99      1.00      5577

                                 accuracy                           0.99     41213
                                macro avg       0.99      0.99      0.99     41213
                             weighted avg       0.99      0.99      0.99     41213



In [23]:
# 1k parts
new_data = df_mil['cleaned_noun'].tolist()
probabilities = model_rf.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_rf.classes_)
probabilities_df = probabilities_df.round(2)
df_mil['max_probability'] = probabilities_df.max(axis=1)
df_mil['predicted_category'] = probabilities_df.idxmax(axis=1)

df_mil['qa_label'] = df_mil['qa_label'].astype(str)  # or 'int' depending on your data type
df_mil['predicted_category'] = df_mil['predicted_category'].astype(str)  # or 'int'
accuracy = accuracy_score(df_mil['qa_label'], df_mil['predicted_category'])
report = classification_report(df_mil['qa_label'], df_mil['predicted_category'])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.76

Classification Report:
                                           precision    recall  f1-score   support

Electrical Installation Materials, device       0.66      0.89      0.76       195
                Equipment OEM Spare Parts       1.00      0.07      0.12       121
                     Facility consumables       0.64      0.67      0.65        27
                                Fasteners       0.78      0.93      0.85       335
       Gas, water and sewage installation       0.68      0.72      0.70        97
                         Piping Materials       0.96      0.81      0.88        67
               Valves, Actuator, Fittings       0.87      0.78      0.82       158

                                 accuracy                           0.76      1000
                                macro avg       0.80      0.70      0.68      1000
                             weighted avg       0.80      0.76      0.72      1000



In [25]:
# For the unmatched values
new_data = non_matched_df['cleaned_noun'].tolist()  # Replace with your actual new data
probabilities = model_rf.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_rf.classes_)
probabilities_df = probabilities_df.round(2)
probabilities_df['max_probability'] = probabilities_df.max(axis=1)
probabilities_df['predicted_category'] = probabilities_df.idxmax(axis=1)
probabilities_df['cleaned_noun'] = new_data
probabilities_df

In [28]:
probabilities_df    # Again same probabilities

Unnamed: 0,"Electrical Installation Materials, device",Equipment OEM Spare Parts,Facility consumables,Fasteners,"Gas, water and sewage installation",Piping Materials,"Valves, Actuator, Fittings",max_probability,predicted_category,cleaned_noun
0,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",höinrodd
1,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",スüßbürz
2,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",スüßbürz
3,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",スüßbema
4,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",белрт
...,...,...,...,...,...,...,...,...,...,...
108062,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",carrilera
108063,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",tapas
108064,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",columna
108065,0.39,0.02,0.09,0.2,0.14,0.03,0.14,0.39,"Electrical Installation Materials, device",rollo


#### 2.4) XGBoost model

In [35]:
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Define the model pipeline
model_xgb = make_pipeline(
    TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.95),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
)

# Train the model
model_xgb.fit(X_train, y_train_enc)

# Predict and evaluate
y_pred_xgb = model_xgb.predict(X_test)
print(classification_report(y_test_enc, y_pred_xgb, target_names=encoder.classes_))

Parameters: { "use_label_encoder" } are not used.



                                           precision    recall  f1-score   support

Electrical Installation Materials, device       1.00      0.80      0.89      6834
                Equipment OEM Spare Parts       0.98      0.98      0.98       765
                     Facility consumables       1.00      0.87      0.93      1677
                                Fasteners       0.90      1.00      0.94     19434
       Gas, water and sewage installation       1.00      0.92      0.96      3893
                         Piping Materials       1.00      0.98      0.99      3033
               Valves, Actuator, Fittings       0.99      0.94      0.97      5577

                                 accuracy                           0.94     41213
                                macro avg       0.98      0.93      0.95     41213
                             weighted avg       0.95      0.94      0.94     41213



In [36]:
new_data = df_mil['cleaned_noun'].tolist()
probabilities = model_xgb.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=model_xgb.classes_)
probabilities_df = probabilities_df.round(2)
df_mil['max_probability'] = probabilities_df.max(axis=1)
df_mil['predicted_category'] = probabilities_df.idxmax(axis=1)

In [37]:
df_mil['qa_label'] = df_mil['qa_label'].astype(str)  # or 'int' depending on your data type
df_mil['predicted_category'] = df_mil['predicted_category'].astype(str)  # or 'int'
accuracy = accuracy_score(df_mil['qa_label'], df_mil['predicted_category'])
report = classification_report(df_mil['qa_label'], df_mil['predicted_category'])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Accuracy: 0.00

Classification Report:
                                           precision    recall  f1-score   support

                                        0       0.00      0.00      0.00       0.0
                                        1       0.00      0.00      0.00       0.0
                                        2       0.00      0.00      0.00       0.0
                                        3       0.00      0.00      0.00       0.0
                                        4       0.00      0.00      0.00       0.0
                                        5       0.00      0.00      0.00       0.0
                                        6       0.00      0.00      0.00       0.0
Electrical Installation Materials, device       0.00      0.00      0.00     195.0
                Equipment OEM Spare Parts       0.00      0.00      0.00     121.0
                     Facility consumables       0.00      0.00      0.00      27.0
                                Fasteners      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=100)

# Encode labels
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
y_test_enc = encoder.transform(y_test)

# Model
model_nn = Sequential()
model_nn.add(Dense(128, activation='relu', input_dim=100))
model_nn.add(Dropout(0.5))
model_nn.add(Dense(64, activation='relu'))
model_nn.add(Dropout(0.5))
model_nn.add(Dense(len(encoder.classes_), activation='softmax'))

model_nn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_nn.fit(X_train_seq, y_train_enc, epochs=10, batch_size=32, validation_data=(X_test_seq, y_test_enc))

# Evaluation
y_pred_nn = model_nn.predict(X_test_seq).argmax(axis=1)
print(classification_report(y_test_enc, y_pred_nn, target_names=encoder.classes_))


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7299/7299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - accuracy: 0.4459 - loss: 4.6745 - val_accuracy: 0.5064 - val_loss: 1.3965
Epoch 2/10
[1m7299/7299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.5059 - loss: 1.3913 - val_accuracy: 0.5182 - val_loss: 1.3543
Epoch 3/10
[1m7299/7299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.5141 - loss: 1.3576 - val_accuracy: 0.5315 - val_loss: 1.3411
Epoch 4/10
[1m7299/7299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.5223 - loss: 1.3416 - val_accuracy: 0.5310 - val_loss: 1.3318
Epoch 5/10
[1m7299/7299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.5259 - loss: 1.3349 - val_accuracy: 0.5315 - val_loss: 1.3264
Epoch 6/10
[1m7299/7299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 4ms/step - accuracy: 0.5261 - loss: 1.3335 - val_accuracy: 0.5315 - val_loss: 1.3249
Epoch 7/10
[1m7299/7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
df_mil=pd.read_csv('data/1k_Parts/BAYER_mat_group_flags_20241206_1k‑verified_20241210.csv')

In [24]:
df_mil.head(3)

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,mfr_match_group,mfr_match_label,script_decision,script_helper,mfr_helper,noun_helper,mfr noun result match,qa_group,qa_label,review
0,61959169,SPARROW_PROCESSING,,00886,,Agitator disc Polyurethan,Agitator disc Polyurethan; ; Agitator disc Pol...,Agitator disc Polyurethan ( 2 Nos.); Machnine ...,,,...,,,Piping Materials,False,,False,Null result present,22104400,"Gas, water and sewage installation",agitator? Stirring maybe? A kind of turbine?
1,61999316,SPARROW_PROCESSING,,BCH16HF07330A5C,BCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C,Canesten/Servo MotorBCH16HF07330A5C; BCH16HF07...,Canesten Bottel line,,,...,,,"Gas, water and sewage installation",False,,False,Null result present,27140000,"Electrical Installation Materials, device",bad extraction
2,61768659,SPARROW_PROCESSING,,BREITNER-ABFÜLLANLAGE,,Motor Antriebsband,Motor Antriebsband; BREITNER-ABFÜLLANLAGE;,,,,...,,,"Gas, water and sewage installation",False,,False,Null result present,23070100,Fasteners,bad extraction


In [25]:
# Cleaning function to standardize the category text format
def clean_category_text(text):
    cleaned_text = str(text).strip().lower()  
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text) 
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text

df_mil['cleaned_noun'] = df_mil['noun'].apply(clean_category_text)

In [None]:
loaded_model = joblib.load('models/text_classification_model_full_catalog.joblib')
new_data = df_mil['cleaned_noun'].tolist()
probabilities = loaded_model.predict_proba(new_data)
probabilities_df = pd.DataFrame(probabilities, columns=loaded_model.classes_)
probabilities_df = probabilities_df.round(2)
df_mil['max_probability'] = probabilities_df.max(axis=1)
df_mil['predicted_category'] = probabilities_df.idxmax(axis=1)

In [None]:
df_mil.qa_label.value_counts()

qa_label
Fasteners                                    335
Electrical Installation Materials, device    195
Valves, Actuator, Fittings                   158
Equipment OEM Spare Parts                    121
Gas, water and sewage installation            97
Piping Materials                              67
Facility consumables                          27
Name: count, dtype: int64

In [None]:
df_mil[df_mil['qa_label']=='Equipment OEM Spare Parts']

Unnamed: 0,Product Id,Quality Status,Manufacturer Name,Manufacturer PID,model,Short Description,Long Description,Additional description,score,Product family,...,script_helper,mfr_helper,noun_helper,mfr noun result match,qa_group,qa_label,review,cleaned_noun,max_probability,predicted_category
7,101039312,SPARROW_PROCESSING,"ATS sortimat USA, LLC",662539,,"RAIL, GUIDE, RAIL,",,,,,...,True,True,,Null result present,99501340,Equipment OEM Spare Parts,bad string match,rail,0.96,Fasteners
14,61869560,SPARROW_PROCESSING,Tectrion,,,ausl.Becherbodenzentrierung,ausl.Becherbodenzentrierung; TECTRION; Benennu...,Benennung: Becherbodenzentrierung; Hersteller:...,,,...,False,False,,Null result present,99501340,Equipment OEM Spare Parts,cup centering device? Part of a chemical device?,becherbodenzentrierung,0.35,Fasteners
15,61782339,SPARROW_PROCESSING,Kilian,213813,,ANILLO OPRESOR;;213813,ANILLO OPRESOR;;213813; KILIAN_213813; Anillo ...,Anillo opresor para disco,,,...,False,,False,Null result present,99501340,Equipment OEM Spare Parts,Engine device - piston clamp,anillo,0.98,Fasteners
45,61900833,SPARROW_IN_PROGRESS,,120014,120014,"KIT:MOTOR SEAL,120014","KIT:MOTOR SEAL,120014; ; KIT: MOTOR SEAL TYPE,...","KIT: MOTOR SEAL TYPE, 120014 MFG P/N; WHERE AP...",6,,...,False,,False,Null result present,99501340,Equipment OEM Spare Parts,Motor?,kitmotor seal,0.95,Fasteners
46,62056024,SPARROW_PROCESSING,,,,ANF hydraulic operated motor of SDV,ANF hydraulic operated motor of SDV; ;,,,,...,False,,False,Null result present,99501340,Equipment OEM Spare Parts,Motor?,motor,1.00,"Valves, Actuator, Fittings"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,61861741,SPARROW_PROCESSING,Breitner Abfüllanlagen,129318,,Wellenverlängerung V2A 129318,Wellenverlängerung V2A 129318; BREI...,Benennung: Wellenverlängerung; Werkstoff: V2A;...,,,...,True,True,,Null result present,99501340,Equipment OEM Spare Parts,,wellenverlängerung,0.35,Fasteners
975,62038732,SPARROW_PROCESSING,Amixon,10000126,LT24,Widerstandsthermometer LT24 Pt100 170mm,Widerstandsthermometer LT24 Pt100 170mm; AMIXO...,Benennung: Widerstandsthermometer; Typ: LT24; ...,,,...,True,True,,Null result present,99501340,Equipment OEM Spare Parts,,widerstandsthermometer,0.95,"Gas, water and sewage installation"
977,62048164,SPARROW_PROCESSING,Pester,115820,,Pester Wrapping machine/washer/373999,Pester Wrapping machine/washer/373999; 115820;,,,,...,True,True,,Null result present,99501340,Equipment OEM Spare Parts,,wrapping machinewasher,0.35,Fasteners
978,62048030,SPARROW_PROCESSING,amixon GmbH,,,Zahnkranz Pos.77,Zahnkranz Pos.77; AMIX...,Benennung: Zahnkranz; für: Getriebe KS 6000.3-...,,,...,True,True,,Null result present,99501340,Equipment OEM Spare Parts,,zahnkranz,0.35,Fasteners


In [None]:
df_mil['qa_label'] = df_mil['qa_label'].astype(str)  # or 'int' depending on your data type
df_mil['predicted_category'] = df_mil['predicted_category'].astype(str)  # or 'int'
accuracy = accuracy_score(df_mil['qa_label'], df_mil['predicted_category'])
report = classification_report(df_mil['qa_label'], df_mil['predicted_category'])
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)