In [1]:
import pandas as pd
excel_files = ["batteryManagement.csv", "accelerometer.csv", "gyroscope.csv","multivibrators.csv","networkdevices.csv","motors.csv","microcontroller.csv"]
dfs = []
for file in excel_files:
    df = pd.read_csv(file)
    df['category']=file.split(".")[0]
    dfs.append(df)
merged_df = pd.concat(dfs, ignore_index=True)

merged_df

Unnamed: 0,product_name,manufacturer,cleaned_datasheet_text,category
0,ADBMS6948,Analog Devices,preliminary data measures up to battery cells ...,batteryManagement
1,ADBMS6830,Analog Devices,data multicell battery monitor typical applica...,batteryManagement
2,ADRF8800,Analog Devices,data low power ghz wireless system on chip low...,batteryManagement
3,ADBMS6816,Analog Devices,typical application circuit data multicell bat...,batteryManagement
4,LTC2959,Analog Devices,power battery gas gauge descriptionthe is an p...,batteryManagement
...,...,...,...,...
900,ADUC836,Analog Devices,dual adcs with embedded kb flash mcu featuresh...,microcontroller
901,ADUC814,Analog Devices,small adc with embedded flash analog ksps adc ...,microcontroller
902,ADUC816,Analog Devices,microconverter adcs with embedded flash mcu ad...,microcontroller
903,ADUC824,Analog Devices,microconverter adcs with embedded flash mcu fe...,microcontroller


In [2]:
merged_df.dropna(inplace=True)

In [7]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load SciBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')


In [30]:
def get_scibert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=256, padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Extract embeddings from the [CLS] token
    return embeddings

In [31]:
merged_df['embeddings'] = merged_df['cleaned_datasheet_text'].apply(get_scibert_embeddings)

In [25]:
import numpy as np
X = np.array(merged_df['embeddings'].tolist())  # Convert list of arrays to a 2D NumPy array
X_flattened = np.array([embedding.flatten() for embedding in X])
y = merged_df['category']  # Assuming 'category' contains the labels


In [32]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [36]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
X_train, X_test, y_train, y_test = train_test_split(X_flattened, y_encoded, test_size=0.1, random_state=42)


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred)
print(report)

Random Forest Accuracy: 0.8461538461538461
              precision    recall  f1-score   support

           0       0.75      0.88      0.81        24
           1       0.88      0.94      0.91        16
           2       1.00      0.50      0.67         4
           3       1.00      0.67      0.80         3
           4       1.00      0.50      0.67        10
           5       1.00      0.67      0.80         3
           6       0.86      0.97      0.91        31

    accuracy                           0.85        91
   macro avg       0.93      0.73      0.79        91
weighted avg       0.86      0.85      0.84        91



In [38]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred)
print(report)

SVM Accuracy: 0.9340659340659341
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        24
           1       0.94      1.00      0.97        16
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         3
           4       0.73      0.80      0.76        10
           5       1.00      1.00      1.00         3
           6       0.94      0.94      0.94        31

    accuracy                           0.93        91
   macro avg       0.94      0.95      0.95        91
weighted avg       0.94      0.93      0.94        91

