In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch
# local dependencies 
from contrastive import Contrastive_loss
from utils import plot_tsne,Net_embed

In [2]:


# Define the categories we want to classify
categories = ['sci.space', 'sci.med','sci.electronics','comp.os.ms-windows.misc'
              ,'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware']

# Fetch the training dataset
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)

# Fetch the testing dataset
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

In [3]:
X_train,X_dump,y_train,y_dump =train_test_split(
    newsgroups_train.data,newsgroups_train.target,train_size=0.25,random_state=0)


X_test,X_dump,y_test,y_dump = train_test_split(
    newsgroups_test.data,newsgroups_test.target,train_size=0.25,random_state=0)

In [12]:


# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Create a logistic regression classifier
clf = LogisticRegression()

# Train the model
clf.fit(X_train_tfidf, y_train)


# Make predictions on the test data
predicted = clf.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
report = classification_report(y_test, predicted, target_names=newsgroups_test.target_names)
print("Classification Report:\n", report)

Accuracy: 0.70
Classification Report:
                           precision    recall  f1-score   support

 comp.os.ms-windows.misc       0.73      0.75      0.74        99
comp.sys.ibm.pc.hardware       0.67      0.49      0.57        99
   comp.sys.mac.hardware       0.74      0.71      0.73       104
         sci.electronics       0.53      0.70      0.60        97
                 sci.med       0.71      0.63      0.67        94
               sci.space       0.84      0.91      0.87        95

                accuracy                           0.70       588
               macro avg       0.70      0.70      0.70       588
            weighted avg       0.70      0.70      0.70       588



In [5]:

model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
embedding_mat_train = np.zeros((len(X_train),768))
embedding_mat_test = np.zeros((len(X_test),768))

for i in tqdm(range(len(X_train))):

    embeddings = model.encode(X_train[i])
    embedding_mat_train[i,:] = embeddings
    
for i in tqdm(range(len(X_test))):

    embeddings = model.encode(X_test[i])
    embedding_mat_test[i,:] = embeddings


100%|████████████████████████████████████████████████████████████████████████████████| 884/884 [34:07<00:00,  2.32s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 588/588 [13:38<00:00,  1.39s/it]


In [15]:
clf = KNeighborsClassifier(n_neighbors=3)
clf = LogisticRegression(max_iter=10000)
# Train the model
clf.fit(embedding_mat_train, y_train)
# Make predictions on the test data
predicted = clf.predict(embedding_mat_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
report = classification_report(y_test, predicted, target_names=newsgroups_test.target_names)
print("Classification Report:\n", report)

Accuracy: 0.80
Classification Report:
                           precision    recall  f1-score   support

 comp.os.ms-windows.misc       0.81      0.81      0.81        99
comp.sys.ibm.pc.hardware       0.66      0.64      0.65        99
   comp.sys.mac.hardware       0.71      0.74      0.72       104
         sci.electronics       0.74      0.75      0.74        97
                 sci.med       0.95      0.96      0.95        94
               sci.space       0.99      0.95      0.97        95

                accuracy                           0.80       588
               macro avg       0.81      0.81      0.81       588
            weighted avg       0.81      0.80      0.80       588



In [13]:
# train using deep contrastive learnig  
net = Net_embed(input_dim=embedding_mat_train.shape[1],hidden_dim=256,out_dim=64,drop_prob=0.4)

N_epoch = 15
batches_per_epoch = 750
N_samp_batch = 36
N_data = embedding_mat_train.shape[0]
optimizer = torch.optim.Adam(net.parameters(),lr=0.001)

net.train()
embedding_train_torch = torch.from_numpy(embedding_mat_train).float()
label_train_torch = torch.from_numpy(y_train)

criterion = Contrastive_loss(margin=0.2)
for n in range(N_epoch):
    batch_loss = 0
    for i in range(batches_per_epoch) :
        optimizer.zero_grad()

        # select random batch
        data_samp = np.random.choice(N_data,N_samp_batch,replace=False)
        
        data_b = embedding_train_torch[data_samp,:]
        labels_b = label_train_torch[data_samp]
        pred = net(data_b)
        loss = criterion(pred,labels_b)
        loss.backward()
        optimizer.step()
        batch_loss += float(loss)
    
    print("Epoch: {}, loss:{:.3f}".format(n,batch_loss))

 epoch: 0, Loss:11.589
 epoch: 1, Loss:2.051
 epoch: 2, Loss:1.142
 epoch: 3, Loss:0.832
 epoch: 4, Loss:0.577
 epoch: 5, Loss:0.499
 epoch: 6, Loss:0.409
 epoch: 7, Loss:0.366
 epoch: 8, Loss:0.417
 epoch: 9, Loss:0.340
 epoch: 10, Loss:0.225
 epoch: 11, Loss:0.301
 epoch: 12, Loss:0.212
 epoch: 13, Loss:0.168
 epoch: 14, Loss:0.292


In [16]:
# calculate test data embedding and Tsne
net.eval()
embedding_out_test = net(torch.from_numpy(embedding_mat_test).float()).detach().numpy()

embedding_out_train = net(torch.from_numpy(embedding_mat_train).float()).detach().numpy()

clf = LogisticRegression(max_iter=10000)
# Train the model
clf.fit(pred_train, y_train)
# Make predictions on the test data
predicted = clf.predict(pred_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predicted)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
report = classification_report(y_test, predicted, target_names=newsgroups_test.target_names)
print("Classification Report:\n", report)

Accuracy: 0.86
Classification Report:
                           precision    recall  f1-score   support

 comp.os.ms-windows.misc       0.88      0.85      0.86        99
comp.sys.ibm.pc.hardware       0.76      0.82      0.79        99
   comp.sys.mac.hardware       0.76      0.80      0.78       104
         sci.electronics       0.85      0.81      0.83        97
                 sci.med       0.98      0.96      0.97        94
               sci.space       0.98      0.95      0.96        95

                accuracy                           0.86       588
               macro avg       0.87      0.86      0.87       588
            weighted avg       0.87      0.86      0.86       588

