In [1]:
import numpy as np
import pandas as pd
import transformers
import torch
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch import nn
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('/kaggle/input/forcdtaaa/train.csv')
data.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
data = data.dropna()
data["text"] = data["title"] + data["abstract"]
data

Unnamed: 0,abstract,author,title,label,text
0,the production of b jets in association with a...,CMS Collaboration,Measurement of the Z/gamma* + b-jet cross sect...,Physics,Measurement of the Z/gamma* + b-jet cross sect...
1,instabilities in the price dynamics of a large...,"Giacomo Bormetti, Lucio Maria Calcagnile, Mich...",Modelling systemic price cojumps with Hawkes f...,Quantitative Finance,Modelling systemic price cojumps with Hawkes f...
2,large information sizes in samples and feature...,"David Banh, Alan Huang",Encoding large information structures in linea...,Machine Learning,Encoding large information structures in linea...
3,we consider polygonal billiards with collision...,"Gianluigi Del Magno, Jo\~ao Lopes Dias, Pedro ...",Hyperbolic polygonal billiards close to 1-dime...,Dynamics/Dynamical Systems,Hyperbolic polygonal billiards close to 1-dime...
4,Bauxite deposits of Jharkhand in India are res...,"['E.N. Dhanamjaya Rao', 'A.T. Jeyaseelan', 'K....",analysis of aster data for mapping bauxite ric...,Sociology,analysis of aster data for mapping bauxite ric...
...,...,...,...,...,...
41534,programs offered by academic institutions in h...,"['Alex Ferworn', 'Muthana Zouri']",an ontology-based approach for curriculum mapp...,Computer Engineering,an ontology-based approach for curriculum mapp...
41535,this research addresses the competencies organ...,"['Sabik Khan', 'Marcus Ho', 'Kamrul Ahsan']",recruiting project managers: a comparative ana...,Sociology,recruiting project managers: a comparative ana...
41536,this paper studies an optimal stopping problem...,Diana Dorobantu (LSProba),Optimal stopping for L\'evy processes and affi...,Statistics and Probability,Optimal stopping for L\'evy processes and affi...
41537,we examine the possible extension of the param...,"John Ellis, Joel Giedt, Oleg Lebedev, Keith Ol...",Against Tachyophobia,Physics,Against Tachyophobiawe examine the possible ex...


In [3]:
def NLP_cleaning(text):
    text_corpus = []
    i=0
    for sent in tqdm(text, desc='Cleaning'):
        # print(i, end =" ")
        i+=1
        sent = re.sub('<[^>]*>', '', sent)
        sent = re.sub('[^a-zA-z0-9]', ' ', sent)
        sent = sent.lower()
        text_corpus.append(sent)

    return text_corpus

In [4]:
text = data.text.values.tolist()
text_corpus = NLP_cleaning(text)
data['text'] = text_corpus
data['title'] = NLP_cleaning(data.title.values.tolist())
data['author'] = NLP_cleaning(data.author.values.tolist())

Cleaning: 100%|██████████| 40332/40332 [00:03<00:00, 11839.25it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 105971.39it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 81643.26it/s]


In [5]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['label_number'] = label_encoder.fit_transform(data['label'])
data

Unnamed: 0,abstract,author,title,label,text,label_number
0,the production of b jets in association with a...,cms collaboration,measurement of the z gamma b jet cross sect...,Physics,measurement of the z gamma b jet cross sect...,95
1,instabilities in the price dynamics of a large...,giacomo bormetti lucio maria calcagnile mich...,modelling systemic price cojumps with hawkes f...,Quantitative Finance,modelling systemic price cojumps with hawkes f...,105
2,large information sizes in samples and feature...,david banh alan huang,encoding large information structures in linea...,Machine Learning,encoding large information structures in linea...,67
3,we consider polygonal billiards with collision...,gianluigi del magno jo\ ao lopes dias pedro ...,hyperbolic polygonal billiards close to 1 dime...,Dynamics/Dynamical Systems,hyperbolic polygonal billiards close to 1 dime...,41
4,Bauxite deposits of Jharkhand in India are res...,[ e n dhanamjaya rao a t jeyaseelan k ...,analysis of aster data for mapping bauxite ric...,Sociology,analysis of aster data for mapping bauxite ric...,114
...,...,...,...,...,...,...
41534,programs offered by academic institutions in h...,[ alex ferworn muthana zouri ],an ontology based approach for curriculum mapp...,Computer Engineering,an ontology based approach for curriculum mapp...,25
41535,this research addresses the competencies organ...,[ sabik khan marcus ho kamrul ahsan ],recruiting project managers a comparative ana...,Sociology,recruiting project managers a comparative ana...,114
41536,this paper studies an optimal stopping problem...,diana dorobantu lsproba,optimal stopping for l\ evy processes and affi...,Statistics and Probability,optimal stopping for l\ evy processes and affi...,118
41537,we examine the possible extension of the param...,john ellis joel giedt oleg lebedev keith ol...,against tachyophobia,Physics,against tachyophobiawe examine the possible ex...,95


In [6]:
val_df = pd.read_csv('/kaggle/input/forcdtaaa/val.csv')
val_df.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
val_df = val_df.dropna()
val_df['label_number'] = label_encoder.transform(val_df['label'])
val_df["text"] = val_df["title"] + val_df["abstract"]
val_df['title'] = NLP_cleaning(val_df.title.values.tolist())
val_df['author'] = NLP_cleaning(val_df.author.values.tolist())
val_df['abstract'] = NLP_cleaning(val_df.abstract.values.tolist())
val_df['text'] = NLP_cleaning(val_df.text.values.tolist())

Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 97780.97it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 82819.99it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 12323.08it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 11664.16it/s]


In [7]:
!pip install -q sentence-transformers

In [8]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Proper Experiment

In [9]:
X_train = data['text']
X_train = X_train.to_list()
Y_train = data['label_number']
Y_train = Y_train.to_list()

In [10]:
X_test = val_df['text'].to_list()
Y_test = val_df['label_number'].to_list()

In [None]:
train_embeddings = model.encode(X_train)
test_embeddings = model.encode(X_test)

Batches:   0%|          | 0/1261 [00:00<?, ?it/s]

In [21]:
len(train_embeddings)

40332

## SVM

In [15]:
from sklearn.svm import SVC
svm = SVC()

svm.fit(train_embeddings,Y_train)
y_pred = svm.predict(test_embeddings)

In [16]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ",accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.7119565217391305
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.72      0.73      0.72        99
           1       0.80      0.80      0.80       126
           2       0.64      0.70      0.67       130
           3       0.86      1.00      0.92        12
           4       0.62      0.52      0.56       118
           5       0.43      0.20      0.27        30
           6       0.61      0.50      0.55       104
           7       0.71      0.56      0.63         9
           8       0.52      0.71      0.60       557
           9       0.60      0.27      0.37        11
          10       0.68      0.68      0.68       225
          11       0.00      0.00      0.00         3
          12       0.70      0.92      0.80       189
          13       0.70      0.33      0.45        42
          14       0.00      0.00      0.00         3
          15       0.83      0.56      0.67         9
 

## Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_embeddings, Y_train)

y_pred = rf.predict(test_embeddings)

In [39]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ",accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.5712303422756707

-----------------------------------------------





              precision    recall  f1-score   support



           0       0.55      0.63      0.58        99

           1       0.77      0.70      0.73       126

           2       0.58      0.48      0.53       130

           3       0.88      0.58      0.70        12

           4       0.53      0.15      0.24       118

           5       1.00      0.03      0.06        30

           6       0.46      0.28      0.35       104

           7       0.00      0.00      0.00         9

           8       0.42      0.83      0.56       557

           9       0.00      0.00      0.00        11

          10       0.72      0.45      0.55       225

          11       0.00      0.00      0.00         3

          12       0.46      0.86      0.60       189

          13       0.00      0.00      0.00        42

          14       0.00      0.00      0.00         3

          15       0.00      0.00 

## Neural Network

## Model 1

In [25]:
import tensorflow as tf

input_dim = 384
num_classes = 123

model1 = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model1.compile(optimizer='adam',
              loss= 'sparse_categorical_crossentropy',
             metrics=['accuracy'])

model1.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 256)               98560     
                                                                 
 dropout_16 (Dropout)        (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 512)               131584    
                                                                 
 dropout_17 (Dropout)        (None, 512)               0         
                                                                 
 dense_22 (Dense)            (None, 256)               131328    
                                                                 
 dropout_18 (Dropout)        (None, 256)               0         
                                                                 
 dense_23 (Dense)            (None, 128)              

In [26]:
Y_train = np.array(Y_train)

train_accuracy = []
train_loss = []

epochs = 10
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    history = model1.fit(train_embeddings, Y_train, batch_size=16)
    train_loss.append(history.history['loss'][0])
    train_acc = history.history['accuracy'][0]
    train_accuracy.append(train_acc)
    print(f"Training Loss: {train_loss[-1]}, Training Accuracy: {train_acc}")


Epoch 1/10


I0000 00:00:1708419706.347542     210 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Training Loss: 1.9812408685684204, Training Accuracy: 0.46756917238235474

Epoch 2/10
Training Loss: 1.413461685180664, Training Accuracy: 0.5915154218673706

Epoch 3/10
Training Loss: 1.2594672441482544, Training Accuracy: 0.6250371932983398

Epoch 4/10
Training Loss: 1.152707576751709, Training Accuracy: 0.6481453776359558

Epoch 5/10
Training Loss: 1.0768930912017822, Training Accuracy: 0.6685758233070374

Epoch 6/10
Training Loss: 1.0138516426086426, Training Accuracy: 0.6847168207168579

Epoch 7/10
Training Loss: 0.9642525911331177, Training Accuracy: 0.6950560212135315

Epoch 8/10
Training Loss: 0.913070559501648, Training Accuracy: 0.7100813388824463

Epoch 9/10
Training Loss: 0.8776075839996338, Training Accuracy: 0.7174699902534485

Epoch 10/10
Training Loss: 0.8465852737426758, Training Accuracy: 0.7267430424690247


In [27]:
y_pred_prob = model1.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)



In [28]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.6653561517113784
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.67      0.65      0.66        99
           1       0.73      0.82      0.77       126
           2       0.66      0.62      0.63       130
           3       0.79      0.92      0.85        12
           4       0.60      0.44      0.51       118
           5       0.35      0.23      0.28        30
           6       0.50      0.31      0.38       104
           7       0.22      0.22      0.22         9
           8       0.55      0.38      0.45       557
           9       0.67      0.36      0.47        11
          10       0.59      0.70      0.64       225
          11       0.00      0.00      0.00         3
          12       0.71      0.87      0.78       189
          13       0.64      0.17      0.26        42
          14       0.00      0.00      0.00         3
          15       0.20      0.22      0.21         9
 

## Model 2

In [29]:
input_dim = 384
num_classes = 123

model2 = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model2.compile(optimizer='adam',
              loss= 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model2.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_25 (Dense)            (None, 256)               98560     
                                                                 
 dropout_20 (Dropout)        (None, 256)               0         
                                                                 
 dense_26 (Dense)            (None, 512)               131584    
                                                                 
 dropout_21 (Dropout)        (None, 512)               0         
                                                                 
 dense_27 (Dense)            (None, 1024)              525312    
                                                                 
 dropout_22 (Dropout)        (None, 1024)              0         
                                                                 
 dense_28 (Dense)            (None, 512)              

In [30]:
history = model2.fit(train_embeddings, Y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
y_pred_prob = model2.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)



In [34]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.6598057354301573
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.59      0.64      0.61        99
           1       0.78      0.74      0.76       126
           2       0.57      0.66      0.61       130
           3       0.75      0.75      0.75        12
           4       0.51      0.41      0.45       118
           5       0.23      0.23      0.23        30
           6       0.38      0.43      0.41       104
           7       0.27      0.33      0.30         9
           8       0.58      0.44      0.50       557
           9       0.08      0.18      0.11        11
          10       0.67      0.67      0.67       225
          11       0.00      0.00      0.00         3
          12       0.75      0.84      0.79       189
          13       0.44      0.45      0.45        42
          14       0.00      0.00      0.00         3
          15       0.50      0.22      0.31         9
 

In [35]:
## Model 3
model3 = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2016, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1280, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model3.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_30 (Dense)            (None, 512)               197120    
                                                                 
 dropout_24 (Dropout)        (None, 512)               0         
                                                                 
 dense_31 (Dense)            (None, 1024)              525312    
                                                                 
 dropout_25 (Dropout)        (None, 1024)              0         
                                                                 
 dense_32 (Dense)            (None, 2016)              2066400   
                                                                 
 dropout_26 (Dropout)        (None, 2016)              0         
                                                                 
 dense_33 (Dense)            (None, 1280)             

In [36]:
history = model3.fit(train_embeddings, Y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
y_pred_prob = model3.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)

from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.6675531914893617
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.70      0.71      0.70        99
           1       0.80      0.71      0.76       126
           2       0.60      0.65      0.63       130
           3       0.85      0.92      0.88        12
           4       0.55      0.42      0.48       118
           5       0.25      0.20      0.22        30
           6       0.58      0.27      0.37       104
           7       0.25      0.11      0.15         9
           8       0.52      0.62      0.56       557
           9       0.38      0.27      0.32        11
          10       0.62      0.69      0.66       225
          11       1.00      0.33      0.50         3
          12       0.83      0.80      0.81       189
          13       0.54      0.52      0.53        42
          14       0.00      0.00      0.00         3
          15       0.50      0.44      0.47         9
 

## XGBOOST

In [70]:
import xgboost as xgb

dtrain = xgb.DMatrix(train_embeddings, label=Y_train)
dtest = xgb.DMatrix(test_embeddings, label=Y_test)

# Define XGBoost parameters
param = {
    'objective': 'multi:softmax',
    'num_class': 123,
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'merror'
}

# Train the XGBoost model
num_round = 100
bst = xgb.train(param, dtrain, num_round)

# Predict the labels of the test set
xg_y_pred = bst.predict(dtest)

In [71]:
xg_y_pred

array([ 51., 107.,  68., ...,  84.,  27.,  77.], dtype=float32)

In [75]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(xg_y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, xg_y_pred))

Accuracy:  0.5883441258094357

-----------------------------------------------





              precision    recall  f1-score   support



           0       0.55      0.47      0.51        99

           1       0.65      0.67      0.66       126

           2       0.59      0.55      0.57       130

           3       0.82      0.75      0.78        12

           4       0.44      0.39      0.41       118

           5       0.20      0.13      0.16        30

           6       0.36      0.35      0.35       104

           7       0.60      0.33      0.43         9

           8       0.47      0.60      0.52       557

           9       0.25      0.09      0.13        11

          10       0.56      0.49      0.52       225

          11       0.50      0.33      0.40         3

          12       0.63      0.79      0.70       189

          13       0.47      0.19      0.27        42

          14       0.00      0.00      0.00         3

          15       0.38      0.33 

In [38]:
## final model
from sklearn.svm import SVC
svm = SVC(C=1.5, kernel = 'poly')

svm.fit(train_embeddings,Y_train)
y_pred = svm.predict(test_embeddings)

In [39]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.7161193339500462
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.74      0.71      0.72        99
           1       0.79      0.79      0.79       126
           2       0.63      0.69      0.66       130
           3       0.92      1.00      0.96        12
           4       0.68      0.57      0.62       118
           5       0.27      0.13      0.18        30
           6       0.56      0.52      0.54       104
           7       1.00      0.44      0.62         9
           8       0.56      0.69      0.62       557
           9       1.00      0.18      0.31        11
          10       0.71      0.70      0.70       225
          11       0.00      0.00      0.00         3
          12       0.63      0.93      0.75       189
          13       0.73      0.38      0.50        42
          14       0.00      0.00      0.00         3
          15       0.75      0.67      0.71         9
 

In [14]:
final_model = svm

In [15]:
import pickle

with open('final_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

In [41]:
import tensorflow as tf

input_dim = 384
num_classes = 123
y_train_one_hot = tf.keras.utils.to_categorical(Y_train, num_classes)


def focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25):
    epsilon = 1e-8
    y_pred = tf.clip_by_value(y_pred, epsilon, 1.0 - epsilon)
    pt = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
    focal_loss = -tf.reduce_sum(alpha * tf.pow(1.0 - pt, gamma) * tf.math.log(pt))
    return focal_loss

model4 = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model4.compile(optimizer='adam',
              loss= focal_loss,
             metrics=['accuracy'])

model4.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_37 (Dense)            (None, 256)               98560     
                                                                 
 dropout_30 (Dropout)        (None, 256)               0         
                                                                 
 dense_38 (Dense)            (None, 512)               131584    
                                                                 
 dropout_31 (Dropout)        (None, 512)               0         
                                                                 
 dense_39 (Dense)            (None, 256)               131328    
                                                                 
 dropout_32 (Dropout)        (None, 256)               0         
                                                                 
 dense_40 (Dense)            (None, 128)              

In [42]:
history = model4.fit(train_embeddings, y_train_one_hot, epochs=25, batch_size=16)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [43]:
y_pred_prob = model4.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)

from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.651827012025902
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.72      0.21      0.33        99
           1       0.72      0.75      0.74       126
           2       0.58      0.65      0.62       130
           3       0.79      0.92      0.85        12
           4       0.46      0.56      0.51       118
           5       0.27      0.13      0.18        30
           6       0.42      0.26      0.32       104
           7       0.38      0.56      0.45         9
           8       0.52      0.52      0.52       557
           9       0.67      0.18      0.29        11
          10       0.61      0.71      0.65       225
          11       0.00      0.00      0.00         3
          12       0.76      0.84      0.80       189
          13       0.51      0.45      0.48        42
          14       0.00      0.00      0.00         3
          15       0.46      0.67      0.55         9
  

In [44]:
##------------------------------------------------------------------------------------------------------------------------------

In [57]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=10)
knn_classifier.fit(train_embeddings, Y_train)
y_pred = knn_classifier.predict(test_embeddings)

In [49]:
y_pred

array([ 33, 107,  10, ...,  10,  27,  77])

In [58]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.6983117483811286
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.72      0.69      0.70        99
           1       0.76      0.81      0.78       126
           2       0.65      0.66      0.65       130
           3       0.80      1.00      0.89        12
           4       0.53      0.53      0.53       118
           5       0.18      0.10      0.13        30
           6       0.53      0.48      0.50       104
           7       0.60      0.33      0.43         9
           8       0.50      0.64      0.56       557
           9       0.00      0.00      0.00        11
          10       0.64      0.73      0.68       225
          11       0.00      0.00      0.00         3
          12       0.76      0.88      0.82       189
          13       0.59      0.38      0.46        42
          14       0.00      0.00      0.00         3
          15       0.60      0.67      0.63         9
 

In [59]:
##-------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
grouped = data.groupby('label')

# Initialize an empty list to store the sampled DataFrames
sampled_dfs = []

# Iterate over each group
for _, group_df in grouped:
    # Sample up to 50 entries from each group
    sampled_df = group_df.sample(min(50, len(group_df)))
    # Append the sampled DataFrame to the list
    sampled_dfs.append(sampled_df)

# Concatenate the sampled DataFrames into a single DataFrame
sampled_df = pd.concat(sampled_dfs)

# Shuffle the DataFrame
sampled_df = sampled_df.sample(frac=1).reset_index(drop=True)
final = sampled_df
final

In [None]:
final['label'].nunique()

In [22]:
test_df = pd.read_csv("/kaggle/input/hahhasjsjsj/test.csv")
test_df

Unnamed: 0,abstract,author,doi,url,publication month,publication year,title,publisher,data_index
0,the paper introduces the notion of a locally c...,Bartosz Zielinski,,,,,Locally coalgebra-Galois extensions,,22901
1,a bipartite state which is secretly chosen fro...,"Yangjia Li, Runyao Duan, and Mingsheng Ying",10.1103/PhysRevA.82.032339,,,,Local Unambiguous Discrimination with Remainin...,,47248
2,this short paper addresses the open problems l...,F\'elix Bou and Carles Noguera,,,,,Solution of some problems in the arithmetical ...,,27601
3,we find the symmetry algebras of cosets which ...,"Dushyant Kumar, Menika Sharma",10.1007/JHEP08(2019)179,,,,Symmetry Algebras of Stringy Cosets,,36511
4,we present the results of the analysis of the ...,"E. Jimenez-Bailon (Universita Roma Tre), M. Sa...",10.1051/0004-6361:20065566,,,,XMM-Newton view of the double-peaked Fe K-alph...,,15483
...,...,...,...,...,...,...,...,...,...
8898,background studies show that the u.s. foreign-...,"['James W. Vaupel', 'Danan Gu', 'Matthew E. Du...",10.1371/journal.pone.0037177,,5.0,2012.0,survival differences among native-born and for...,PLoS ONE,43679
8899,we prove uniqueness for the vortex-wave system...,"Christophe Lacave (ICJ), Evelyne Miot (LJLL)",,,,,Uniqueness for the vortex-wave system when the...,,41425
8900,we demonstrate sensitivity of the mn 3d valenc...,"K. W. Edmonds, G. van der Laan, N. R. S. Farle...",10.1103/PhysRevB.77.113205,,,,Strain dependence of the Mn anisotropy in ferr...,,46483
8901,the coexisting regime of spin density wave (sd...,"Q. Q. Ge, Z. R. Ye, M. Xu, Y. Zhang, J. Jiang,...",10.1103/PhysRevX.3.011020,,,,Anisotropic but nodeless superconducting gap i...,"Phys. Rev. X 3, 011020 (2013)",53461


In [23]:
test_df.isnull().sum()

abstract              217
author                 27
doi                  3712
url                  8498
publication month    7499
publication year     7496
title                   0
publisher            5148
data_index              0
dtype: int64

In [24]:
test_df.drop(['doi','url','publication month', 'publication year','publisher', 'author'], axis =1 , inplace = True)
test_df.fillna('', inplace=True)
test_df['text'] = test_df['title'] + test_df['abstract']
test_df['text'] = NLP_cleaning(test_df.text.values.tolist())
test_df = test_df[['text', 'data_index']]
test_df

Cleaning: 100%|██████████| 8903/8903 [00:00<00:00, 12073.50it/s]


Unnamed: 0,text,data_index
0,locally coalgebra galois extensionsthe paper i...,22901
1,local unambiguous discrimination with remainin...,47248
2,solution of some problems in the arithmetical ...,27601
3,symmetry algebras of stringy cosetswe find the...,36511
4,xmm newton view of the double peaked fe k alph...,15483
...,...,...
8898,survival differences among native born and for...,43679
8899,uniqueness for the vortex wave system when the...,41425
8900,strain dependence of the mn anisotropy in ferr...,46483
8901,anisotropic but nodeless superconducting gap i...,53461


In [25]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [26]:
input = test_df['text'].tolist()
input = model.encode(input)

Batches:   0%|          | 0/279 [00:00<?, ?it/s]

In [27]:
preds = final_model.predict(input)

In [28]:
preds

array([  0, 107, 120, ...,  68,  73,  35])

In [29]:
preds = label_encoder.inverse_transform(preds)

In [30]:
submission = pd.DataFrame()
submission['data_index'] = test_df['data_index']
submission['target'] = preds
submission

Unnamed: 0,data_index,target
0,22901,Algebra
1,47248,Quantum Physics
2,27601,Theory/Algorithms
3,36511,Quantum Physics
4,15483,Astrophysics and Astronomy
...,...,...
8898,43679,Bioinformatics
8899,41425,Dynamics/Dynamical Systems
8900,46483,Materials Science and Engineering
8901,53461,Mechanics of Materials


In [31]:
import zipfile
submission.to_csv('predictions.csv')
zipfile.ZipFile('predictions.zip', mode='w').write("predictions.csv")