In [1]:
import numpy as np
import pandas as pd
import transformers
import torch
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch import nn
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('/content/train.csv')
data.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
data = data.dropna()
data["text"] = data["title"] + data["abstract"]
data

Unnamed: 0,abstract,author,title,label,text
0,the production of b jets in association with a...,CMS Collaboration,Measurement of the Z/gamma* + b-jet cross sect...,Physics,Measurement of the Z/gamma* + b-jet cross sect...
1,instabilities in the price dynamics of a large...,"Giacomo Bormetti, Lucio Maria Calcagnile, Mich...",Modelling systemic price cojumps with Hawkes f...,Quantitative Finance,Modelling systemic price cojumps with Hawkes f...
2,large information sizes in samples and feature...,"David Banh, Alan Huang",Encoding large information structures in linea...,Machine Learning,Encoding large information structures in linea...
3,we consider polygonal billiards with collision...,"Gianluigi Del Magno, Jo\~ao Lopes Dias, Pedro ...",Hyperbolic polygonal billiards close to 1-dime...,Dynamics/Dynamical Systems,Hyperbolic polygonal billiards close to 1-dime...
4,Bauxite deposits of Jharkhand in India are res...,"['E.N. Dhanamjaya Rao', 'A.T. Jeyaseelan', 'K....",analysis of aster data for mapping bauxite ric...,Sociology,analysis of aster data for mapping bauxite ric...
...,...,...,...,...,...
41534,programs offered by academic institutions in h...,"['Alex Ferworn', 'Muthana Zouri']",an ontology-based approach for curriculum mapp...,Computer Engineering,an ontology-based approach for curriculum mapp...
41535,this research addresses the competencies organ...,"['Sabik Khan', 'Marcus Ho', 'Kamrul Ahsan']",recruiting project managers: a comparative ana...,Sociology,recruiting project managers: a comparative ana...
41536,this paper studies an optimal stopping problem...,Diana Dorobantu (LSProba),Optimal stopping for L\'evy processes and affi...,Statistics and Probability,Optimal stopping for L\'evy processes and affi...
41537,we examine the possible extension of the param...,"John Ellis, Joel Giedt, Oleg Lebedev, Keith Ol...",Against Tachyophobia,Physics,Against Tachyophobiawe examine the possible ex...


In [99]:
def NLP_cleaning(text):
    text_corpus = []
    i=0
    for sent in tqdm(text, desc='Cleaning'):
        # print(i, end =" ")
        i+=1
        sent = re.sub('<[^>]*>', '', sent)
        sent = re.sub('[^a-zA-z0-9]', ' ', sent)
        sent = sent.lower()
        text_corpus.append(sent)

    return text_corpus

In [5]:
text = data.text.values.tolist()
text_corpus = NLP_cleaning(text)
data['text'] = text_corpus
data['title'] = NLP_cleaning(data.title.values.tolist())
data['author'] = NLP_cleaning(data.author.values.tolist())

Cleaning: 100%|██████████| 40332/40332 [00:01<00:00, 20856.11it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 182919.88it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 136383.20it/s]


In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['label_number'] = label_encoder.fit_transform(data['label'])
data

Unnamed: 0,abstract,author,title,label,text,label_number
0,the production of b jets in association with a...,cms collaboration,measurement of the z gamma b jet cross sect...,Physics,measurement of the z gamma b jet cross sect...,95
1,instabilities in the price dynamics of a large...,giacomo bormetti lucio maria calcagnile mich...,modelling systemic price cojumps with hawkes f...,Quantitative Finance,modelling systemic price cojumps with hawkes f...,105
2,large information sizes in samples and feature...,david banh alan huang,encoding large information structures in linea...,Machine Learning,encoding large information structures in linea...,67
3,we consider polygonal billiards with collision...,gianluigi del magno jo\ ao lopes dias pedro ...,hyperbolic polygonal billiards close to 1 dime...,Dynamics/Dynamical Systems,hyperbolic polygonal billiards close to 1 dime...,41
4,Bauxite deposits of Jharkhand in India are res...,[ e n dhanamjaya rao a t jeyaseelan k ...,analysis of aster data for mapping bauxite ric...,Sociology,analysis of aster data for mapping bauxite ric...,114
...,...,...,...,...,...,...
41534,programs offered by academic institutions in h...,[ alex ferworn muthana zouri ],an ontology based approach for curriculum mapp...,Computer Engineering,an ontology based approach for curriculum mapp...,25
41535,this research addresses the competencies organ...,[ sabik khan marcus ho kamrul ahsan ],recruiting project managers a comparative ana...,Sociology,recruiting project managers a comparative ana...,114
41536,this paper studies an optimal stopping problem...,diana dorobantu lsproba,optimal stopping for l\ evy processes and affi...,Statistics and Probability,optimal stopping for l\ evy processes and affi...,118
41537,we examine the possible extension of the param...,john ellis joel giedt oleg lebedev keith ol...,against tachyophobia,Physics,against tachyophobiawe examine the possible ex...,95


In [7]:
val_df = pd.read_csv('/content/val.csv')
val_df.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
val_df = val_df.dropna()
val_df['label_number'] = label_encoder.transform(val_df['label'])
val_df["text"] = val_df["title"] + val_df["abstract"]
val_df['title'] = NLP_cleaning(val_df.title.values.tolist())
val_df['author'] = NLP_cleaning(val_df.author.values.tolist())
val_df['abstract'] = NLP_cleaning(val_df.abstract.values.tolist())
val_df['text'] = NLP_cleaning(val_df.text.values.tolist())


Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 189120.37it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 131319.23it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 19616.33it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 19575.34it/s]


In [8]:
X=val_df[['text']].copy()
y=val_df[['label_number']].copy()

In [9]:
val_text, test_text, val_labels, test_labels = train_test_split(X,y,random_state=2018,test_size=0.4,shuffle=True)

In [10]:
train_text = data[["abstract","author","title","text"]].copy()

In [11]:
!pip install -q sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [12]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
from sklearn.svm import SVC
svm_classifier = SVC()

In [14]:
abc = data['text']
abc = abc.to_list()

In [15]:
embeddings = model.encode(abc)

In [16]:
abco = data['label_number']
abco = abco.to_list()

In [18]:
svm_classifier.fit(embeddings,abco)

In [19]:
abc = test_text['text']
abc = abc.to_list()

In [20]:
embeddings = model.encode(abc)

In [21]:
y_pred = svm_classifier.predict(embeddings)

In [22]:
from sklearn.metrics import classification_report

In [23]:
print(classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.79      0.76        43
           1       0.85      0.80      0.82        50
           2       0.67      0.74      0.70        53
           3       1.00      1.00      1.00         7
           4       0.62      0.45      0.53        44
           5       0.14      0.11      0.12         9
           6       0.54      0.44      0.48        34
           7       0.80      0.67      0.73         6
           8       0.55      0.74      0.63       220
           9       0.50      0.50      0.50         4
          10       0.68      0.64      0.66        95
          12       0.70      0.92      0.80        78
          13       0.60      0.25      0.35        12
          14       0.00      0.00      0.00         1
          15       0.67      0.50      0.57         4
          16       0.00      0.00      0.00         1
          17       0.83      0.62      0.71        16
          19       0.00    

# Proper Experiment

In [26]:
X_train = data['text']
X_train = X_train.to_list()
Y_train = data['label_number']
Y_train = Y_train.to_list()

In [28]:
X_test = val_df['text'].to_list()
Y_test = val_df['label_number'].to_list()

In [29]:
train_embeddings = model.encode(X_train)
test_embeddings = model.encode(X_test)

## SVM

In [30]:
from sklearn.svm import SVC
svm = SVC()

svm_classifier.fit(train_embeddings,Y_train)
y_pred = svm_classifier.predict(test_embeddings)

In [32]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ",accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.7119565217391305
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.72      0.73      0.72        99
           1       0.80      0.80      0.80       126
           2       0.64      0.70      0.67       130
           3       0.86      1.00      0.92        12
           4       0.62      0.52      0.56       118
           5       0.43      0.20      0.27        30
           6       0.61      0.50      0.55       104
           7       0.71      0.56      0.63         9
           8       0.52      0.71      0.60       557
           9       0.60      0.27      0.37        11
          10       0.68      0.68      0.68       225
          11       0.00      0.00      0.00         3
          12       0.70      0.92      0.80       189
          13       0.70      0.33      0.45        42
          14       0.00      0.00      0.00         3
          15       0.83      0.56      0.67         9
 

In [36]:
len(train_embeddings[0])

384

## Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_embeddings, Y_train)

y_pred = rf.predict(test_embeddings)

In [39]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ",accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.5712303422756707
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.55      0.63      0.58        99
           1       0.77      0.70      0.73       126
           2       0.58      0.48      0.53       130
           3       0.88      0.58      0.70        12
           4       0.53      0.15      0.24       118
           5       1.00      0.03      0.06        30
           6       0.46      0.28      0.35       104
           7       0.00      0.00      0.00         9
           8       0.42      0.83      0.56       557
           9       0.00      0.00      0.00        11
          10       0.72      0.45      0.55       225
          11       0.00      0.00      0.00         3
          12       0.46      0.86      0.60       189
          13       0.00      0.00      0.00        42
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         9
 

## Neural Network

In [40]:
import tensorflow as tf

input_dim = 384
num_classes = 123

model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               98560     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 123)               15867     
                                                                 
Total params: 147323 (575.48 KB)
Trainable params: 147323 (575.48 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [47]:
type(train_embeddings[0])

numpy.ndarray

In [53]:
Y_train = np.array(Y_train)

train_accuracy = []
train_loss = []

epochs = 10
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    history = model.fit(train_embeddings, Y_train, batch_size=16)
    train_loss.append(history.history['loss'][0])
    train_acc = history.history['accuracy'][0]
    train_accuracy.append(train_acc)
    print(f"Training Loss: {train_loss[-1]}, Training Accuracy: {train_acc}")


Epoch 1/10
Training Loss: 0.843936562538147, Training Accuracy: 0.7285777926445007

Epoch 2/10
Training Loss: 0.7938205599784851, Training Accuracy: 0.7401815056800842

Epoch 3/10
Training Loss: 0.7596914172172546, Training Accuracy: 0.7476445436477661

Epoch 4/10
Training Loss: 0.7342419624328613, Training Accuracy: 0.753099262714386

Epoch 5/10
Training Loss: 0.6971562504768372, Training Accuracy: 0.7638351917266846

Epoch 6/10
Training Loss: 0.673864483833313, Training Accuracy: 0.7706535458564758

Epoch 7/10
Training Loss: 0.6444944739341736, Training Accuracy: 0.7776455283164978

Epoch 8/10
Training Loss: 0.6290903091430664, Training Accuracy: 0.7822076678276062

Epoch 9/10
Training Loss: 0.6052244305610657, Training Accuracy: 0.7875880002975464

Epoch 10/10
Training Loss: 0.5860788822174072, Training Accuracy: 0.7973817586898804


In [59]:
y_pred_prob = model.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)



In [60]:
len(y_pred)

8648

In [61]:
y_pred

array([ 51, 107,  68, ...,  84,  27,  64])

In [62]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.67368177613321
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.69      0.51      0.58        99
           1       0.76      0.75      0.75       126
           2       0.61      0.73      0.66       130
           3       0.69      0.92      0.79        12
           4       0.57      0.58      0.57       118
           5       0.27      0.27      0.27        30
           6       0.45      0.41      0.43       104
           7       1.00      0.22      0.36         9
           8       0.52      0.73      0.61       557
           9       0.14      0.09      0.11        11
          10       0.64      0.70      0.67       225
          11       0.00      0.00      0.00         3
          12       0.73      0.86      0.79       189
          13       0.62      0.36      0.45        42
          14       0.00      0.00      0.00         3
          15       0.67      0.22      0.33         9
   

In [63]:
## Model 2
input_dim = 384
num_classes = 123

# Define the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               49280     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 256)               33024     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 512)               131584    
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_6 (Dense)             (None, 123)              

In [64]:
history = model.fit(train_embeddings, Y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
y_pred_prob = model.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)



In [66]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.6711378353376504
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.59      0.73      0.65        99
           1       0.74      0.78      0.76       126
           2       0.58      0.70      0.63       130
           3       0.79      0.92      0.85        12
           4       0.59      0.48      0.53       118
           5       0.36      0.27      0.31        30
           6       0.53      0.30      0.38       104
           7       0.45      0.56      0.50         9
           8       0.53      0.70      0.60       557
           9       0.00      0.00      0.00        11
          10       0.69      0.63      0.66       225
          11       0.00      0.00      0.00         3
          12       0.76      0.81      0.78       189
          13       0.61      0.48      0.53        42
          14       0.00      0.00      0.00         3
          15       0.38      0.33      0.35         9
 

In [67]:
## Model 3
model3 = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model3.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 512)               197120    
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_8 (Dense)             (None, 256)               131328    
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_9 (Dense)             (None, 128)               32896     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 123)              

In [68]:
history = model3.fit(train_embeddings, Y_train, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [69]:
y_pred_prob = model3.predict(test_embeddings)
y_pred = np.argmax(y_pred_prob, axis=1)

from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.6667437557816837
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.67      0.62      0.64        99
           1       0.70      0.82      0.75       126
           2       0.71      0.59      0.65       130
           3       0.65      0.92      0.76        12
           4       0.50      0.53      0.51       118
           5       0.33      0.27      0.30        30
           6       0.47      0.45      0.46       104
           7       0.62      0.56      0.59         9
           8       0.55      0.55      0.55       557
           9       0.67      0.18      0.29        11
          10       0.63      0.65      0.64       225
          11       0.00      0.00      0.00         3
          12       0.78      0.82      0.80       189
          13       0.67      0.33      0.44        42
          14       0.00      0.00      0.00         3
          15       0.50      0.56      0.53         9
 

## XGBOOST

In [70]:
import xgboost as xgb

dtrain = xgb.DMatrix(train_embeddings, label=Y_train)
dtest = xgb.DMatrix(test_embeddings, label=Y_test)

# Define XGBoost parameters
param = {
    'objective': 'multi:softmax',
    'num_class': 123,
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'merror'
}

# Train the XGBoost model
num_round = 100
bst = xgb.train(param, dtrain, num_round)

# Predict the labels of the test set
xg_y_pred = bst.predict(dtest)

In [71]:
xg_y_pred

array([ 51., 107.,  68., ...,  84.,  27.,  77.], dtype=float32)

In [75]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(xg_y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, xg_y_pred))

Accuracy:  0.5883441258094357
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.55      0.47      0.51        99
           1       0.65      0.67      0.66       126
           2       0.59      0.55      0.57       130
           3       0.82      0.75      0.78        12
           4       0.44      0.39      0.41       118
           5       0.20      0.13      0.16        30
           6       0.36      0.35      0.35       104
           7       0.60      0.33      0.43         9
           8       0.47      0.60      0.52       557
           9       0.25      0.09      0.13        11
          10       0.56      0.49      0.52       225
          11       0.50      0.33      0.40         3
          12       0.63      0.79      0.70       189
          13       0.47      0.19      0.27        42
          14       0.00      0.00      0.00         3
          15       0.38      0.33      0.35         9
 

In [102]:
## final model
from sklearn.svm import SVC
svm = SVC()

svm.fit(train_embeddings,Y_train)
y_pred = svm.predict(test_embeddings)

In [103]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy: ", accuracy_score(y_pred,Y_test))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred))

Accuracy:  0.7119565217391305
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.72      0.73      0.72        99
           1       0.80      0.80      0.80       126
           2       0.64      0.70      0.67       130
           3       0.86      1.00      0.92        12
           4       0.62      0.52      0.56       118
           5       0.43      0.20      0.27        30
           6       0.61      0.50      0.55       104
           7       0.71      0.56      0.63         9
           8       0.52      0.71      0.60       557
           9       0.60      0.27      0.37        11
          10       0.68      0.68      0.68       225
          11       0.00      0.00      0.00         3
          12       0.70      0.92      0.80       189
          13       0.70      0.33      0.45        42
          14       0.00      0.00      0.00         3
          15       0.83      0.56      0.67         9
 

In [104]:
final_model = svm

In [100]:
test_df = pd.read_csv("/content/forcI_test_no_labels.csv")
test_df

Unnamed: 0,abstract,author,doi,url,publication month,publication year,title,publisher,data_index
0,the paper introduces the notion of a locally c...,Bartosz Zielinski,,,,,Locally coalgebra-Galois extensions,,22901
1,a bipartite state which is secretly chosen fro...,"Yangjia Li, Runyao Duan, and Mingsheng Ying",10.1103/PhysRevA.82.032339,,,,Local Unambiguous Discrimination with Remainin...,,47248
2,this short paper addresses the open problems l...,F\'elix Bou and Carles Noguera,,,,,Solution of some problems in the arithmetical ...,,27601
3,we find the symmetry algebras of cosets which ...,"Dushyant Kumar, Menika Sharma",10.1007/JHEP08(2019)179,,,,Symmetry Algebras of Stringy Cosets,,36511
4,we present the results of the analysis of the ...,"E. Jimenez-Bailon (Universita Roma Tre), M. Sa...",10.1051/0004-6361:20065566,,,,XMM-Newton view of the double-peaked Fe K-alph...,,15483
...,...,...,...,...,...,...,...,...,...
8898,background studies show that the u.s. foreign-...,"['James W. Vaupel', 'Danan Gu', 'Matthew E. Du...",10.1371/journal.pone.0037177,,5.0,2012.0,survival differences among native-born and for...,PLoS ONE,43679
8899,we prove uniqueness for the vortex-wave system...,"Christophe Lacave (ICJ), Evelyne Miot (LJLL)",,,,,Uniqueness for the vortex-wave system when the...,,41425
8900,we demonstrate sensitivity of the mn 3d valenc...,"K. W. Edmonds, G. van der Laan, N. R. S. Farle...",10.1103/PhysRevB.77.113205,,,,Strain dependence of the Mn anisotropy in ferr...,,46483
8901,the coexisting regime of spin density wave (sd...,"Q. Q. Ge, Z. R. Ye, M. Xu, Y. Zhang, J. Jiang,...",10.1103/PhysRevX.3.011020,,,,Anisotropic but nodeless superconducting gap i...,"Phys. Rev. X 3, 011020 (2013)",53461


In [101]:
test_df.drop(['doi','url','publication month', 'publication year','publisher', 'author'], axis =1 , inplace = True)
test_df.fillna('', inplace=True)
test_df['text'] = test_df['title'] + test_df['abstract']
test_df['text'] = NLP_cleaning(test_df.text.values.tolist())
test_df = test_df[['text', 'data_index']]
test_df

Cleaning: 100%|██████████| 8903/8903 [00:00<00:00, 21409.98it/s]


Unnamed: 0,text,data_index
0,locally coalgebra galois extensionsthe paper i...,22901
1,local unambiguous discrimination with remainin...,47248
2,solution of some problems in the arithmetical ...,27601
3,symmetry algebras of stringy cosetswe find the...,36511
4,xmm newton view of the double peaked fe k alph...,15483
...,...,...
8898,survival differences among native born and for...,43679
8899,uniqueness for the vortex wave system when the...,41425
8900,strain dependence of the mn anisotropy in ferr...,46483
8901,anisotropic but nodeless superconducting gap i...,53461


In [105]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [108]:
input = test_df['text'].tolist()
input = model.encode(input)

In [109]:
preds = final_model.predict(input)

In [110]:
preds

array([  0, 107, 120, ...,  68,  73,  35])

In [111]:
preds = label_encoder.inverse_transform(preds)

In [112]:
submission = pd.DataFrame()
submission['data_index'] = test_df['data_index']
submission['target'] = preds
submission

Unnamed: 0,data_index,target
0,22901,Algebra
1,47248,Quantum Physics
2,27601,Theory/Algorithms
3,36511,Quantum Physics
4,15483,Astrophysics and Astronomy
...,...,...
8898,43679,Bioinformatics
8899,41425,Dynamics/Dynamical Systems
8900,46483,Materials Science and Engineering
8901,53461,Mechanics of Materials


In [114]:
import zipfile
submission.to_csv('predictions.csv')
zipfile.ZipFile('predictions.zip', mode='w').write("predictions.csv")

In [115]:
tt = pd.read_csv("/content/forcI_test_no_labels.csv")
tt

Unnamed: 0,abstract,author,doi,url,publication month,publication year,title,publisher,data_index
0,the paper introduces the notion of a locally c...,Bartosz Zielinski,,,,,Locally coalgebra-Galois extensions,,22901
1,a bipartite state which is secretly chosen fro...,"Yangjia Li, Runyao Duan, and Mingsheng Ying",10.1103/PhysRevA.82.032339,,,,Local Unambiguous Discrimination with Remainin...,,47248
2,this short paper addresses the open problems l...,F\'elix Bou and Carles Noguera,,,,,Solution of some problems in the arithmetical ...,,27601
3,we find the symmetry algebras of cosets which ...,"Dushyant Kumar, Menika Sharma",10.1007/JHEP08(2019)179,,,,Symmetry Algebras of Stringy Cosets,,36511
4,we present the results of the analysis of the ...,"E. Jimenez-Bailon (Universita Roma Tre), M. Sa...",10.1051/0004-6361:20065566,,,,XMM-Newton view of the double-peaked Fe K-alph...,,15483
...,...,...,...,...,...,...,...,...,...
8898,background studies show that the u.s. foreign-...,"['James W. Vaupel', 'Danan Gu', 'Matthew E. Du...",10.1371/journal.pone.0037177,,5.0,2012.0,survival differences among native-born and for...,PLoS ONE,43679
8899,we prove uniqueness for the vortex-wave system...,"Christophe Lacave (ICJ), Evelyne Miot (LJLL)",,,,,Uniqueness for the vortex-wave system when the...,,41425
8900,we demonstrate sensitivity of the mn 3d valenc...,"K. W. Edmonds, G. van der Laan, N. R. S. Farle...",10.1103/PhysRevB.77.113205,,,,Strain dependence of the Mn anisotropy in ferr...,,46483
8901,the coexisting regime of spin density wave (sd...,"Q. Q. Ge, Z. R. Ye, M. Xu, Y. Zhang, J. Jiang,...",10.1103/PhysRevX.3.011020,,,,Anisotropic but nodeless superconducting gap i...,"Phys. Rev. X 3, 011020 (2013)",53461


In [116]:
tt.drop(['doi','url','publication month', 'publication year','publisher', 'author'], axis =1 , inplace = True)
rows_with_null = tt[tt.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,abstract,title,data_index
58,,realized vs apparent reduction in enemies of t...,11400
132,,open access papers: their growth over time and...,40326
141,,general-purpose question-answering with macaw,19412
208,,inhibition between invasives: a newly introduc...,7509
214,,impact of nb(v) substitution on the structure ...,18447
...,...,...,...
8739,,emergency response information systems: emergi...,2519
8781,,cloud computing,17405
8818,,an improved decision support system for detect...,48096
8856,,sustainable forest management in sardegna: fro...,52861


In [119]:
li = rows_with_null['title']
li = li.to_list()
li

['realized vs apparent reduction in enemies of the european starling',
 'open access papers: their growth over time and from different countries, and their citations',
 'general-purpose question-answering with macaw',
 'inhibition between invasives: a newly introduced predator moderates the impacts of a previously established invasive predator',
 'impact of nb(v) substitution on the structure and optical and photoelectrochemical properties of the cu5(ta1−xnbx)11o30 solid solution',
 'occupancy anticipation for efficient exploration and navigation',
 'strengthening the implementation of town planning laws and property management: panacea for reducing incidence of building collapse in nigeria',
 'n‐decyloleat [mak value documentation in german language, 2019]',
 'a single ectomycorrhizal fungal species can enable a pinus invasion',
 'cryptic seedling herbivory by nocturnal introduced generalists impacts survival, performance of native and exotic plants',
 '1,4-dioxane. mak value document