In [None]:
import numpy as np
import pandas as pd
import transformers
import torch
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch import nn
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training split and discard bibliographic columns that are not used as features.
data = pd.read_csv('/content/drive/MyDrive/SC DATASET/train.csv')
data = data.drop(columns=['doi','url','publication month', 'publication year','publisher', 'data_index'])
# Rows with any missing field are removed so the string concatenation below never sees NaN.
data = data.dropna()
# Join title and abstract with a space; without it the last word of the title
# and the first word of the abstract fuse into a single token.
data["text"] = data["title"] + " " + data["abstract"]

In [None]:
def NLP_cleaning(text):
    """Normalise a collection of raw strings for downstream embedding.

    For every input string this removes anything that looks like an HTML/XML
    tag, replaces each remaining character that is not an ASCII letter or
    digit with a single space, and lowercases the result.

    Parameters
    ----------
    text : iterable of str
        Raw strings to clean. A progress bar labelled "Cleaning" is shown
        while iterating.

    Returns
    -------
    list of str
        One cleaned string per input element, in input order. Runs of
        replaced characters are NOT collapsed, so multiple consecutive
        spaces may remain.
    """
    text_corpus = []
    for sent in tqdm(text, desc='Cleaning'):
        # Drop markup such as <i>...</i> before touching punctuation.
        sent = re.sub(r'<[^>]*>', '', sent)
        # The range must be `A-Z`; `A-z` also spans the ASCII characters
        # between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive cleaning.
        sent = re.sub(r'[^a-zA-Z0-9]', ' ', sent)
        text_corpus.append(sent.lower())

    return text_corpus

In [None]:
# Clean the combined title+abstract column first, then the metadata columns.
text = data['text'].tolist()
text_corpus = NLP_cleaning(text)
data['text'] = text_corpus

data['title'] = NLP_cleaning(data['title'].tolist())
data['author'] = NLP_cleaning(data['author'].tolist())

Cleaning: 100%|██████████| 40332/40332 [00:02<00:00, 16871.13it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 149746.98it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 113439.31it/s]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping on the training labels, then apply it.
# The fitted encoder is kept so other splits can be encoded with the same mapping.
label_encoder = LabelEncoder().fit(data['label'])
data['label_number'] = label_encoder.transform(data['label'])
data

Unnamed: 0,abstract,author,title,label,text,label_number
0,the production of b jets in association with a...,cms collaboration,measurement of the z gamma b jet cross sect...,Physics,measurement of the z gamma b jet cross sect...,95
1,instabilities in the price dynamics of a large...,giacomo bormetti lucio maria calcagnile mich...,modelling systemic price cojumps with hawkes f...,Quantitative Finance,modelling systemic price cojumps with hawkes f...,105
2,large information sizes in samples and feature...,david banh alan huang,encoding large information structures in linea...,Machine Learning,encoding large information structures in linea...,67
3,we consider polygonal billiards with collision...,gianluigi del magno jo\ ao lopes dias pedro ...,hyperbolic polygonal billiards close to 1 dime...,Dynamics/Dynamical Systems,hyperbolic polygonal billiards close to 1 dime...,41
4,Bauxite deposits of Jharkhand in India are res...,[ e n dhanamjaya rao a t jeyaseelan k ...,analysis of aster data for mapping bauxite ric...,Sociology,analysis of aster data for mapping bauxite ric...,114
...,...,...,...,...,...,...
41534,programs offered by academic institutions in h...,[ alex ferworn muthana zouri ],an ontology based approach for curriculum mapp...,Computer Engineering,an ontology based approach for curriculum mapp...,25
41535,this research addresses the competencies organ...,[ sabik khan marcus ho kamrul ahsan ],recruiting project managers a comparative ana...,Sociology,recruiting project managers a comparative ana...,114
41536,this paper studies an optimal stopping problem...,diana dorobantu lsproba,optimal stopping for l\ evy processes and affi...,Statistics and Probability,optimal stopping for l\ evy processes and affi...,118
41537,we examine the possible extension of the param...,john ellis joel giedt oleg lebedev keith ol...,against tachyophobia,Physics,against tachyophobiawe examine the possible ex...,95


In [None]:
# Load the validation split and discard the bibliographic columns that are not used as features.
val_df = pd.read_csv('/content/drive/MyDrive/SC DATASET/val.csv')
val_df = val_df.drop(columns=['doi','url','publication month', 'publication year','publisher', 'data_index'])
val_df = val_df.dropna()
# Reuse the encoder fitted on the training labels so class ids agree across splits.
val_df['label_number'] = label_encoder.transform(val_df['label'])
# Join title and abstract with a space; without it the last word of the title
# and the first word of the abstract fuse into a single token.
# NOTE: the training frame must build its `text` column the same way.
val_df["text"] = val_df["title"] + " " + val_df["abstract"]
# `text` is built from the raw columns above and cleaned last, together with the rest.
for column in ('title', 'author', 'abstract', 'text'):
    val_df[column] = NLP_cleaning(val_df[column].tolist())


Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 158799.48it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 117177.27it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 18182.03it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 15951.69it/s]


In [None]:
# Single-column frames (not Series) holding the validation features and targets.
X = val_df.loc[:, ['text']].copy()
y = val_df.loc[:, ['label_number']].copy()

In [None]:
# NOTE(review): `test_size=1` is an integer, which scikit-learn documents as an
# absolute number of test samples, so this holds out exactly one row.
# Confirm whether a fraction (e.g. 0.5) was intended.
val_text, test_text, val_labels, test_labels = train_test_split(
    X,
    y,
    test_size=1,
    shuffle=True,
    random_state=2018,
)

In [None]:
# Independent copy of the textual training columns.
train_text = data.loc[:, ["abstract", "author", "title", "text"]].copy()

In [None]:
pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the `all-MiniLM-L6-v2` sentence encoder.
# NOTE(review): none of the visible cells call this encoder, and the name
# `model` is rebound to a Keras network in the training cell further down —
# confirm whether this cell is still needed.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Plain Python lists of the cleaned training documents and their integer labels.
X_train = data['text'].tolist()
Y_train = data['label_number'].tolist()

In [None]:
# Evaluation documents and labels come from the full validation frame.
X_test, Y_test = (val_df[column].to_list() for column in ('text', 'label_number'))

In [None]:
import numpy as np
import gensim.downloader as api

# Load the pretrained Google News Word2Vec vectors (300 dimensions, matching
# the `embedding_size` default used by `encode_sentence`) via gensim's downloader API.
word2vec_model = api.load("word2vec-google-news-300")

# Encode a sentence as the average of its in-vocabulary Word2Vec embeddings
def encode_sentence(sentence, model, embedding_size=300):
    """Return a fixed-size vector for `sentence` by averaging word embeddings.

    Parameters
    ----------
    sentence : str
        Text to encode; it is tokenised by splitting on whitespace.
    model : mapping-like
        Object supporting `token in model` and `model[token]`, where the
        lookup yields a vector of length `embedding_size`.
    embedding_size : int, default 300
        Dimensionality of the vectors; used to build the all-zero fallback.

    Returns
    -------
    numpy.ndarray
        float64 array of shape `(embedding_size,)`. It is the mean of the
        vectors of the tokens found in `model`, or all zeros when the
        sentence is empty or none of its tokens is in the vocabulary.
    """
    # Only tokens the model knows contribute. Keeping zero rows for unknown
    # tokens would shrink the mean toward zero in proportion to how many
    # out-of-vocabulary words the sentence happens to contain.
    vectors = [model[token] for token in sentence.split() if token in model]
    if not vectors:
        return np.zeros(embedding_size)  # No valid tokens found
    return np.mean(np.asarray(vectors, dtype=np.float64), axis=0)

# Turn every training and evaluation document into one averaged Word2Vec vector.
X_train_encoded = list(map(lambda doc: encode_sentence(doc, word2vec_model), X_train))
X_test_encoded = list(map(lambda doc: encode_sentence(doc, word2vec_model), X_test))




In [None]:
# Aliases used by the training cells; no copy is made.
train_embeddings, test_embeddings = X_train_encoded, X_test_encoded

In [None]:
# Sanity check: print the index of every training vector with an unexpected length.
# The Word2Vec vectors are 300-dimensional (see `encode_sentence` and the
# network's `input_dim`); comparing against 100 flagged every single row.
expected_dim = 300
for i, vector in enumerate(train_embeddings):
  if len(vector) != expected_dim:
    print(i, end=",")

In [None]:
import tensorflow as tf
input_dim = 300  # length of each averaged Word2Vec vector
# NOTE(review): must be at least the number of distinct `label_number` values —
# confirm 123 against the fitted label encoder.
num_classes = 123

# Define the model architecture: three widening dense layers with dropout,
# followed by a softmax over the classes.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model; the sparse loss takes integer class ids directly.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()

# `train_embeddings` is a Python list of 1-D arrays. Keras interprets a list
# of arrays as one array per model input, so the vectors are stacked into a
# single (n_samples, input_dim) matrix before fitting.
history = model.fit(
    np.asarray(train_embeddings, dtype="float32"),
    np.array(Y_train),
    epochs=30,
    batch_size=32,
)
