In [1]:
import pandas as pd
from tqdm import tqdm
import re

In [2]:
data = pd.read_csv('/kaggle/input/project1/train.csv')
data.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
data = data.dropna()
data["text"] = data["title"] + data["abstract"]



def NLP_cleaning(text):
    text_corpus = []
    i=0
    for sent in tqdm(text, desc='Cleaning'):
        # print(i, end =" ")
        i+=1
        sent = re.sub('<[^>]*>', '', sent)
        sent = re.sub('[^a-zA-z0-9]', ' ', sent)
        sent = sent.lower()
        text_corpus.append(sent)

    return text_corpus


text = data.text.values.tolist()
text_corpus = NLP_cleaning(text)
data['text'] = text_corpus
data['title'] = NLP_cleaning(data.title.values.tolist())
data['author'] = NLP_cleaning(data.author.values.tolist())



from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['label_number'] = label_encoder.fit_transform(data['label'])


val_df = pd.read_csv('/kaggle/input/project1/val.csv')
val_df.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
val_df = val_df.dropna()
val_df['label_number'] = label_encoder.transform(val_df['label'])
val_df["text"] = val_df["title"] + val_df["abstract"]
val_df['title'] = NLP_cleaning(val_df.title.values.tolist())
val_df['author'] = NLP_cleaning(val_df.author.values.tolist())
val_df['abstract'] = NLP_cleaning(val_df.abstract.values.tolist())
val_df['text'] = NLP_cleaning(val_df.text.values.tolist())



!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')




Cleaning: 100%|██████████| 40332/40332 [00:03<00:00, 11064.06it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 99990.05it/s] 
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 79382.17it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 102994.05it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 78561.04it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 12354.93it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 11839.91it/s]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
import nltk

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [5]:
import pandas as pd
import spacy


df = data

# Load spaCy model with word vectors (you might need to download this model)
nlp = spacy.load("en_core_web_md")

# Function for synonym replacement
def replace_synonyms(text):
    doc = nlp(text)
    augmented_text = [token.text if token.text == token.lemma_ else token.lemma_ for token in doc]
    return ' '.join(augmented_text)

# Count the occurrences of each class in the 'label' feature
class_counts = df['label'].value_counts().to_dict()

# Create a new DataFrame for augmented data
augmented_data = {'augmented_text': [], 'label': []}

# Apply synonym replacement and store in the new DataFrame
for index, row in df.iterrows():
    augmented_text = replace_synonyms(row['text']) if class_counts[row['label']] < 50 else row['text']
    augmented_data['augmented_text'].append(augmented_text)
    augmented_data['label'].append(row['label'])

# Create the new DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Display the augmented DataFrame
print(augmented_df)


                                          augmented_text  \
0      measurement of the z gamma    b jet cross sect...   
1      modelling systemic price cojumps with hawkes f...   
2      encoding large information structures in linea...   
3      hyperbolic polygonal billiards close to 1 dime...   
4      analysis of aster datum for map bauxite rich p...   
...                                                  ...   
40327  an ontology based approach for curriculum mapp...   
40328  recruiting project manager   a comparative ana...   
40329  optimal stopping for l\ evy processes and affi...   
40330  against tachyophobiawe examine the possible ex...   
40331  studies on some aspects of the fundamental the...   

                                    label  
0                                 Physics  
1                    Quantitative Finance  
2                        Machine Learning  
3              Dynamics/Dynamical Systems  
4                               Sociology  
...            

In [6]:
augmented_df

Unnamed: 0,augmented_text,label
0,measurement of the z gamma b jet cross sect...,Physics
1,modelling systemic price cojumps with hawkes f...,Quantitative Finance
2,encoding large information structures in linea...,Machine Learning
3,hyperbolic polygonal billiards close to 1 dime...,Dynamics/Dynamical Systems
4,analysis of aster datum for map bauxite rich p...,Sociology
...,...,...
40327,an ontology based approach for curriculum mapp...,Computer Engineering
40328,recruiting project manager a comparative ana...,Sociology
40329,optimal stopping for l\ evy processes and affi...,Statistics and Probability
40330,against tachyophobiawe examine the possible ex...,Physics


In [7]:
augmented_df.columns = ["text","label"]

In [8]:
new_df = data[["text", "label"]].copy()

In [9]:
merged_df = pd.concat([new_df, augmented_df], ignore_index=True)


In [10]:
data = merged_df

In [11]:
data

Unnamed: 0,text,label
0,measurement of the z gamma b jet cross sect...,Physics
1,modelling systemic price cojumps with hawkes f...,Quantitative Finance
2,encoding large information structures in linea...,Machine Learning
3,hyperbolic polygonal billiards close to 1 dime...,Dynamics/Dynamical Systems
4,analysis of aster data for mapping bauxite ric...,Sociology
...,...,...
80659,an ontology based approach for curriculum mapp...,Computer Engineering
80660,recruiting project manager a comparative ana...,Sociology
80661,optimal stopping for l\ evy processes and affi...,Statistics and Probability
80662,against tachyophobiawe examine the possible ex...,Physics


In [12]:
label_encoder = LabelEncoder()

# Fit and transform the 'label' column
data['label_number'] = label_encoder.fit_transform(data['label'])

data

Unnamed: 0,text,label,label_number
0,measurement of the z gamma b jet cross sect...,Physics,95
1,modelling systemic price cojumps with hawkes f...,Quantitative Finance,105
2,encoding large information structures in linea...,Machine Learning,67
3,hyperbolic polygonal billiards close to 1 dime...,Dynamics/Dynamical Systems,41
4,analysis of aster data for mapping bauxite ric...,Sociology,114
...,...,...,...
80659,an ontology based approach for curriculum mapp...,Computer Engineering,25
80660,recruiting project manager a comparative ana...,Sociology,114
80661,optimal stopping for l\ evy processes and affi...,Statistics and Probability,118
80662,against tachyophobiawe examine the possible ex...,Physics,95


In [13]:
X_train = data['text']
X_train = X_train.to_list()
Y_train = data['label_number']
Y_train = Y_train.to_list()

In [14]:
X_test = val_df['text'].to_list()
Y_test = val_df['label_number'].to_list()

In [15]:
train_embeddings = model.encode(X_train)
test_embeddings = model.encode(X_test)

Batches:   0%|          | 0/2521 [00:00<?, ?it/s]

Batches:   0%|          | 0/271 [00:00<?, ?it/s]

In [17]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

# Create BaggingClassifier with SVM as the base estimator
bagging_svm = BaggingClassifier(base_estimator=SVC(C=1.5, kernel='poly'), n_estimators=10, random_state=42)

# Fit the bagging classifier on training data
bagging_svm.fit(train_embeddings, Y_train)

# Predict on test data
y_pred_bagging = bagging_svm.predict(test_embeddings)

# Evaluate the bagging classifier
print("Accuracy (Bagging): ", accuracy_score(Y_test, y_pred_bagging))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred_bagging))




NameError: name 'accuracy_score' is not defined

In [18]:
print("hi")

Accuracy (Bagging):  0.7129972247918593
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.69      0.69      0.69        99
           1       0.82      0.79      0.80       126
           2       0.62      0.68      0.65       130
           3       0.92      1.00      0.96        12
           4       0.66      0.58      0.62       118
           5       0.31      0.13      0.19        30
           6       0.56      0.55      0.55       104
           7       1.00      0.44      0.62         9
           8       0.56      0.69      0.62       557
           9       1.00      0.27      0.43        11
          10       0.69      0.69      0.69       225
          11       0.00      0.00      0.00         3
          12       0.63      0.93      0.75       189
          13       0.68      0.31      0.43        42
          14       0.00      0.00      0.00         3
          15       0.83      0.56      0.67  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
