In [4]:
import pandas as pd
from tqdm import tqdm
import re

In [5]:
data = pd.read_csv('/kaggle/input/kaggle/train.csv')
data.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
data = data.dropna()
data["text"] = data["title"] + data["abstract"]



def NLP_cleaning(text):
    text_corpus = []
    i=0
    for sent in tqdm(text, desc='Cleaning'):
        # print(i, end =" ")
        i+=1
        sent = re.sub('<[^>]*>', '', sent)
        sent = re.sub('[^a-zA-z0-9]', ' ', sent)
        sent = sent.lower()
        text_corpus.append(sent)

    return text_corpus


text = data.text.values.tolist()
text_corpus = NLP_cleaning(text)
data['text'] = text_corpus
data['title'] = NLP_cleaning(data.title.values.tolist())
data['author'] = NLP_cleaning(data.author.values.tolist())



from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['label_number'] = label_encoder.fit_transform(data['label'])


val_df = pd.read_csv('/kaggle/input/kaggle/val.csv')
val_df.drop(['doi','url','publication month', 'publication year','publisher', 'data_index'], axis =1 , inplace = True)
val_df = val_df.dropna()
val_df['label_number'] = label_encoder.transform(val_df['label'])
val_df["text"] = val_df["title"] + val_df["abstract"]
val_df['title'] = NLP_cleaning(val_df.title.values.tolist())
val_df['author'] = NLP_cleaning(val_df.author.values.tolist())
val_df['abstract'] = NLP_cleaning(val_df.abstract.values.tolist())
val_df['text'] = NLP_cleaning(val_df.text.values.tolist())



!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')




Cleaning: 100%|██████████| 40332/40332 [00:03<00:00, 11936.94it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 103696.38it/s]
Cleaning: 100%|██████████| 40332/40332 [00:00<00:00, 85508.26it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 110582.36it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 83774.48it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 12358.51it/s]
Cleaning: 100%|██████████| 8648/8648 [00:00<00:00, 11908.37it/s]
  return self.fget.__get__(instance, owner)()


In [6]:
X_train = data['text']
X_train = X_train.to_list()
Y_train = data['label_number']
Y_train = Y_train.to_list()

In [7]:
X_test = val_df['text'].to_list()
Y_test = val_df['label_number'].to_list()

In [8]:
train_embeddings = model.encode(X_train)
test_embeddings = model.encode(X_test)

Batches:   0%|          | 0/1261 [00:00<?, ?it/s]

Batches:   0%|          | 0/271 [00:00<?, ?it/s]

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors as per your requirement

# Train the KNN classifier
knn_classifier.fit(train_embeddings, Y_train)

# Make predictions on the test data
y_pred_knn = knn_classifier.predict(test_embeddings)

# Evaluate the performance of the KNN classifier
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(Y_test, y_pred_knn))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred_knn))


Accuracy: 0.6916049953746531
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.65      0.69      0.67        99
           1       0.76      0.79      0.78       126
           2       0.61      0.66      0.64       130
           3       0.80      1.00      0.89        12
           4       0.49      0.59      0.53       118
           5       0.24      0.27      0.25        30
           6       0.47      0.49      0.48       104
           7       0.45      0.56      0.50         9
           8       0.50      0.63      0.56       557
           9       0.25      0.09      0.13        11
          10       0.60      0.73      0.66       225
          11       0.00      0.00      0.00         3
          12       0.75      0.87      0.81       189
          13       0.47      0.36      0.41        42
          14       0.00      0.00      0.00         3
          15       0.60      0.67      0.63         9
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.metrics import classification_report, accuracy_score
import joblib
from tqdm import tqdm

# Load the SVM model using joblib
overclass = joblib.load('/kaggle/input/modelsvm/ovr_classifier.pkl')

# Assuming test_embeddings and Y_test are defined
# Evaluate the performance of the SVM classifier

# Initialize lists to store predictions
y_pred_svm = []

# Use tqdm to display progress
with tqdm(total=len(test_embeddings), desc="Evaluating SVM Classifier") as pbar:
    for sample in test_embeddings:
        # Predict the label for each sample
        y_pred_svm.append(overclass.predict([sample])[0])
        # Update progress bar
        pbar.update(1)

# Convert the list to numpy array



Evaluating SVM Classifier: 100%|██████████| 8648/8648 [13:20<00:00, 10.80it/s]


NameError: name 'np' is not defined

In [15]:
import numpy as np
y_pred_svm = np.array(y_pred_svm)

# Calculate accuracy and print classification report
print("Accuracy:", accuracy_score(Y_test, y_pred_svm))
print("-----------------------------------------------\n\n")
print(classification_report(Y_test, y_pred_svm))

Accuracy: 0.72895467160037
-----------------------------------------------


              precision    recall  f1-score   support

           0       0.75      0.70      0.72        99
           1       0.81      0.79      0.80       126
           2       0.65      0.70      0.67       130
           3       0.86      1.00      0.92        12
           4       0.67      0.57      0.61       118
           5       0.30      0.10      0.15        30
           6       0.65      0.53      0.58       104
           7       0.83      0.56      0.67         9
           8       0.58      0.66      0.62       557
           9       0.50      0.09      0.15        11
          10       0.71      0.69      0.70       225
          11       0.00      0.00      0.00         3
          12       0.77      0.92      0.84       189
          13       0.81      0.40      0.54        42
          14       0.00      0.00      0.00         3
          15       0.67      0.67      0.67         9
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=12)

# Initialize BaggingClassifier with KNN as the base estimator
bagging_classifier = BaggingClassifier(base_estimator=knn_classifier, n_estimators=10, random_state=42)

# Train the BaggingClassifier
bagging_classifier.fit(train_embeddings, Y_train)

# Make predictions on the test data
y_pred_bagging = bagging_classifier.predict(test_embeddings)

# Evaluate the performance of the BaggingClassifier with KNN
print("Bagging Classifier Performance:")
print("Accuracy:", accuracy_score(Y_test, y_pred_bagging))
print(classification_report(Y_test, y_pred_bagging))
print("-----------------------------------------------\n\n")






Bagging Classifier Performance:
Accuracy: 0.7035152636447733
              precision    recall  f1-score   support

           0       0.73      0.68      0.70        99
           1       0.79      0.80      0.80       126
           2       0.66      0.62      0.64       130
           3       0.86      1.00      0.92        12
           4       0.60      0.52      0.55       118
           5       0.40      0.07      0.11        30
           6       0.57      0.45      0.50       104
           7       0.75      0.33      0.46         9
           8       0.53      0.61      0.56       557
           9       0.00      0.00      0.00        11
          10       0.65      0.72      0.68       225
          11       1.00      0.33      0.50         3
          12       0.75      0.88      0.81       189
          13       0.65      0.36      0.46        42
          14       0.00      0.00      0.00         3
          15       0.75      0.67      0.71         9
          16       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'max_iter': [100, 500, 1000]  # Maximum number of iterations
}

# Initialize Logistic Regression classifier
logistic_regression_classifier = LogisticRegression()

# Initialize GridSearchCV
grid_search = GridSearchCV(logistic_regression_classifier, param_grid, cv=5, scoring='accuracy')

# Perform grid search to find the best parameters
grid_search.fit(train_embeddings, Y_train)

# Get the best estimator
best_logistic_regression_classifier = grid_search.best_estimator_

# Make predictions on the test data
y_pred_best_logistic_regression = best_logistic_regression_classifier.predict(test_embeddings)

# Evaluate the performance of the Logistic Regression classifier with the best parameters
print("Best Logistic Regression Classifier Performance:")
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(Y_test, y_pred_best_logistic_regression))
print(classification_report(Y_test, y_pred_best_logistic_regression))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Logistic Regression Classifier Performance:
Best Parameters: {'C': 10, 'max_iter': 500}
Accuracy: 0.6763413506012951
              precision    recall  f1-score   support

           0       0.70      0.65      0.67        99
           1       0.77      0.79      0.78       126
           2       0.63      0.66      0.65       130
           3       0.79      0.92      0.85        12
           4       0.55      0.48      0.51       118
           5       0.40      0.27      0.32        30
           6       0.45      0.38      0.41       104
           7       0.40      0.44      0.42         9
           8       0.55      0.62      0.59       557
           9       0.25      0.18      0.21        11
          10       0.64      0.65      0.64       225
          11       0.00      0.00      0.00         3
          12       0.72      0.82      0.77       189
          13       0.64      0.33      0.44        42
          14       0.00      0.00      0.00         3
          15 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.ensemble import VotingClassifier

# Assuming you already have best_logistic_regression_classifier, bagging_classifier, and overclass models

# Create a list of tuples containing (name, model) pairs
estimators = [('Logistic Regression', best_logistic_regression_classifier),
              ('Bagging Classifier', bagging_classifier),
              ('Overclass', overclass)]

# Initialize the VotingClassifier with the estimators
voting_classifier = VotingClassifier(estimators, voting='hard')  # You can use 'soft' voting if models provide predict_proba

# Train the VotingClassifier
voting_classifier.fit(train_embeddings, Y_train)

# Make predictions on the test data
y_pred_voting = voting_classifier.predict(test_embeddings)

# Evaluate the performance of the VotingClassifier
print("Voting Classifier Performance:")
print("Accuracy:", accuracy_score(Y_test, y_pred_voting))
print(classification_report(Y_test, y_pred_voting))


