In [1]:
# Required Libraries
!pip install PyPDF2 scikit-learn numpy pandas

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from PyPDF2 import PdfReader

# 1. Load PDF Files

def extract_text_from_folders(publishable_folder, non_publishable_folder):
    """Extract the text of every PDF in two folders and label it by folder.

    Args:
        publishable_folder: Directory whose PDFs are labelled 1 (publishable).
        non_publishable_folder: Directory whose PDFs are labelled 0
            (non-publishable).

    Returns:
        A ``(texts, labels)`` tuple of parallel lists: one string per PDF (the
        concatenated text of all its pages) and the matching integer label.
        Publishable papers come first; within each folder the files are
        visited in sorted filename order.
    """
    def _read_folder(folder):
        """Return the text of each PDF directly inside ``folder``, sorted by filename."""
        folder_texts = []
        # os.listdir() returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded split made from it) reproducible.
        for filename in sorted(os.listdir(folder)):
            # Accept ".PDF" as well as ".pdf".
            if not filename.lower().endswith('.pdf'):
                continue
            reader = PdfReader(os.path.join(folder, filename))
            # Treat a page that yields None (no extractable text) as empty so
            # that join() cannot raise TypeError.
            folder_texts.append(
                "".join(page.extract_text() or "" for page in reader.pages))
        return folder_texts

    texts, labels = [], []
    # Label 1 for Publishable, label 0 for Non-Publishable.
    for folder, label in ((publishable_folder, 1), (non_publishable_folder, 0)):
        folder_texts = _read_folder(folder)
        texts.extend(folder_texts)
        labels.extend([label] * len(folder_texts))

    return texts, labels

# Kaggle input locations of the two labelled corpora.
publishable_folder = '/kaggle/input/second-training-set/train2'  # Path to publishable papers dataset
non_publishable_folder = '/kaggle/input/reference/Reference/Non-Publishable'  # Path to non-publishable papers dataset
texts, labels = extract_text_from_folders(publishable_folder, non_publishable_folder)

# 2. Preprocess Data

# Convert to DataFrame for easier handling
data = pd.DataFrame({'text': texts, 'label': labels})

# Split into train and test sets.
# The two classes are far from balanced, so an unstratified 80/20 split can
# leave one side with almost no (or no) minority samples. Stratify on the
# label whenever every class has at least the two members stratification
# needs; otherwise fall back to the plain random split.
label_counts = data['label'].value_counts()
stratify_labels = (
    data['label'] if len(label_counts) > 1 and label_counts.min() >= 2 else None
)
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42,
    stratify=stratify_labels)

# 3. Feature Extraction Using TF-IDF

# Fit the vocabulary on the training split only, then reuse it for the test
# split so no test-set statistics leak into the features.
vectorizer = TfidfVectorizer(max_features=15000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4. Train SVM Classifier

# class_weight='balanced' re-weights the classes inversely to their frequency.
svm_model = SVC(kernel='linear', probability=True, random_state=42,class_weight='balanced')
svm_model.fit(X_train_tfidf, y_train)

# 5. Evaluate the Model

y_pred = svm_model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 6. Save the Model and Vectorizer

import joblib

joblib.dump(svm_model, '/kaggle/working/svm_publishability_model.pkl')
joblib.dump(vectorizer, '/kaggle/working/tfidf_vectorizer.pkl')

# 7. Make Predictions and Save Results for New Reference Papers

def predict_new_reference_papers(pdf_folder, model, vectorizer,
                                 results_path='/kaggle/working/new_reference_papers_results.csv'):
    """Predict publishability for every PDF in a folder and save the result as CSV.

    Args:
        pdf_folder: Directory containing the PDFs to score.
        model: Fitted classifier exposing ``predict(features)``.
        vectorizer: Fitted text vectorizer exposing ``transform(texts)``.
        results_path: Where the CSV is written. Defaults to the Kaggle working
            directory path this notebook has always used.

    Returns:
        None. Side effects: writes a CSV with the columns ``filename``,
        ``text`` and ``predicted_label`` to ``results_path``, prints the path,
        and (when IPython is importable) displays a download link.
    """
    filenames, texts = [], []

    # Sorted so the CSV row order is stable across runs (os.listdir is unordered).
    for filename in sorted(os.listdir(pdf_folder)):
        if not filename.lower().endswith('.pdf'):
            continue
        reader = PdfReader(os.path.join(pdf_folder, filename))
        # A page that yields None (no extractable text) counts as empty text.
        texts.append("".join(page.extract_text() or "" for page in reader.pages))
        filenames.append(filename)

    # With no PDFs there is nothing to transform or predict; skip the model so
    # an empty folder produces a header-only CSV instead of an exception.
    if texts:
        predictions = model.predict(vectorizer.transform(texts))
    else:
        predictions = []

    # Save predictions to CSV
    new_results = pd.DataFrame({
        'filename': filenames,
        'text': texts,
        'predicted_label': predictions
    })
    new_results.to_csv(results_path, index=False, escapechar='\\')
    print(f"Predictions for new reference papers saved to: {results_path}")

    # Provide download link. This is a notebook convenience only, so it is
    # skipped when IPython is not available.
    try:
        from IPython.display import FileLink, display
    except ImportError:
        return
    display(FileLink(results_path))

# Example Usage for New Reference Papers
# Scores the PDFs in pdf_folder with the publishability SVM and TF-IDF
# vectorizer fitted earlier in this cell; the function writes
# /kaggle/working/new_reference_papers_results.csv.
pdf_folder = '/kaggle/input/test-dataset/Papers'  # Path to new reference papers dataset
predict_new_reference_papers(pdf_folder, svm_model, vectorizer)


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 4.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

Confusion Matrix:
 [[1 0]
 [0 7]]
Predictions for new reference papers saved to: /kaggle/working/new_reference_papers_results.csv


In [3]:
# Addressing Class Imbalance with SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef, balanced_accuracy_score

# SMOTE synthesises each new point between a minority sample and one of its
# k nearest minority neighbours, so k must be smaller than the number of
# minority samples in the training split. Cap the preferred k=2 accordingly
# instead of letting a small split fail inside fit_resample.
minority_count = int(y_train.value_counts().min())
if minority_count < 2:
    raise ValueError(
        "SMOTE needs at least 2 training samples in the minority class; "
        f"found {minority_count}."
    )
smote_k_neighbors = min(2, minority_count - 1)

# Apply SMOTE to the training data only; the test split stays untouched so the
# metrics below are computed on real papers.
smote = SMOTE(random_state=42, k_neighbors=smote_k_neighbors)
X_train_tfidf_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Update SVM with class weights
svm_model_balanced = SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)
svm_model_balanced.fit(X_train_tfidf_balanced, y_train_balanced)

# Evaluate the balanced model
y_pred_balanced = svm_model_balanced.predict(X_test_tfidf)
print("Classification Report (Balanced):\n", classification_report(y_test, y_pred_balanced))
print("Confusion Matrix (Balanced):\n", confusion_matrix(y_test, y_pred_balanced))

# Adding additional metrics that are less flattering than plain accuracy when
# one class dominates the test split.
mcc = matthews_corrcoef(y_test, y_pred_balanced)
balanced_acc = balanced_accuracy_score(y_test, y_pred_balanced)
print(f"Matthews Correlation Coefficient: {mcc}")
print(f"Balanced Accuracy Score: {balanced_acc}")


Classification Report (Balanced):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         7

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8

Confusion Matrix (Balanced):
 [[1 0]
 [0 7]]
Matthews Correlation Coefficient: 1.0
Balanced Accuracy Score: 1.0


In [4]:
# Required Libraries
!pip install PyPDF2 scikit-learn numpy pandas

import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from PyPDF2 import PdfReader

# 1. Load Training Data for Conferences

def load_training_data(conference_folder):
    """Load conference-labelled training text from a folder of sub-folders.

    Each immediate sub-directory of ``conference_folder`` is treated as one
    conference (NeurIPS, CVPR, etc.); its name becomes the label of every PDF
    found directly inside it. Non-directory entries at the top level are
    ignored.

    Args:
        conference_folder: Root directory holding one sub-folder per conference.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists: the concatenated page
        text of each PDF and the name of the sub-folder it came from.
        Conferences and files are visited in sorted name order.
    """
    texts, labels = [], []

    # os.listdir() order is arbitrary; sorting both levels keeps the sample
    # order, and hence anything seeded that is fitted on it, reproducible.
    for conference in sorted(os.listdir(conference_folder)):
        conference_path = os.path.join(conference_folder, conference)
        if not os.path.isdir(conference_path):
            continue
        for filename in sorted(os.listdir(conference_path)):
            # Accept ".PDF" as well as ".pdf".
            if not filename.lower().endswith('.pdf'):
                continue
            reader = PdfReader(os.path.join(conference_path, filename))
            # A page that yields None (no extractable text) counts as empty.
            texts.append("".join(page.extract_text() or "" for page in reader.pages))
            labels.append(conference)

    return texts, labels

# Path to the labeled dataset for conferences
# NOTE: from here on this cell rebinds `texts`, `labels`, `data`, `vectorizer`
# and `svm_model`, names the publishability cell also assigns. After this cell
# runs they refer to the conference data/model, so re-running earlier cells
# out of order will pick up the wrong objects.
conference_folder = '/kaggle/input/publishable-task2/Publishable'  # Update with your Kaggle dataset folder
texts, labels = load_training_data(conference_folder)

# 2. Preprocess Data

# Convert to DataFrame for easier handling
# `label` holds the conference sub-folder name returned by load_training_data.
data = pd.DataFrame({'text': texts, 'label': labels})

# Feature Extraction Using TF-IDF
# The vectorizer is fitted on every labelled paper; no hold-out split is made
# in this cell, so no accuracy estimate is produced for the conference model.

vectorizer = TfidfVectorizer(max_features=15000,stop_words='english')
X_tfidf = vectorizer.fit_transform(data['text'])

# Train SVM Classifier
# Multi-class linear SVM over the conference names.

svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_tfidf, data['label'])

# Save the Trained Model and Vectorizer
# Written under separate file names from the publishability artefacts, so the
# two models do not overwrite each other on disk.
import joblib

joblib.dump(svm_model, '/kaggle/working/svm_conference_model.pkl')
joblib.dump(vectorizer, '/kaggle/working/tfidf_vectorizer_conference.pkl')

# 3. Load Publishable Papers from Task 1 and Predict Conferences

def predict_conferences(task1_results_path, model, vectorizer,
                        results_path='/kaggle/working/conference_classification_results.csv'):
    """Predict a target conference for each paper Task 1 labelled publishable.

    Args:
        task1_results_path: CSV produced by Task 1; must contain the columns
            ``filename``, ``text`` and ``predicted_label``.
        model: Fitted classifier exposing ``predict(features)`` and returning
            conference names.
        vectorizer: Fitted text vectorizer exposing ``transform(texts)``.
        results_path: Where the CSV is written. Defaults to the Kaggle working
            directory path this notebook has always used.

    Returns:
        None. Side effects: writes a CSV with the columns ``filename``,
        ``predicted_conference`` and ``rationale`` to ``results_path``, prints
        the path, and (when IPython is importable) displays a download link.
    """
    # Load Task 1 results and keep only the rows predicted publishable (label 1).
    task1_results = pd.read_csv(task1_results_path)
    publishable_papers = task1_results[task1_results['predicted_label'] == 1]

    # A paper whose extracted text was empty is read back from CSV as NaN;
    # turn it back into "" so the vectorizer only ever receives strings.
    publishable_texts = [
        '' if pd.isna(text) else str(text) for text in publishable_papers['text']
    ]
    publishable_filenames = publishable_papers['filename'].tolist()

    # Predict conferences. With no publishable papers there is nothing to
    # transform or predict, so skip the model and emit a header-only CSV.
    if publishable_texts:
        predictions = list(model.predict(vectorizer.transform(publishable_texts)))
    else:
        predictions = []

    # Fixed one-sentence description per known conference; any other predicted
    # label falls back to a generic message.
    conference_rationale = {
        'NeurIPS': 'NeurIPS focuses on artificial intelligence, machine learning, deep learning, reinforcement learning, and neural networks.',
        'CVPR': 'CVPR focuses on computer vision, image processing, pattern recognition, and object detection, with applications in autonomous systems.',
        'EMNLP': 'EMNLP deals with natural language processing (NLP), computational linguistics, and machine learning approaches for NLP.',
        'TMLR': 'TMLR focuses on novel machine learning techniques, methodologies, and frameworks, advancing research in machine learning.',
        'KDD': 'KDD emphasizes data mining, big data analytics, machine learning, and knowledge discovery from large datasets.'
    }
    rationales = [conference_rationale.get(pred, "No specific rationale available.") for pred in predictions]

    # Save predictions to CSV
    results = pd.DataFrame({
        'filename': publishable_filenames,
        'predicted_conference': predictions,
        'rationale': rationales
    })
    results.to_csv(results_path, index=False)
    print(f"Conference classification results saved to: {results_path}")

    # Provide download link. This is a notebook convenience only, so it is
    # skipped when IPython is not available.
    try:
        from IPython.display import FileLink, display
    except ImportError:
        return
    display(FileLink(results_path))

# Example Usage for Task 2
# Path to Task 1 results
# This is the CSV written by predict_new_reference_papers, so that cell must
# have been run first. `svm_model` and `vectorizer` here are the conference
# model and vectorizer fitted earlier in this cell, not the Task 1 ones.
task1_results_path = '/kaggle/working/new_reference_papers_results.csv'  # Update as needed
predict_conferences(task1_results_path, svm_model, vectorizer)




Conference classification results saved to: /kaggle/working/conference_classification_results.csv
