### Step 1: Load train_preprocessed.csv

In [1]:
import pandas as pd

# Read the preprocessed training table from the working directory.
train_df = pd.read_csv("train_preprocessed.csv")

# Report how many rows were read, then preview the first few as the cell output.
n_rows = train_df.shape[0]
print(f"✅ Loaded {n_rows} rows from train_preprocessed.csv")
train_df.head()


✅ Loaded 21215 rows from train_preprocessed.csv


Unnamed: 0,article_id,ref_id,clean_context,dataset_type
0,10.1002_2017jc013030,,"Volk and Hoffert, 1985;",Primary
1,10.1002_2017jc013030,,"Honjo et al., 2014;",Primary
2,10.1002_2017jc013030,,"Legendre et al., 2015",Primary
3,10.1002_2017jc013030,,"Riser and Johnson, 2008;",Primary
4,10.1002_2017jc013030,,"Graff et al., 2012",Primary


### Step 2: Encode labels and split dataset

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Turn the string dataset_type values into integer class codes (Primary=0, Secondary=1).
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['dataset_type'])

# Stratified 90/10 split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is row-level and happens before any cleaning, so rows that
# share an article_id can land on both sides and the final ratio drifts slightly from
# 90/10 once rows are dropped below — confirm this is acceptable for validation.
texts = train_df['clean_context']
labels = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)

# Discard missing (NaN) contexts from each split, then realign the labels by index label.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]

X_val = X_val.dropna()
y_val = y_val.loc[X_val.index]

print(f"✅ After cleaning: Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# Preview a handful of the remaining training contexts
X_train.head()


✅ After cleaning: Training samples: 17206, Validation samples: 1950


11019    Our previous attempts to identify putative ant...
9236     Sequences of the E1HVR1 region of the HCV geno...
7321                                                    2,
13613    A further six loci identified as F ST outliers...
17539                                                   19
Name: clean_context, dtype: object

In [3]:
# Keep only rows with at least 3 characters and not purely numeric
def is_valid(text):
    """Decide whether a context value is worth keeping as model input.

    Args:
        text: A single cell value; non-string values are converted with ``str``.

    Returns:
        bool: ``False`` for missing values (``None``, NaN, ``pd.NA``), for values
        whose stripped text is shorter than 3 characters, and for values whose
        stripped text consists only of numeric characters; ``True`` otherwise.
    """
    # Without this guard, str(None) / str(nan) become "None" / "nan", which are
    # long enough and non-numeric, so missing values would be accepted as text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return False
    stripped = str(text).strip()
    return len(stripped) > 2 and not stripped.isnumeric()

# Apply is_valid to every context in each split, keep the rows that pass,
# and select the labels that share the surviving index labels.
train_keep = X_train.apply(is_valid)
X_train = X_train[train_keep]
y_train = y_train.loc[X_train.index]

val_keep = X_val.apply(is_valid)
X_val = X_val[val_keep]
y_val = y_val.loc[X_val.index]

print(f"✅ After removing short/numeric rows: Training samples: {len(X_train)}, Validation samples: {len(X_val)}")


✅ After removing short/numeric rows: Training samples: 15445, Validation samples: 1730


### Step 3: TF-IDF vectorization

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: unigrams plus bigrams, feature count capped at 5000.
TFIDF_MAX_FEATURES = 5000
TFIDF_NGRAM_RANGE = (1, 2)
vectorizer = TfidfVectorizer(
    ngram_range=TFIDF_NGRAM_RANGE,
    max_features=TFIDF_MAX_FEATURES,
)

# The vectorizer is fitted on the training split only; the validation split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

train_shape, val_shape = X_train_tfidf.shape, X_val_tfidf.shape
print(f"✅ TF-IDF feature matrix shapes -> Train: {train_shape}, Validation: {val_shape}")


✅ TF-IDF feature matrix shapes -> Train: (15445, 5000), Validation: (1730, 5000)


### Step 4: Train baseline classifier

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fit a logistic-regression baseline (up to 500 solver iterations) on the TF-IDF features.
clf = LogisticRegression(max_iter=500).fit(X_train_tfidf, y_train)

# Score the held-out validation split.
y_pred = clf.predict(X_val_tfidf)

# f1_score is called with its defaults; the printed value matches the per-class F1
# of label 1 ("Secondary") in the report below.
acc, f1 = accuracy_score(y_val, y_pred), f1_score(y_val, y_pred)
print(f"✅ Baseline Accuracy: {acc:.4f}, F1-score: {f1:.4f}\n")

# Per-class precision / recall / F1, labelled with the encoder's class names.
report = classification_report(y_val, y_pred, target_names=le.classes_)
print("Classification Report:\n", report)


✅ Baseline Accuracy: 0.9104, F1-score: 0.8283

Classification Report:
               precision    recall  f1-score   support

     Primary       0.89      0.99      0.94      1210
   Secondary       0.98      0.72      0.83       520

    accuracy                           0.91      1730
   macro avg       0.93      0.86      0.88      1730
weighted avg       0.92      0.91      0.91      1730

