In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ml4cpsp1/train.json
/kaggle/input/ml4cpsp1/test.json


In [4]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas / scikit-learn / xgboost, so API removals in those libraries only
# show up as hard errors later — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

# ===================== DATA PREPROCESSING ========================
# The directory listing printed by the first cell shows the files under
# /kaggle/input/ml4cpsp1/, so read from that dataset folder (the previous
# 'ml4cps' path does not appear in that listing).
# Each JSON file is transposed after loading; the code below then accesses
# fields such as 'contributor', 'article_venue' and 'article_year' as columns.
DATA_DIR = '/kaggle/input/ml4cpsp1'
df_train = pd.read_json(f'{DATA_DIR}/train.json').T
df_test = pd.read_json(f'{DATA_DIR}/test.json').T

def ensure_list(x):
    """Return ``x`` itself when it is a list; otherwise wrap it in a one-element list."""
    if isinstance(x, list):
        return x
    return [x]
# Make sure every 'contributor' cell holds a list in both the train and test frames.
for _frame in (df_train, df_test):
    _frame['contributor'] = _frame['contributor'].apply(ensure_list)

# Nested count tables filled from the training rows:
#   venue_contrib_history[contributor][venue] -> occurrences of that contributor with that venue
#   year_contrib_history[contributor][year]   -> occurrences of that contributor with that year
#   coauthor_counts[a][b]                     -> occurrences of a and b (a != b) on the same row
venue_contrib_history = {}
year_contrib_history = {}
coauthor_counts = {}
for _, article in df_train.iterrows():
    venue = article['article_venue']
    year = article['article_year']
    contribs = article['contributor']

    # Per-contributor venue and year tallies.
    for person in contribs:
        per_venue = venue_contrib_history.setdefault(person, {})
        per_venue[venue] = per_venue.get(venue, 0) + 1
        per_year = year_contrib_history.setdefault(person, {})
        per_year[year] = per_year.get(year, 0) + 1

    # Ordered pairs of distinct contributors on the same row.
    for person in contribs:
        for partner in contribs:
            if person == partner:
                continue
            partners = coauthor_counts.setdefault(person, {})
            partners[partner] = partners.get(partner, 0) + 1

def expand_positives(df):
    """Turn each row of ``df`` into one positive (label 1) record per listed contributor.

    For every entry of a row's 'contributor' list a record is produced with that
    entry as 'candidate' and the remaining, different-valued entries as
    'contributor'. The row's 'id' is stored as 'article_id'; 'article_venue',
    'article_year' and 'text' are copied through.
    """
    records = [
        {
            'article_id': article['id'],
            'article_venue': article['article_venue'],
            'article_year': article['article_year'],
            'text': article['text'],
            'candidate': candidate,
            'contributor': [other for other in article['contributor'] if other != candidate],
            'label': 1,
        }
        for _, article in df.iterrows()
        for candidate in article['contributor']
    ]
    return pd.DataFrame(records)

def generate_negatives(df, all_contrib, n_neg=2, seed=42):
    """Sample negative (label 0) article/candidate records.

    For every row of ``df``, up to ``n_neg`` candidates are drawn without
    replacement from the members of ``all_contrib`` that do not appear in the
    row's 'contributor' list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'id', 'article_venue', 'article_year',
        'text' and 'contributor' (an iterable of hashable values).
    all_contrib : iterable
        Pool of contributors that negatives are drawn from.
    n_neg : int, default 2
        Maximum number of negatives per row; fewer are produced when the
        pool is too small.
    seed : int, default 42
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        One record per sampled negative with the keys 'article_id',
        'article_venue', 'article_year', 'text', 'candidate', 'contributor'
        and 'label' (always 0).
    """
    rng = np.random.default_rng(seed)
    # Set iteration order can differ between interpreter runs (string hashing
    # is randomised), so a seed alone does not make the draw reproducible.
    # Fix a canonical pool order once, outside the row loop.
    candidate_pool = sorted(all_contrib, key=repr)
    negs = []
    for _, row in df.iterrows():
        # De-duplicate the true contributors but keep the order in which the
        # article lists them, so the stored list is deterministic as well.
        true_list = list(dict.fromkeys(row['contributor']))
        true_cont = set(true_list)
        possible_negs = [c for c in candidate_pool if c not in true_cont]
        n_draw = min(n_neg, len(possible_negs))
        if n_draw == 0:
            continue
        # Draw positions instead of values: choosing from the values directly
        # would first convert them to a NumPy array and hand back NumPy
        # scalars rather than the original Python objects.
        for idx in rng.choice(len(possible_negs), size=n_draw, replace=False):
            negs.append({'article_id': row['id'], 'article_venue': row['article_venue'], 'article_year': row['article_year'],
                         'text': row['text'], 'candidate': possible_negs[idx], 'contributor': list(true_list), 'label': 0})
    return pd.DataFrame(negs)

# Positive pairs from the real contributor lists, negatives sampled from every
# contributor seen in training; both are stacked into a single labelled frame.
df_train_pos = expand_positives(df_train)
all_contrib = {person for contribs in df_train['contributor'] for person in contribs}
df_train_neg = generate_negatives(df_train, all_contrib)
training_parts = [df_train_pos, df_train_neg]
df_train_full = pd.concat(training_parts, ignore_index=True)

def to_joined_string(x):
    """Render ``x`` as text: list items are stringified and space-joined, anything else goes through ``str``."""
    if not isinstance(x, list):
        return str(x)
    return " ".join(str(item) for item in x)
# Flatten the 'text' and 'contributor' fields into plain strings for the TF-IDF vectorizers.
for _frame in (df_train_full, df_test):
    _frame['text_str'] = _frame['text'].apply(to_joined_string)
    _frame['cont_str'] = _frame['contributor'].apply(to_joined_string)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Two word-level uni+bigram TF-IDF models: one over the article text (600 features)
# and one over the joined contributor string (300 features).
_ngram_opts = {'ngram_range': (1, 2), 'analyzer': 'word'}
vectorizer_text = TfidfVectorizer(max_features=600, **_ngram_opts)
vectorizer_cont = TfidfVectorizer(max_features=300, **_ngram_opts)

# Each vectorizer is fitted on the training pairs only; the test frame is just transformed.
Tfidf_text_train = vectorizer_text.fit_transform(df_train_full['text_str'])
Tfidf_text_test = vectorizer_text.transform(df_test['text_str'])
Tfidf_cont_train = vectorizer_cont.fit_transform(df_train_full['cont_str'])
Tfidf_cont_test = vectorizer_cont.transform(df_test['cont_str'])

# Cast venues to str and replace empty strings with the placeholder 'missing'.
# NOTE(review): the history tables built earlier are keyed by the raw venue values,
# so a venue altered here will no longer match those keys — confirm this is intended.
for _frame in (df_train_full, df_test):
    _frame['article_venue'] = _frame['article_venue'].astype(str).replace('', 'missing')

# Dense one-hot encoding fitted on the training venues; venues not seen during
# fitting are ignored at transform time (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
venue_ohe_train = ohe.fit_transform(df_train_full[['article_venue']])
venue_ohe_test = ohe.transform(df_test[['article_venue']])
venue_cols = ohe.get_feature_names_out(['article_venue'])
venue_df_train = pd.DataFrame(venue_ohe_train, index=df_train_full.index, columns=venue_cols)
venue_df_test = pd.DataFrame(venue_ohe_test, index=df_test.index, columns=venue_cols)

def add_hist_features(row):
    """Compute six numeric meta features for one (article, candidate) row.

    Reads 'candidate', 'article_venue', 'article_year', 'contributor' and
    'text_str' from ``row`` and looks the candidate up in the module-level
    tables ``venue_contrib_history``, ``year_contrib_history`` and
    ``coauthor_counts`` (missing entries count as 0).

    Returns a ``pd.Series`` holding, in order: the candidate's count for the
    row's venue, the candidate's count for the row's year, the summed
    co-author counts between the candidate and the row's contributors, the
    number of distinct whitespace-separated tokens in the text, the number of
    contributors, and the text length in characters.
    """
    candidate = row['candidate']
    coauthors = row['contributor']
    text = row['text_str']
    shared = coauthor_counts.get(candidate, {})
    features = [
        venue_contrib_history.get(candidate, {}).get(row['article_venue'], 0),
        year_contrib_history.get(candidate, {}).get(row['article_year'], 0),
        sum(shared.get(other, 0) for other in coauthors),
        len(set(text.split())),
        len(coauthors),
        len(text),
    ]
    return pd.Series(features)

# Names for the six values returned by add_hist_features, in the same order.
meta_cols = ['venue_count', 'year_count', 'coauthor_count', 'uniq_words', 'num_contrib', 'text_len']

# Use the same helper for both frames: the test-side lambda was a hand-copied
# duplicate of add_hist_features, so any future edit to one would silently
# desynchronise train and test features.
# NOTE(review): this (like the code it replaces) requires df_test to provide a
# 'candidate' column — confirm against the test file's schema.
df_train_full[meta_cols] = df_train_full.apply(add_hist_features, axis=1)
df_test[meta_cols] = df_test.apply(add_hist_features, axis=1)

# Standardise the meta features using statistics learned from the training frame only.
scaler = StandardScaler()
scaler.fit(df_train_full[meta_cols])
hist_train = scaler.transform(df_train_full[meta_cols])
hist_test = scaler.transform(df_test[meta_cols])
hist_df_train = pd.DataFrame(hist_train, index=df_train_full.index, columns=meta_cols)
hist_df_test = pd.DataFrame(hist_test, index=df_test.index, columns=meta_cols)

# Densify the TF-IDF matrices into DataFrames; column names carry a 'text__' / 'cont__'
# prefix so the two vocabularies stay distinguishable.
text_feature_names = [f"text__{f}" for f in vectorizer_text.get_feature_names_out()]
cont_feature_names = [f"cont__{f}" for f in vectorizer_cont.get_feature_names_out()]

tfidf_text_train_df = pd.DataFrame(Tfidf_text_train.toarray(), index=df_train_full.index, columns=text_feature_names)
tfidf_cont_train_df = pd.DataFrame(Tfidf_cont_train.toarray(), index=df_train_full.index, columns=cont_feature_names)
tfidf_text_test_df = pd.DataFrame(Tfidf_text_test.toarray(), index=df_test.index, columns=text_feature_names)
tfidf_cont_test_df = pd.DataFrame(Tfidf_cont_test.toarray(), index=df_test.index, columns=cont_feature_names)

# Put the feature groups side by side (venue one-hot, scaled meta, text TF-IDF,
# contributor TF-IDF). Indexes are reset first so rows line up by position.
train_parts = [venue_df_train, hist_df_train, tfidf_text_train_df, tfidf_cont_train_df]
test_parts = [venue_df_test, hist_df_test, tfidf_text_test_df, tfidf_cont_test_df]
train_features = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
test_features = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

# Give the test matrix exactly the training columns, in training order:
# columns missing from test are created and filled with 0, columns unknown to
# training are dropped. A single reindex replaces the column-at-a-time insert
# and the in-place drop loops.
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)
# Plain NumPy inputs for the models: float32 feature matrices and integer labels.
X = train_features.to_numpy().astype(np.float32)
X_test = test_features.to_numpy().astype(np.float32)
y = df_train_full['label'].to_numpy().astype(int)

# ========== FAST ENSEMBLE TRAINING & PREDICTION ==========
# Hold out 15% of the labelled pairs for validation, stratified on the label.
_split_opts = {'test_size': 0.15, 'stratify': y, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **_split_opts)

# XGBoost
# early_stopping_rounds is set on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and is no longer accepted by current releases,
# where the old call fails with a TypeError. Training stops once the metric on
# the validation eval_set has not improved for 30 rounds.
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.037, subsample=0.9,
    colsample_bytree=0.8, reg_alpha=0.5, reg_lambda=2.0, n_jobs=-1, verbosity=1, random_state=42,
    early_stopping_rounds=30)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)
# Keep only the probability of the positive class (column 1).
xgb_val = xgb_model.predict_proba(X_val)[:,1]
xgb_test = xgb_model.predict_proba(X_test)[:,1]

# RandomForest
rf_model = RandomForestClassifier(
    n_estimators=120,
    max_depth=11,
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(X_train, y_train)
# Positive-class probabilities for the validation and test matrices.
rf_val, rf_test = (rf_model.predict_proba(matrix)[:, 1] for matrix in (X_val, X_test))

# Logistic Regression
lr_model = LogisticRegression(
    max_iter=600,
    random_state=42,
)
lr_model.fit(X_train, y_train)
# Positive-class probabilities for the validation and test matrices.
lr_val, lr_test = (lr_model.predict_proba(matrix)[:, 1] for matrix in (X_val, X_test))

# Blend predictions: unweighted mean of the three models' positive-class probabilities.
val_blend = (xgb_val + rf_val + lr_val) / 3
test_blend = (xgb_test + rf_test + lr_test) / 3

# Scan 101 evenly spaced thresholds in [0.3, 0.7] and keep the first one that reaches
# the highest validation accuracy (falling back to 0.5 if no threshold scores above 0).
thresholds = np.linspace(0.3, 0.7, 101)
accuracies = [accuracy_score(y_val, (val_blend > cut).astype(int)) for cut in thresholds]
top = max(range(len(thresholds)), key=accuracies.__getitem__)
if accuracies[top] > 0:
    best_thr, best_acc = thresholds[top], accuracies[top]
else:
    best_thr, best_acc = 0.5, 0

print(f'Best validation accuracy (ensemble): {best_acc:.4f} at threshold {best_thr:.3f}')

# Apply the selected threshold to the blended test probabilities.
y_pred = (test_blend > best_thr).astype(int)

In [None]:
# Write the submission file with two columns: 'prediction' and a 1-based running 'id'.
# NOTE(review): ids are generated as 1..N in df_test row order — confirm this matches
# the id scheme the competition expects.
submission_ids = range(1, len(y_pred) + 1)
df_pred = pd.DataFrame(y_pred, columns=['prediction'])
df_pred = df_pred.assign(id=submission_ids)
df_pred.to_csv('ypredl11.csv', index=False)