In [1]:
import pandas as pd
import numpy as np
import csv
# feature
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline

# models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import torch

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the venue-prediction training split; the file is semicolon-delimited.
train_venue = pd.read_csv(
    '/content/train-lishuxu-predict_venue-title+abstract.csv',
    sep=';',
)
# Preview the first rows as the cell output.
train_venue.head()

Unnamed: 0,id,title,abstract,label
0,L12-1253,A Repository for the Sustainable Management of...,This paper presents the system architecture as...,LREC
1,2020.emnlp-main.526,Substance over Style: Document-Level Targeted ...,Existing language models excel at writing from...,EMNLP
2,2021.emnlp-main.317,To be Closer: Learning to Link up Aspects with...,Dependency parse trees are helpful for discove...,EMNLP
3,L12-1613,Constraint Based Description of Polish Multiwo...,We present an approach to the description of P...,LREC
4,2020.emnlp-main.135,Tired of Topic Models? Clusters of Pretrained ...,Topic models are a useful analysis tool to unc...,EMNLP


In [3]:
# Read the venue-prediction test split; the file is semicolon-delimited.
test_venue = pd.read_csv(
    '/content/test-lishuxu-predict_venue-title+abstract.csv',
    sep=';',
)
# Preview the first rows as the cell output.
test_venue.head()

Unnamed: 0,id,title,abstract,label
0,D18-1415,A Teacher-Student Framework for Maintainable D...,Reinforcement learning (RL) is an attractive s...,EMNLP
1,2021.emnlp-main.746,Bridge to Target Domain by Prototypical Contra...,Zero-shot cross-domain slot filling alleviates...,EMNLP
2,2020.acl-main.181,What determines the order of adjectives in Eng...,We take up the scientific question of what det...,ACL
3,2021.emnlp-main.780,Exploiting Twitter as Source of Large Corpora ...,Semantic sentence embeddings are usually super...,EMNLP
4,2022.lrec-1.141,NyLLex: A Novel Resource of Swedish Words Anno...,"What makes a text easy to read or not, depends...",LREC


In [4]:
# Give each frame a `text` column: the title, a single space, then the abstract.
for venue_frame in (train_venue, test_venue):
    venue_frame['text'] = venue_frame['title'] + ' ' + venue_frame['abstract']

# Model inputs are the combined text; targets are the `label` column.
X_train, y_train = train_venue['text'], train_venue['label']
X_test, y_test = test_venue['text'], test_venue['label']

In [6]:
# Venue classifier: CountVectorizer with default settings feeding a
# logistic regression that is allowed up to 1000 solver iterations.
count_uni = Pipeline(
    steps=[
        ('count_uni_lr_venue', CountVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
)

In [31]:
# Train the count-vectorizer + logistic-regression venue pipeline.
count_uni.fit(X_train, y_train)

# Predict the test split and print the per-class precision/recall/F1 table
# (it includes overall accuracy). The recorded output of this cell is that
# table, and the SVC cells report the same way, so print the report rather
# than a bare accuracy number.
y_pred = count_uni.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ACL       0.48      0.47      0.48       500
       EMNLP       0.47      0.49      0.48       500
        LREC       0.79      0.77      0.78       500

    accuracy                           0.58      1500
   macro avg       0.58      0.58      0.58      1500
weighted avg       0.58      0.58      0.58      1500



In [32]:
# Venue classifier on unigram + bigram counts (ngram_range=(1, 2)) with
# logistic regression (up to 1000 solver iterations).
count_bi = Pipeline([
    ('count_bi_lr_venue', CountVectorizer(ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000)),
])

count_bi.fit(X_train, y_train)

# Print the per-class precision/recall/F1 table (it includes overall accuracy).
# The recorded output of this cell is that table, so print the report rather
# than a bare accuracy number.
y_pred_bi_lr = count_bi.predict(X_test)
print(classification_report(y_test, y_pred_bi_lr))

              precision    recall  f1-score   support

         ACL       0.50      0.48      0.49       500
       EMNLP       0.51      0.53      0.52       500
        LREC       0.82      0.83      0.82       500

    accuracy                           0.61      1500
   macro avg       0.61      0.61      0.61      1500
weighted avg       0.61      0.61      0.61      1500



In [20]:
# Venue classifier on bigram + trigram counts with logistic regression
# (up to 1000 solver iterations).
# NOTE(review): ngram_range=(2, 3) excludes unigrams, whereas the SVC "tri"
# pipeline for this task uses (1, 3) — confirm which range is intended.
count_tri = Pipeline([
    ('count_tri_lr_venue', CountVectorizer(ngram_range=(2, 3))),
    ('clf', LogisticRegression(max_iter=1000)),
])

count_tri.fit(X_train, y_train)

# Print the per-class precision/recall/F1 table (it includes overall accuracy).
# The recorded output of this cell is that table, so print the report rather
# than a bare accuracy number.
y_pred_tri_lr = count_tri.predict(X_test)
print(classification_report(y_test, y_pred_tri_lr))

              precision    recall  f1-score   support

         ACL       0.51      0.50      0.50       500
       EMNLP       0.51      0.48      0.49       500
        LREC       0.78      0.84      0.81       500

    accuracy                           0.61      1500
   macro avg       0.60      0.61      0.60      1500
weighted avg       0.60      0.61      0.60      1500



## SVC


In [33]:
# Venue prediction: default CountVectorizer features into an SVC with
# default hyper-parameters.
count_uni_svc = Pipeline(
    steps=[
        ('count_uni_svc_venue', CountVectorizer()),
        ('clf', SVC()),
    ]
)
count_uni_svc.fit(X_train, y_train)

# Per-class precision/recall/F1 on the test split.
y_pred_svc = count_uni_svc.predict(X_test)
svc_uni_report = classification_report(y_test, y_pred_svc)
print(svc_uni_report)

              precision    recall  f1-score   support

         ACL       0.51      0.56      0.53       500
       EMNLP       0.54      0.48      0.51       500
        LREC       0.81      0.82      0.82       500

    accuracy                           0.62      1500
   macro avg       0.62      0.62      0.62      1500
weighted avg       0.62      0.62      0.62      1500



In [36]:
# Venue prediction: unigram + bigram counts into an SVC with default
# hyper-parameters.
bi_svc_vectorizer = CountVectorizer(ngram_range=(1, 2))
count_bi_svc = Pipeline([
    ('count_bi_svc_venue', bi_svc_vectorizer),
    ('clf', SVC()),
])
count_bi_svc.fit(X_train, y_train)

# Per-class precision/recall/F1 on the test split.
y_pred_svc_bi = count_bi_svc.predict(X_test)
print(classification_report(y_test, y_pred_svc_bi))

              precision    recall  f1-score   support

         ACL       0.51      0.59      0.55       500
       EMNLP       0.55      0.45      0.50       500
        LREC       0.81      0.83      0.82       500

    accuracy                           0.63      1500
   macro avg       0.63      0.63      0.62      1500
weighted avg       0.63      0.63      0.62      1500



In [35]:
# Venue prediction: unigram-to-trigram counts (ngram_range=(1, 3)) into an SVC
# with default hyper-parameters.
count_tri_svc = Pipeline(
    steps=[
        ('count_tri_svc_venue', CountVectorizer(ngram_range=(1, 3))),
        ('clf', SVC()),
    ]
)

# `fit` returns the fitted pipeline itself, so training and prediction chain.
y_pred_svc_tri = count_tri_svc.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred_svc_tri))

              precision    recall  f1-score   support

         ACL       0.45      0.91      0.61       500
       EMNLP       0.68      0.12      0.21       500
        LREC       0.86      0.70      0.77       500

    accuracy                           0.58      1500
   macro avg       0.67      0.58      0.53      1500
weighted avg       0.67      0.58      0.53      1500



In [15]:
# Read the article-length training split (semicolon-delimited) and discard
# every row that has a missing value in any column.
train_length = pd.read_csv(
    '/content/train-lishuxu-article_length-title+abstract.csv',
    sep=';',
).dropna()
# Preview the first rows as the cell output.
train_length.head()

Unnamed: 0,id,title,abstract,label
0,2022.coling-1.574,Measuring Geographic Performance Disparities o...,Text classifiers are applied at scale in the f...,long
1,2021.acl-long.60,From Discourse to Narrative: Knowledge Project...,Current event-centric knowledge graphs highly ...,long
2,2022.dclrl-1.4,Building an Icelandic Entity Linking Corpus,"In this paper, we present the first Entity Lin...",long
3,N18-1001,Label-Aware Double Transfer Learning for Cross...,We study the problem of named entity recogniti...,long
4,2021.ranlp-1.10,A Multi-Pass Sieve Coreference Resolution for ...,Coreference resolution is an NLP task to find ...,short


In [14]:
# Read the article-length test split (semicolon-delimited) and discard
# every row that has a missing value in any column.
test_length = pd.read_csv(
    '/content/test-lishuxu-article_length-title+abstract.csv',
    sep=';',
).dropna()
# Preview the first rows as the cell output.
test_length.head()

Unnamed: 0,id,title,abstract,label
0,D18-1510,Greedy Search with Probabilistic N-gram Matchi...,Neural machine translation (NMT) models are us...,short
1,L14-1077,Creating Summarization Systems with SUMMA,"Automatic text summarization, the reduction of...",short
2,2020.aacl-main.65,DAPPER: Learning Domain-Adapted Persona Repres...,Research in building intelligent agents have e...,long
3,2020.coling-main.193,Generating Diverse Corrections with Local Beam...,"In this study, we propose a beam search method...",short
4,W19-4327,Best Practices for Learning Domain-Specific Cr...,Cross-lingual embeddings aim to represent word...,short


In [17]:
# Inputs: title and abstract joined by a single space. Targets: the `label` column.
X_train_length, y_train_length = (
    train_length['title'] + ' ' + train_length['abstract'],
    train_length['label'],
)
X_test_length, y_test_length = (
    test_length['title'] + ' ' + test_length['abstract'],
    test_length['label'],
)

In [18]:
# Article-length classifier: default CountVectorizer features into a logistic
# regression that is allowed up to 1000 solver iterations.
count_uni_length = Pipeline(
    steps=[
        ('count_uni', CountVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
)
count_uni_length.fit(X_train_length, y_train_length)

# Per-class precision/recall/F1 on the length test split.
y_pred_length = count_uni_length.predict(X_test_length)
length_uni_report = classification_report(y_test_length, y_pred_length)
print(length_uni_report)


              precision    recall  f1-score   support

        long       0.66      0.63      0.64      1598
       short       0.64      0.66      0.65      1591

    accuracy                           0.65      3189
   macro avg       0.65      0.65      0.65      3189
weighted avg       0.65      0.65      0.65      3189



In [22]:
# Article-length classifier on unigram + bigram counts (ngram_range=(1, 2))
# with logistic regression (up to 1000 solver iterations).
count_bi_length = Pipeline([
    ('count_bi', CountVectorizer(ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000)),
])

count_bi_length.fit(X_train_length, y_train_length)

# Print the per-class precision/recall/F1 table (it includes overall accuracy).
# The recorded output of this cell is that table, and the unigram length cell
# reports the same way, so print the report rather than a bare accuracy number.
y_pred_length2 = count_bi_length.predict(X_test_length)
print(classification_report(y_test_length, y_pred_length2))

              precision    recall  f1-score   support

        long       0.69      0.67      0.68      1598
       short       0.68      0.69      0.68      1591

    accuracy                           0.68      3189
   macro avg       0.68      0.68      0.68      3189
weighted avg       0.68      0.68      0.68      3189



In [23]:
# Article-length classifier on bigram + trigram counts with logistic
# regression (up to 1000 solver iterations).
# NOTE(review): ngram_range=(2, 3) excludes unigrams, whereas the SVC "tri"
# length pipeline uses (1, 3) — confirm which range is intended.
count_tri_length = Pipeline([
    ('count_tri', CountVectorizer(ngram_range=(2, 3))),
    ('clf', LogisticRegression(max_iter=1000)),
])

count_tri_length.fit(X_train_length, y_train_length)

# Print the per-class precision/recall/F1 table (it includes overall accuracy).
# The recorded output of this cell is that table, so print the report rather
# than a bare accuracy number.
y_pred_length3 = count_tri_length.predict(X_test_length)
print(classification_report(y_test_length, y_pred_length3))



              precision    recall  f1-score   support

        long       0.69      0.68      0.68      1598
       short       0.68      0.69      0.68      1591

    accuracy                           0.68      3189
   macro avg       0.68      0.68      0.68      3189
weighted avg       0.68      0.68      0.68      3189



In [37]:
# Article-length classifier: default CountVectorizer features into an SVC
# with default hyper-parameters.
count_uni_svc_length = Pipeline([
    ('count_uni_svc', CountVectorizer()),
    ('clf', SVC()),
])

count_uni_svc_length.fit(X_train_length, y_train_length)

# Print the per-class precision/recall/F1 table (it includes overall accuracy).
# The recorded output of this cell is that table, so print the report rather
# than a bare accuracy number.
y_pred_length_svc = count_uni_svc_length.predict(X_test_length)
print(classification_report(y_test_length, y_pred_length_svc))

              precision    recall  f1-score   support

        long       0.70      0.69      0.69      1598
       short       0.69      0.71      0.70      1591

    accuracy                           0.70      3189
   macro avg       0.70      0.70      0.70      3189
weighted avg       0.70      0.70      0.70      3189



In [38]:
# Article-length classifier on unigram + bigram counts (ngram_range=(1, 2))
# into an SVC with default hyper-parameters.
count_bi_svc_length = Pipeline([
    ('count_bi_svc', CountVectorizer(ngram_range=(1, 2))),
    ('clf', SVC()),
])

count_bi_svc_length.fit(X_train_length, y_train_length)

# Print the per-class precision/recall/F1 table (it includes overall accuracy).
# The recorded output of this cell is that table, so print the report rather
# than a bare accuracy number.
y_pred_length_svc2 = count_bi_svc_length.predict(X_test_length)
print(classification_report(y_test_length, y_pred_length_svc2))

              precision    recall  f1-score   support

        long       0.71      0.68      0.69      1598
       short       0.69      0.72      0.70      1591

    accuracy                           0.70      3189
   macro avg       0.70      0.70      0.70      3189
weighted avg       0.70      0.70      0.70      3189



In [None]:
# Article-length classifier on unigram-to-trigram counts (ngram_range=(1, 3))
# into an SVC with default hyper-parameters.
count_tri_svc_length = Pipeline(
    steps=[
        ('count_tri_svc', CountVectorizer(ngram_range=(1, 3))),
        ('clf', SVC()),
    ]
)

# `fit` returns the fitted pipeline itself, so training and prediction chain.
y_pred_length_svc3 = count_tri_svc_length.fit(
    X_train_length, y_train_length
).predict(X_test_length)

# Overall accuracy on the length test split.
print(accuracy_score(y_test_length, y_pred_length_svc3))

In [None]:
# Overall accuracy of the `y_pred_length_svc3` predictions against the length test labels.
svc3_length_accuracy = accuracy_score(y_test_length, y_pred_length_svc3)
print(svc3_length_accuracy)