In [23]:
from sklearn.model_selection import train_test_split
import re
import unicodedata2
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import csr_matrix


In [24]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [25]:
def basic_clean(text):
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    text = (unicodedata2.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [26]:
words_tr = basic_clean(''.join(str(train['ABSTRACT'].tolist())))
words_tst = basic_clean(''.join(str(test['ABSTRACT'].tolist())))

In [27]:
target_col=['Analysis of PDEs', 'Applications',
       'Artificial Intelligence', 'Astrophysics of Galaxies',
       'Computation and Language', 'Computer Vision and Pattern Recognition',
       'Cosmology and Nongalactic Astrophysics',
       'Data Structures and Algorithms', 'Differential Geometry',
       'Earth and Planetary Astrophysics', 'Fluid Dynamics',
       'Information Theory', 'Instrumentation and Methods for Astrophysics',
       'Machine Learning', 'Materials Science', 'Methodology', 'Number Theory',
       'Optimization and Control', 'Representation Theory', 'Robotics',
       'Social and Information Networks', 'Statistics Theory',
       'Strongly Correlated Electrons', 'Superconductivity',
       'Systems and Control']

In [28]:
topic_col=['Computer Science', 'Mathematics', 'Physics', 'Statistics']

In [29]:
def get_best_thresholds(true, preds):
    thresholds = [i/100 for i in range(100)]
    best_thresholds = []
    for idx in range(25):
        f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1) for thresh in thresholds]
        best_thresh = thresholds[np.argmax(f1_scores)]
        best_thresholds.append(best_thresh)
        
    return best_thresholds

In [30]:
trn, val = train_test_split(train, test_size=0.2, random_state=2)

In [31]:
vec = CountVectorizer(max_features=30000)
combined = words_tr+words_tst
vec.fit(combined)

trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])

trn1 = np.hstack((trn_abs.toarray(), trn[topic_col]))
val1 = np.hstack((val_abs.toarray(), val[topic_col]))
tst1 = np.hstack((tst_abs.toarray(), test[topic_col]))


In [32]:
trn1 = csr_matrix(trn1.astype('int16'))
val1 = csr_matrix(val1.astype('int16'))
tst1 = csr_matrix(tst1.astype('int16'))

In [33]:
%%time

clf1 = OneVsRestClassifier(LogisticRegression(C = 2, n_jobs=-1))
clf1.fit(trn1, trn[target_col])

val_preds = clf1.predict_proba(val1)
best_thresholds = get_best_thresholds(val[target_col].values, val_preds)

for i, thresh in enumerate(best_thresholds):
  val_preds[:, i] = (val_preds[:, i] > thresh) * 1

f1_score(val[target_col], val_preds, average='micro')

Wall time: 33.9 s


In [34]:
f1_score(val[target_col], val_preds, average='micro')

0.7257266150526717

In [35]:
preds_test = clf.predict_proba(tst2)

for i, thresh in enumerate(best_thresholds):
    preds_test[:, i] = (preds_test[:, i] > thresh) * 1

In [None]:
vec2 = CountVectorizer(max_features=40000)
_ = vec2.fit(list(train['ABSTRACT']) + list(test['ABSTRACT']))

trn_abs = vec2.transform(trn['ABSTRACT'])
val_abs = vec2.transform(val['ABSTRACT'])
tst_abs = vec2.transform(test['ABSTRACT'])
print(trn_abs.shape, val_abs.shape, tst_abs.shape)

trn2 = np.hstack((trn_abs.toarray(), trn[topic_col]))
val2 = np.hstack((val_abs.toarray(), val[topic_col]))
tst2 = np.hstack((tst_abs.toarray(), test[topic_col]))

In [None]:
trn2 = csr_matrix(trn2.astype('int16'))
val2 = csr_matrix(val2.astype('int16'))
tst2 = csr_matrix(tst2.astype('int16'))

In [None]:
clf2 = OneVsRestClassifier(LogisticRegression(C = 2, n_jobs=-1))
clf2.fit(trn2, trn[target_col])

val_preds = clf2.predict_proba(val2)
best_thresholds = get_best_thresholds(val[target_col].values, val_preds)

for i, thresh in enumerate(best_thresholds):
  val_preds[:, i] = (val_preds[:, i] > thresh) * 1

f1_score(val[target_col], val_preds, average='micro')

In [36]:
submission = pd.DataFrame()
submission['id'] = test['id']
submission[target_col] = preds_test
submission.to_csv('sol4.csv', header=True, index=False)