In [1]:
from sklearn.model_selection import train_test_split
import re
import unicodedata2
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import numpy as np
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

STOP=['and','have','been','is','are','a','an','the','or','has','many']
def basic_clean(text):
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')+STOP
    text = (unicodedata2.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    #return words
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


def get_best_thresholds(true, preds):
    thresholds = [i/100 for i in range(100)]
    best_thresholds = []
    for idx in range(25):
        f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1)
                     for thresh in thresholds]
        best_thresh = thresholds[np.argmax(f1_scores)]
        best_thresholds.append(best_thresh)

    return best_thresholds

In [2]:
def clean_text(text):
    text = re.sub("\'", "", text) 
    text = re.sub("[^a-zA-Z]"," ",text) 
    text = ' '.join(text.split())  
    text = text.lower() 
    
    return text

In [3]:
train['ABSTRACT']=train['ABSTRACT'].apply(lambda x: clean_text(x))

In [4]:
train['ABSTRACT']

0        a ever growing datasets inside observational a...
1        we propose the framework considering optimal t...
2        nanostructures with open shell transition meta...
3        stars are self gravitating fluids inside which...
4        deep neural perception and control networks ar...
                               ...                        
13999    a methodology of automatic detection of a even...
14000    we consider a case inside which the robot has ...
14001    despite being usually considered two competing...
14002    we present the framework and its implementatio...
14003    here we report small angle neutron scattering ...
Name: ABSTRACT, Length: 14004, dtype: object

In [5]:
stop_words =  nltk.corpus.stopwords.words('english')+['n','k','x','c','r','h','g','p','inside','considering']
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

In [6]:
train['ABSTRACT']=train['ABSTRACT'].apply(lambda x: remove_stopwords(x))

In [7]:
train['ABSTRACT']

0        ever growing datasets observational astronomy ...
1        propose framework optimal matchings excluding ...
2        nanostructures open shell transition metal mol...
3        stars self gravitating fluids pressure buoyanc...
4        deep neural perception control networks likely...
                               ...                        
13999    methodology automatic detection event basis in...
14000    consider case robot navigate unknown environme...
14001    despite usually considered two competing pheno...
14002    present framework implementation relying natur...
14003    report small angle neutron scattering sans mea...
Name: ABSTRACT, Length: 14004, dtype: object

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
def freq_words(x, terms = 30): 
  all_words = ' '.join([text for text in x]) 
  all_words = all_words.split() 
  fdist = nltk.FreqDist(all_words) 
  words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())}) 
  
  # selecting top 20 most frequent words 
  d = words_df.nlargest(columns="count", n = terms) 
  
  # visualize words and frequencies
  plt.figure(figsize=(12,15)) 
  ax = sns.barplot(data=d, x= "count", y = "word") 
  ax.set(ylabel = 'Word') 
  plt.show()

In [10]:
test['ABSTRACT']=test['ABSTRACT'].apply(lambda x: clean_text(x))
test['ABSTRACT']=test['ABSTRACT'].apply(lambda x: remove_stopwords(x))


In [11]:
vec = CountVectorizer(max_features=40000)
combined = list(train['ABSTRACT']) + list(test['ABSTRACT'])
vec.fit(combined)

CountVectorizer(max_features=40000)

In [12]:
trn, val = train_test_split(train, test_size=0.2, random_state=2)

In [14]:
TOPIC_COLS=['Computer Science', 'Mathematics', 'Physics', 'Statistics']

In [15]:
trn_abs = vec.transform(trn['ABSTRACT'])
val_abs = vec.transform(val['ABSTRACT'])
tst_abs = vec.transform(test['ABSTRACT'])

trn2 = np.hstack((trn_abs.toarray(), trn[TOPIC_COLS]))
val2 = np.hstack((val_abs.toarray(), val[TOPIC_COLS]))
tst2 = np.hstack((tst_abs.toarray(), test[TOPIC_COLS]))


In [16]:
print(trn2.shape, val2.shape, tst2.shape)

(11203, 40004) (2801, 40004) (6002, 40004)


In [17]:
trn2 = csr_matrix(trn2.astype('int16'))
val2 = csr_matrix(val2.astype('int16'))
tst2 = csr_matrix(tst2.astype('int16'))

In [18]:
trn2.dtype

dtype('int16')

In [19]:
TARGET_COLS=['Analysis of PDEs', 'Applications',
       'Artificial Intelligence', 'Astrophysics of Galaxies',
       'Computation and Language', 'Computer Vision and Pattern Recognition',
       'Cosmology and Nongalactic Astrophysics',
       'Data Structures and Algorithms', 'Differential Geometry',
       'Earth and Planetary Astrophysics', 'Fluid Dynamics',
       'Information Theory', 'Instrumentation and Methods for Astrophysics',
       'Machine Learning', 'Materials Science', 'Methodology', 'Number Theory',
       'Optimization and Control', 'Representation Theory', 'Robotics',
       'Social and Information Networks', 'Statistics Theory',
       'Strongly Correlated Electrons', 'Superconductivity',
       'Systems and Control']

In [20]:
%%time
clf = OneVsRestClassifier(LogisticRegression(C = 2, n_jobs=-1))
_  = clf.fit(trn2, trn[TARGET_COLS])

Wall time: 32.3 s


In [22]:
target_col=TARGET_COLS

In [55]:
val_preds = clf.predict_proba(val2)

In [56]:
best_thresholds = get_best_thresholds(val[target_col].values, val_preds)

for i, thresh in enumerate(best_thresholds):
    val_preds[:, i] = (val_preds[:, i] > thresh) * 1

In [57]:
def score(target,predict):
    print("f1:", f1_score(target, predict, average='micro'))
    print("Precision:", precision_score(target, predict, average='micro'))
    print("Recall:",recall_score(target, predict, average='micro'))

In [58]:
score(val[target_col], val_preds)

f1: 0.7269837388433794
Precision: 0.6714092140921409
Recall: 0.7925886430285257


In [27]:
preds_test = clf.predict_proba(tst2)

for i, thresh in enumerate(best_thresholds):
  preds_test[:, i] = (preds_test[:, i] > thresh) * 1

In [28]:
submission = pd.DataFrame()
submission['id'] = test['id']
submission[target_col] = preds_test
submission.to_csv('sol7.csv', header=True, index=False)

In [29]:
train['text'] = ' '
test['text'] = ' '


train['text'] += train['ABSTRACT']
test['text'] += test['ABSTRACT']

trn, val = train_test_split(train, test_size=0.2, random_state=2)

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tok = Tokenizer(num_words = 1000000)
tok.fit_on_texts(train['text'].str.lower().tolist() + test['text'].str.lower().tolist())

vocab_size = len(tok.word_index) + 1

In [31]:
X_trn = tok.texts_to_sequences(trn['text'])
X_val = tok.texts_to_sequences(val['text'])
X_test = tok.texts_to_sequences(test['text'])

In [35]:
maxlen = 200
X_trn = pad_sequences(X_trn, maxlen=maxlen)
X_val = pad_sequences(X_val, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [46]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, SpatialDropout1D, LSTM
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Convolution1D


embedding_dim = 50
vocab_size = len(tok.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    input_length=maxlen))

model.add(Conv1D(64, 3, activation='sigmoid'))
model.add(Conv1D(100, 3, activation='sigmoid'))
model.add(Conv1D(100, 3, activation='sigmoid'))
# model.add(Dropout(0.70))
model.add(Conv1D(48, 3, activation='sigmoid'))
model.add(Flatten())
model.add(Dense(25, activation='sigmoid', name = 'Output'))
model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'],
              )

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 50)           2259550   
_________________________________________________________________
conv1d (Conv1D)              (None, 198, 64)           9664      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 196, 100)          19300     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 194, 100)          30100     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 192, 48)           14448     
_________________________________________________________________
flatten_3 (Flatten)          (None, 9216)              0         
_________________________________________________________________
Output (Dense)               (None, 25)               

In [47]:
model.fit(X_trn, trn[TARGET_COLS], validation_data=(X_val, val[TARGET_COLS]), verbose=True, epochs=32, batch_size=256,
          callbacks = [tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x1b05c891cf8>

In [51]:
val_preds1 = model.predict(X_val)
best_thresholds = get_best_thresholds(val[TARGET_COLS].values, val_preds1)
for i, thresh in enumerate(best_thresholds):
  val_preds1[:, i] = (val_preds[:, i] > thresh) * 1
f1_score(val[TARGET_COLS], val_preds, average='micro')

0.10168618520928216

In [52]:
score(val[target_col], val_preds1)

f1: 0.10168618520928216
Precision: 0.05356658336308461
Recall: 1.0


In [63]:
pred=(10*val_preds+val_preds1)/11

In [64]:
for i, thresh in enumerate(best_thresholds):
  pred[:, i] = (pred[:, i] > thresh) * 1

In [65]:
score(val[target_col], pred)

f1: 0.34179559816953586
Precision: 0.21478945566586785
Recall: 0.8363103172487337
