In [1]:
import re

import nltk
import numpy as np
import pandas as pd
import unicodedata2
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import hstack as sparse_hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB


# Read the training and test tables from CSV files in the current working
# directory (training file first, then the test file).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Extra stop words removed in addition to NLTK's English list.
STOP=['and','have','been','is','are','a','an','the','or','has','many']
def basic_clean(text):
    """Normalize a piece of text and return its lemmatized tokens.

    Steps, in order:
      1. NFKD-normalize, then drop every non-ASCII character (accents are
         stripped, e.g. "é" becomes "e") and lowercase the result.
      2. Delete every character that is neither a word character nor
         whitespace (``\\w`` keeps letters, digits and underscores).
      3. Split on whitespace, discard stop words (NLTK English list plus
         ``STOP``) and lemmatize the remaining tokens with WordNet.

    Args:
        text: The string to clean.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; the token filter below runs
    # once per word, so scanning a list there is needlessly slow.
    stop_set = set(nltk.corpus.stopwords.words('english')).union(STOP)
    ascii_text = (unicodedata2.normalize('NFKD', text)
                  .encode('ascii', 'ignore')
                  .decode('utf-8', 'ignore')
                  .lower())
    words = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [wnl.lemmatize(word) for word in words if word not in stop_set]


def get_best_thresholds(true, preds):
    """Pick, for every label column, the cut-off that maximizes F1.

    Each column of ``preds`` is binarized with ``score > threshold`` for the
    thresholds 0.00, 0.01, ..., 0.99 and compared against the matching column
    of ``true`` using ``f1_score``.

    Args:
        true: 2-D array indexed as ``true[:, idx]``; the reference labels.
        preds: 2-D array of the same layout holding the predicted scores.

    Returns:
        list[float]: One threshold per column of ``preds``. When several
        thresholds tie for the best F1, the smallest one is returned
        (``np.argmax`` reports the first maximum).
    """
    thresholds = [i / 100 for i in range(100)]
    best_thresholds = []
    # Take the number of labels from the input instead of assuming 25 columns.
    for idx in range(preds.shape[1]):
        # Slice each column once rather than once per candidate threshold.
        label_true = true[:, idx]
        label_scores = preds[:, idx]
        f1_scores = [f1_score(label_true, (label_scores > thresh) * 1)
                     for thresh in thresholds]
        best_thresholds.append(thresholds[np.argmax(f1_scores)])

    return best_thresholds

In [2]:
# Column names grouped as prediction targets (25 entries).
# NOTE(review): presumably these are label columns of train.csv - confirm.
target_col = [
    'Analysis of PDEs',
    'Applications',
    'Artificial Intelligence',
    'Astrophysics of Galaxies',
    'Computation and Language',
    'Computer Vision and Pattern Recognition',
    'Cosmology and Nongalactic Astrophysics',
    'Data Structures and Algorithms',
    'Differential Geometry',
    'Earth and Planetary Astrophysics',
    'Fluid Dynamics',
    'Information Theory',
    'Instrumentation and Methods for Astrophysics',
    'Machine Learning',
    'Materials Science',
    'Methodology',
    'Number Theory',
    'Optimization and Control',
    'Representation Theory',
    'Robotics',
    'Social and Information Networks',
    'Statistics Theory',
    'Strongly Correlated Electrons',
    'Superconductivity',
    'Systems and Control',
]

# Column names of the four broad topics; used later as extra features.
topic_col = [
    'Computer Science',
    'Mathematics',
    'Physics',
    'Statistics',
]

In [3]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
# Explicit copies: later cells assign to columns of trn/val, and pandas may
# otherwise treat them as slices of `train` (SettingWithCopyWarning).
trn, val = (part.copy() for part in train_test_split(train, test_size=0.2, random_state=2))

In [4]:
# Cleaned token lists over the whole training and test corpora.
# The abstracts are joined with spaces as raw strings. Passing str(list)
# instead would hand the list's repr to the cleaner, where escaped sequences
# such as "\n" become a literal backslash + "n" and end up fused into the
# neighbouring words once punctuation is stripped.
words_tr = basic_clean(' '.join(train['ABSTRACT'].astype(str)))
words_tst = basic_clean(' '.join(test['ABSTRACT'].astype(str)))

In [5]:
# Replace each validation abstract with its cleaned token list.
# Not idempotent: running this cell twice would pass lists to basic_clean.
val['ABSTRACT'] = val['ABSTRACT'].map(basic_clean)

In [6]:
# Inspect the cleaned validation abstracts (each entry is now a token list).
val['ABSTRACT']

10530    [prove, existence, hermitianeinstein, metric, ...
3174     [novel, predictor, considering, traffic, flow,...
3240     [inside, paper, introduce, kumaraswamy, autore...
8368     [recent, financial, upheaval, cast, doubt, ade...
11359    [analyze, generalized, dirac, system, dispersi...
                               ...                        
2471     [surge, inside, availability, genomic, data, h...
7509     [new, type, endtoend, system, considering, tex...
3090     [central, question, inside, science, science, ...
6145     [paper, study, empirical, risk, minimization, ...
7614     [oneopposition, nearearth, asteroid, neas, gro...
Name: ABSTRACT, Length: 2801, dtype: object

In [13]:
def twog(txt):
    """Return the first element of every consecutive pair in ``txt``.

    ``nltk.ngrams(txt, 2)`` yields the pairs (t0, t1), (t1, t2), ...; only the
    first member of each pair is kept, so the result is ``txt`` without its
    final item. Inputs with fewer than two items give an empty list.

    NOTE(review): despite the name, no bigrams are returned and the last token
    is dropped - confirm this is the intended behavior.

    Args:
        txt: Sequence of tokens.

    Returns:
        list: The leading element of each 2-gram, in order.
    """
    # Iterate the n-gram generator directly; wrapping it in a pandas Series
    # and indexing it row by row added overhead without changing the result.
    return [first for first, _second in nltk.ngrams(txt, 2)]

In [14]:
# Try twog on the first validation row. Use positional access: val keeps the
# shuffled index labels of `train`, so the label 0 is not guaranteed to exist
# and val['ABSTRACT'][0] can raise KeyError.
temp = twog(val['ABSTRACT'].iloc[0])

In [17]:
# Apply twog to every validation token list (overwrites the column in place,
# so re-running this cell shortens each list again).
val['ABSTRACT'] = val['ABSTRACT'].map(twog)

10530    [prove, existence, hermitianeinstein, metric, ...
3174     [novel, predictor, considering, traffic, flow,...
3240     [inside, paper, introduce, kumaraswamy, autore...
8368     [recent, financial, upheaval, cast, doubt, ade...
11359    [analyze, generalized, dirac, system, dispersi...
Name: ABSTRACT, dtype: object

In [None]:
# Same two-step preparation as for the validation split: clean each training
# abstract into tokens, then pass the token list through twog.
trn['ABSTRACT'] = trn['ABSTRACT'].map(basic_clean).map(twog)

In [None]:
# Build the model inputs: TF-IDF weights of the abstracts followed by the
# topic indicator columns.

def _as_text(tokens):
    """Join a token list into one space-separated string; pass strings through."""
    return tokens if isinstance(tokens, str) else ' '.join(tokens)

# TfidfVectorizer's default analyzer expects every document to be a string,
# but the ABSTRACT columns of trn/val hold token lists by this point, so they
# are joined back into text first.
trn_docs = trn['ABSTRACT'].map(_as_text)
val_docs = val['ABSTRACT'].map(_as_text)

# Fit on the training split only, using the same preprocessing as the
# documents that are transformed below. Fitting on the raw `train` column
# would learn a vocabulary from uncleaned text and would also include the
# validation rows.
vec3 = TfidfVectorizer(max_features=10000)
_ = vec3.fit(trn_docs)

trn_abs = vec3.transform(trn_docs)
val_abs = vec3.transform(val_docs)

# Stack sparsely and keep the floating-point dtype: TF-IDF weights are
# fractional, so casting the matrix to int16 would truncate them to 0, and
# densifying 10000 columns is unnecessary.
trn3 = sparse_hstack([trn_abs, csr_matrix(trn[topic_col].to_numpy())], format='csr')
val3 = sparse_hstack([val_abs, csr_matrix(val[topic_col].to_numpy())], format='csr')

# NOTE: the test set must go through the same cleaning and joining steps
# before vec3.transform can be applied to it.