In [1]:
from sklearn.model_selection import train_test_split
import re
import unicodedata2
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB


def get_best_thresholds(true, preds):
    thresholds = [i/100 for i in range(100)]
    best_thresholds = []
    for idx in range(25):
        f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1)for thresh in thresholds]
        best_thresh = thresholds[np.argmax(f1_scores)]
        best_thresholds.append(best_thresh)

    return best_thresholds


def clean_text(text):
    text = re.sub("\'", "", text)
    text = re.sub("[^a-zA-Z]", " ", text)
    text = ' '.join(text.split())
    text = text.lower()

    return text


def score(target, predict):
    print("f1:", f1_score(target, predict, average='micro'))
    print("Precision:", precision_score(target, predict, average='micro'))
    print("Recall:", recall_score(target, predict, average='micro'))


stop_words = nltk.corpus.stopwords.words('english')+['n', 'k', 'x', 'c', 'r', 'h', 'g', 'p', 'inside', 'considering']

def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

TOPIC_COLS = ['Computer Science', 'Mathematics', 'Physics', 'Statistics']

TARGET_COLS = ['Analysis of PDEs', 'Applications',
               'Artificial Intelligence', 'Astrophysics of Galaxies',
               'Computation and Language', 'Computer Vision and Pattern Recognition',
               'Cosmology and Nongalactic Astrophysics',
               'Data Structures and Algorithms', 'Differential Geometry',
               'Earth and Planetary Astrophysics', 'Fluid Dynamics',
               'Information Theory', 'Instrumentation and Methods for Astrophysics',
               'Machine Learning', 'Materials Science', 'Methodology', 'Number Theory',
               'Optimization and Control', 'Representation Theory', 'Robotics',
               'Social and Information Networks', 'Statistics Theory',
               'Strongly Correlated Electrons', 'Superconductivity',
               'Systems and Control']

In [3]:
train['ABSTRACT'] = train['ABSTRACT'].apply(lambda x: clean_text(x))
train['ABSTRACT'] = train['ABSTRACT'].apply(lambda x: remove_stopwords(x))

test['ABSTRACT'] = test['ABSTRACT'].apply(lambda x: clean_text(x))
test['ABSTRACT'] = test['ABSTRACT'].apply(lambda x: remove_stopwords(x))

In [9]:
def make_model(vec,clf):
    #vec = CountVectorizer(max_features=40000, ngram_range=(1, 2))
    combined = list(train['ABSTRACT']) + list(test['ABSTRACT'])
    vec.fit(combined)

    trn, val = train_test_split(train, test_size=0.2, random_state=2)

    trn_abs = vec.transform(trn['ABSTRACT'])
    val_abs = vec.transform(val['ABSTRACT'])
    tst_abs = vec.transform(test['ABSTRACT'])

    trn2 = np.hstack((trn_abs.toarray(), trn[TOPIC_COLS]))
    val2 = np.hstack((val_abs.toarray(), val[TOPIC_COLS]))
    tst2 = np.hstack((tst_abs.toarray(), test[TOPIC_COLS]))

    trn2 = csr_matrix(trn2.astype('int16'))
    val2 = csr_matrix(val2.astype('int16'))
    tst2 = csr_matrix(tst2.astype('int16'))

    #clf = OneVsRestClassifier(LogisticRegression(C=1, n_jobs=-1))
    clf.fit(trn2, trn[TARGET_COLS])

    val_preds = clf.predict_proba(val2)

    best_thresholds = get_best_thresholds(val[TARGET_COLS].values, val_preds)

    for i, thresh in enumerate(best_thresholds):
        val_preds[:, i] = (val_preds[:, i] > thresh) * 1
        
    print(score(val[TARGET_COLS], val_preds))

    preds_test = clf.predict_proba(tst2)

    for i, thresh in enumerate(best_thresholds):
        preds_test[:, i] = (preds_test[:, i] > thresh) * 1
        
    return preds_test

In [5]:
preds_test1 = make_model(CountVectorizer(max_features=40000, ngram_range=(1, 2),OneVsRestClassifier(LogisticRegression(C=1, n_jobs=-1))))

f1: 0.7451329685710649
Precision: 0.7127555988315482
Recall: 0.7805918421754199
None


In [6]:
preds_test2=make_model(CountVectorizer(max_features=50000, ngram_range=(1, 3),OneVsRestClassifier(LogisticRegression(C=1, n_jobs=-1))))

f1: 0.7458605117912693
Precision: 0.7043354655294953
Recall: 0.7925886430285257
None


In [10]:
from sklearn.naive_bayes import GaussianNB

preds_test3=make_model(CountVectorizer(max_features=50000, ngram_range=(1, 3)),OneVsRestClassifier(GaussianNB()))

TypeError: len() of unsized object