In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import re

In [None]:
# Record the library versions this notebook was executed with.
for label, module in (('pandas: ', pd), ('matplotlib: ', matplotlib)):
    print(label, module.__version__)

pandas:  1.0.5
matplotlib:  3.2.2


In [None]:
# Load the labelled training data from the local data directory and preview
# the first rows (head() is the cell's displayed value).
df_train = pd.read_csv('./data/train.csv')
df_train.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [None]:
def clean_text(text):
    """Normalise raw English text into lowercase, space-separated word tokens.

    The text is lowercased, a handful of apostrophe contractions are expanded,
    every non-word character is replaced by a space, whitespace runs are
    collapsed to a single space and leading/trailing spaces are trimmed.

    Args:
        text: The string to clean. ``None`` and float ``NaN`` (how pandas
            represents a missing cell) are treated as empty text.

    Returns:
        The cleaned string; ``''`` when the input is missing or empty.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower()

    # Order matters: specific rules must run before the generic ones that would
    # otherwise consume part of their match ("what's" and "'scuse" before "'s",
    # "can't" before "n't").
    contractions = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Raw strings keep \W and \s as regex escapes instead of (invalid) Python
    # string escapes.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [None]:
# clean_text takes a single string, so it can be handed to apply() directly.
df_train['TITLE_PROCESSED'] = df_train['TITLE'].apply(clean_text)

In [None]:
# Same cleaning as for the titles, applied to every abstract.
df_train['ABSTRACT_PROCESSED'] = df_train['ABSTRACT'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

# Name the label columns explicitly instead of slicing df_train.columns by
# position: a positional slice silently selects the wrong columns as soon as a
# derived column is added, removed or reordered by another cell.
# pd.Index(...).values keeps `categories` a NumPy array of column names.
categories = pd.Index([
    'Computer Science', 'Physics', 'Mathematics',
    'Statistics', 'Quantitative Biology', 'Quantitative Finance',
]).values

# Hold out a third of the labelled rows for evaluation; the fixed seed makes
# the split reproducible across runs.
train, test = train_test_split(df_train, random_state=42, test_size=0.33, shuffle=True)

# The model only sees the two cleaned text columns.
X_train = train[['TITLE_PROCESSED', 'ABSTRACT_PROCESSED']]
X_test = test[['TITLE_PROCESSED', 'ABSTRACT_PROCESSED']]
print(X_train.shape)
print(X_test.shape)

(14051, 2)
(6921, 2)


In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; a later cell reads it through
# nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
# NLTK's English stop words as a set; passed as `stop_words` to both
# TfidfVectorizer instances when the pipeline is built.
stop_words = set(stopwords.words('english'))

from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.compose import make_column_transformer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

def _select_column(frame, column):
    """Return column `column` of `frame`.

    A named function is used instead of a lambda because lambdas cannot be
    pickled, which would make the fitted pipeline impossible to persist.
    """
    return frame[column]


def _field_tfidf(column):
    """Build the 'pick one text column -> TF-IDF' sub-pipeline for `column`."""
    return Pipeline([
        ('extract_field',
         FunctionTransformer(_select_column,
                             kw_args={'column': column},
                             validate=False)),
        # scikit-learn documents `stop_words` as a list, and releases that
        # validate constructor parameters reject a set; sorting also makes the
        # estimator repr deterministic.
        ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ])


# Title and abstract each get their own TF-IDF vocabulary; FeatureUnion
# concatenates the two feature matrices side by side.
preprocessing = FeatureUnion([
    ('title_tfidf', _field_tfidf('TITLE_PROCESSED')),
    ('abstract_tfidf', _field_tfidf('ABSTRACT_PROCESSED')),
])

# NOTE: despite the NB_ prefix the classifier is one-vs-rest logistic
# regression (one binary model per label column). The name is kept because
# other cells refer to it.
NB_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])

NB_pipeline.fit(X_train, train[categories])

Pipeline(steps=[('preprocessing',
                 FeatureUnion(transformer_list=[('title_tfidf',
                                                 Pipeline(steps=[('extract_field',
                                                                  FunctionTransformer(func=<function <lambda> at 0x7f712485e048>)),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(stop_words={'a',
                                                                                              'about',
                                                                                              'above',
                                                                                              'after',
                                                                                              'again',
                                                                                              'agains

In [None]:
# Evaluate on the held-out split: micro-averaged F1 over all label columns.
prediction = NB_pipeline.predict(X_test)
f1_score(y_true=test[categories], y_pred=prediction, average='micro')

0.8016939790093905

In [None]:
# Use every labelled row (no hold-out) for the final model that produces the
# predictions: inputs are the two cleaned text columns, targets the label columns.
X = df_train.loc[:, ['TITLE_PROCESSED', 'ABSTRACT_PROCESSED']]
y = df_train.loc[:, categories]

In [None]:
# Refit the same pipeline object on the full labelled data; this replaces the
# fit obtained from the train split.
NB_pipeline.fit(X, y)

Pipeline(steps=[('preprocessing',
                 FeatureUnion(transformer_list=[('title_tfidf',
                                                 Pipeline(steps=[('extract_field',
                                                                  FunctionTransformer(func=<function <lambda> at 0x7f712485e048>)),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(stop_words={'a',
                                                                                              'about',
                                                                                              'above',
                                                                                              'after',
                                                                                              'again',
                                                                                              'agains

In [None]:
# Load the unlabelled test set and derive the same cleaned text columns that
# the training frame has.
df_test = pd.read_csv('./data/test.csv').assign(
    TITLE_PROCESSED=lambda frame: frame['TITLE'].apply(clean_text),
    ABSTRACT_PROCESSED=lambda frame: frame['ABSTRACT'].apply(clean_text),
)
df_test.head()

Unnamed: 0,ID,TITLE,ABSTRACT,TITLE_PROCESSED,ABSTRACT_PROCESSED
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...,closed form marginal likelihood in gamma poiss...,we present novel understandings of the gamma p...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...,laboratory mid ir spectra of equilibrated and ...,meteorites contain minerals from solar system ...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...,case for static amsdu aggregation in wlans,frame aggregation is a mechanism by which mult...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...,the gaia eso survey the inner disk intermediat...,milky way open clusters are very diverse in te...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...,witness functions versus interpretation functi...,proving that a cryptographic protocol is corre...


In [None]:
# Predict labels for the test rows from the same two cleaned text columns the
# pipeline was fitted on.
predictions = NB_pipeline.predict(
    df_test.loc[:, ['TITLE_PROCESSED', 'ABSTRACT_PROCESSED']]
)

In [None]:
# Display the array returned by predict() for a quick visual check.
predictions

array([[0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [None]:
# Label the prediction columns with `categories`, the same column selection
# the pipeline was fitted against, so the header cannot drift out of step with
# the order of the model's outputs. Rows are keyed by the test set's ID column.
df_predictions = pd.DataFrame(
    predictions,
    index=df_test['ID'],
    columns=categories,
)
df_predictions.head()

Unnamed: 0_level_0,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20973,0,0,0,1,0,0
20974,0,1,0,0,0,0
20975,1,0,0,0,0,0
20976,0,1,0,0,0,0
20977,1,0,0,0,0,0


In [None]:
# Write the predictions to CSV; index_label='ID' is the header used for the
# index column (the index was built from df_test.ID).
df_predictions.to_csv('predictions_v1.csv', index_label='ID')