# Import Data

In [21]:
import json
import ast
import sys
import os
import pandas as pd
import nltk
import re

if not os.getcwd().endswith('UROP_mat2vec'):
    print('Please ensure that you are in the proper working directory')

from utils.process import MaterialsTextProcessor
text_processor = MaterialsTextProcessor()

with open('data/relevant_articles_material_science.json') as json_file:
    relevant = json.load(json_file)

relevant_articles = ast.literal_eval(relevant)
relevant_df = pd.DataFrame({'abstracts' : relevant_articles})

with open('data/irrelevant_articles.json') as json_file:
    irrelevant = json.load(json_file)
    
irrelevant_articles = [i for i in irrelevant if i != '']

for fname in os.listdir('data/'):
    if 'irrelevant_articles_' in fname:
        with open('data/' + fname) as json_file:
            irrelevant_articles += ast.literal_eval(json.load(json_file))
            
irrelevant_df = pd.DataFrame({'abstracts' : irrelevant_articles})

# Process Abstracts

1. Remove foreign language
2. Remove common words like "Abstracts"


In [22]:
words = set(nltk.corpus.words.words())

def language_checker(string):
    remove_foreign = " ".join(w for w in nltk.wordpunct_tokenize(string) if w.lower() in words or not w.isalpha())
    # If more than 30% of words are cut, remove article, else keep
    if len(remove_foreign)/len(string) <= 0.5:

        return ""
    else:
        return string
    
def remove_weblinks(string):
    return re.sub('https?://[A-Za-z0-9./]+','', string)
    
relevant_df = relevant_df[relevant_df['abstracts'].apply(language_checker) != ""]
irrelevant_df = irrelevant_df[irrelevant_df['abstracts'].apply(language_checker) != ""]
relevant_df = relevant_df[relevant_df['abstracts'].apply(remove_weblinks) != ""]
irrelevant_df = irrelevant_df[irrelevant_df['abstracts'].apply(remove_weblinks) != ""]
irrelevant_df['abstracts'] = irrelevant_df['abstracts'].apply(lambda x: x.replace("Background", ""))
irrelevant_df['abstracts'] = irrelevant_df['abstracts'].apply(lambda x: x.replace("Abstract", ""))
relevant_df['abstracts'] = relevant_df['abstracts'].apply(lambda x: x.replace("Background", ""))
relevant_df['abstracts'] = relevant_df['abstracts'].apply(lambda x: x.replace("Abstract", ""))
relevant_df['label'] = 1
irrelevant_df['label'] = 0
df = pd.concat([relevant_df, irrelevant_df])

# Train Test split

In [23]:
from sklearn.model_selection import train_test_split

x = df.abstracts
y = df.label

SEED = 2000
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=SEED)

In [24]:
print(y_train[y_train == 1].shape)
print(y_train[y_train == 0].shape)

print(y_test[y_test == 1].shape)
print(y_test[y_test == 0].shape)

(714,)
(3206,)
(304,)
(1376,)


# Pipeline: TF-IDF -> Classifier

## Trying out different classifiers to get the best results

In [25]:
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from time import time
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from xgboost import XGBClassifier

names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection", "Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Nearest Centroid", "XGBoost"]
classifiers = [LogisticRegression(),
    LinearSVC(),
    Pipeline([('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))), ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    NearestCentroid(), 
    XGBClassifier(random_state=1,learning_rate=0.01)]

zipped_clf = zip(names,classifiers)

tvec = TfidfVectorizer()
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    if len(x_test[y_test == 0]) / (len(x_test)*1.) > 0.5:
        null_accuracy = len(x_test[y_test == 0]) / (len(x_test)*1.)
    else:
        null_accuracy = 1. - (len(x_test[y_test == 0]) / (len(x_test)*1.))
    t0 = time()
    model = pipeline.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    train_test_time = time() - t0
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels = np.unique(y_pred))
    return accuracy, f1, train_test_time

def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n,c in classifier:
        checker_pipeline = Pipeline([('vectorizer', vectorizer),
                                     ('classifier', c)])
        clf_accuracy, f1_score, tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_test, y_test)
        result.append((n, clf_accuracy, f1_score, tt_time))
    return result

trigram_result = classifier_comparator(n_features=20000 ,ngram_range=(1,3))

trigram_result

  if diff:


[('Logistic Regression',
  0.9851190476190477,
  0.9572649572649574,
  3.97100567817688),
 ('Linear SVC', 0.9952380952380953, 0.9868421052631579, 4.33204197883606),
 ('LinearSVC with L1-based feature selection',
  0.9922619047619048,
  0.9787928221859706,
  4.464842796325684),
 ('Multinomial NB', 0.8380952380952381, 0.6902050113895215, 5.152879476547241),
 ('Bernoulli NB', 0.6255952380952381, 0.49151172190784154, 4.261931896209717),
 ('Ridge Classifier',
  0.993452380952381,
  0.9818780889621087,
  3.892655849456787),
 ('AdaBoost', 0.9875, 0.9657422512234911, 7.192623615264893),
 ('Nearest Centroid',
  0.9607142857142857,
  0.9011976047904192,
  3.5465147495269775),
 ('XGBoost', 0.9827380952380952, 0.9528455284552845, 15.359398126602173)]

# Pipeline: TF-IDF -> Logistic Regression

In [26]:
pipeline = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression())])
model = pipeline.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
f1

0.9486301369863014