In [1]:
import os
import json
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import LabelEncoder
import time
import nltk.classify
from sklearn.svm import LinearSVC
import re
import csv
import pandas as pd
import numpy as np
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from collections import defaultdict
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords as stop_words

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shara\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\shara\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
np.random.seed(500)

In [3]:
training_folder = 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles/'

In [4]:
test_folder = 'C:/Users/shara/Documents/SentenceCorpus/unlabeled_articles/'

In [5]:
def list_files_from_directory(directory):
    """Lists all file paths from given directory"""

    ret_val = []
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            ret_val.append(str(directory) + "/" + str(file))
    return ret_val

In [6]:
train_file_paths = list_files_from_directory(training_folder)

In [7]:
print(train_file_paths[0:10])

['C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate10_7_1.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate10_7_2.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate10_7_3.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate1_13_1.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate1_13_2.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate1_13_3.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate2_66_1.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate2_66_2.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate2_66_3.txt', 'C:/Users/shara/Documents/SentenceCorpus/labeled_articles//arxiv_annotate3_80_1.txt']


In [8]:
def read_file(path):
    """Reads all lines from file on given path"""

    f = open(path, "r", errors='ignore')
    read = f.readlines()
    ret_val = []
    for line in read:
        if line.startswith("#"):
            pass
        else:
            ret_val.append(line)
    return ret_val

In [9]:
read_file('C:/Users/shara/Documents/SentenceCorpus/labeled_articles/arxiv_annotate1_13_1.txt')

['MISC\talthough the internet as level topology has been extensively studied over the past few years  little is known about the details of the as taxonomy\n',
 'MISC\tan as  node  can represent a wide variety of organizations  e g   large isp  or small private business  university  with vastly different network characteristics  external connectivity patterns  network growth tendencies  and other properties that we can hardly neglect while working on veracious internet representations in simulation environments\n',
 'AIMX\tin this paper  we introduce a radically new approach based on machine learning techniques to map all the ases in the internet into a natural as taxonomy\n',
 'OWNX\twe successfully classify  NUMBER   NUMBER  percent  of ases with expected accuracy of  NUMBER   NUMBER  percent \n',
 'OWNX\twe release to the community the as level topology dataset augmented with   NUMBER   the as taxonomy information and  NUMBER   the set of as attributes we used to classify ases\n',
 '

In [10]:
read_file('C:/Users/shara/Documents/SentenceCorpus/unlabeled_articles/arxiv_unlabeled/1.txt')

['Fitness functions based on test cases are very common in Genetic Programming (GP)\n',
 'This process can be assimilated to a learning task, with the inference of models from a limited number of samples\n',
 'This paper is an investigation on two methods to improve generalization in GP-based learning: 1) the selection of the best-of-run individuals using a three data sets methodology, and 2) the application of parsimony pressure in order to reduce the complexity of the solutions\n',
 'Results using GP in a binary classification setup show that while the accuracy on the test sets is preserved, with less variances compared to baseline results, the mean tree size obtained with the tested methods is significantly reduced\n',
 'GP is particularly suited for problems that can be assimilated to learning tasks, with the minimization of the error between the obtained and desired outputs for a limited number of test cases -- the training data, using a ML terminology\n',
 'Indeed, the classical 

In [11]:
# loading stopwords
input_stopwords = read_file("C:/Users/shara/Documents/SentenceCorpus/word_lists/stopwords.txt")
stopwords = []
for word in input_stopwords:
    if word.endswith('\n'):
        word = word[:-1]
        stopwords.append(word)

In [12]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    text = " ".join(word for word in tokens if word not in stopwords)
    return text

In [13]:
def process_line(line):
    """Returns sentence category and sentence in given line"""

    if "\t" in line:
        splits = line.split("\t")
        s_category = splits[0]
        sentence = splits[1].lower()
        sentence = remove_stopwords(sentence)
        pattern = re.compile("[^\w']")
        sentence = pattern.sub(' ', sentence)
        sentence = re.sub(' +', ' ', sentence)
        return s_category, sentence
    else:
        splits = line.split(" ")
        s_category = splits[0]
        sentence = line[len(s_category)+1:].lower()
        sentence = remove_stopwords(sentence)
        pattern = re.compile("[^\w']")
        sentence = pattern.sub(' ', sentence)
        sentence = re.sub(' +', ' ', sentence)
        return s_category, sentence

In [14]:
def remove_stopwords(text):
    tokens = word_tokenize(text)
    text = " ".join(word for word in tokens if word not in stopwords)
    return text

In [15]:
def test_files_from_directory(directory):
    """Lists all file paths from given directory"""

    ret_file = []
    for file in os.listdir(directory):
        if file.endswith(".txt"):
            ret_file.append(file)
    return ret_file

In [16]:
test_folder = 'C:/Users/shara/Documents/SentenceCorpus/unlabeled_articles/'

def test_data(folder):
    text_data = []
    ret_val = []
    files_dir = []
    for file in os.listdir(test_folder):
        ret_val.append(file)

    for folder in ret_val:
        directory = str(test_folder) + str(folder) + '/'
        try:
            file_list = test_files_from_directory(str(directory))
            for file in file_list:
                file_path = str(directory) + str(file)
                files_dir.append(file_path)
        except:
            pass
    for path in files_dir:
        lines = read_file(path)
        for text in lines:
            text = text.lower()
            text = remove_stopwords(text)
            pattern = re.compile("[^\w']")
            text = pattern.sub(' ', text)
            text = re.sub(' +', ' ', text)
            text_data.append(text)
    text_data = pd.DataFrame(text_data, columns=['text'])
    return text_data

In [17]:
test_df = test_data(test_folder)

In [18]:
test_df.head()

Unnamed: 0,text
0,fitness functions based test cases very common...
1,this process assimilated learning task inferen...
2,this paper investigation two methods improve g...
3,results using gp binary classification setup s...
4,gp particularly suited problems assimilated le...


In [19]:
def create_df(input_folder):
    """Writes training data from given folder into formatted CSV file"""

    tr_folder = list_files_from_directory(input_folder)
    #     all_json = ''
    training_dict = {}
    label = []
    text = []
    for file in tr_folder:
        lines = read_file(file)
        for line in lines:
            c, s = process_line(line)
            if s.endswith('\n'):
                s = s[:-1]
            label.append(c)
            text.append(s)
        training_dict['text'] = text
        training_dict['label'] = label
    df = pd.DataFrame(training_dict)
    return df

In [20]:
training_df = create_df('C:/Users/shara/Documents/SentenceCorpus/labeled_articles')

In [21]:
training_df.head()

Unnamed: 0,text,label
0,minimum description length principle online se...,MISC
1,if underlying model class discrete then total ...,MISC
2,mdl general one only loss bounds finite but ex...,MISC
3,we show this even case if model class contains...,AIMX
4,we derive new upper bound prediction error cou...,OWNX


In [22]:
training_df['label'].value_counts()

MISC                   1807
OWNX                    861
AIMX                    190
CONT                    168
BASE                     61
MISC--the                 6
MISC--in                  4
AIMX--on                  4
OWNX--after               2
OWNX                      2
MISC--for                 2
CONT--these               2
MISC--on                  2
OWNX--we                  2
MISC--several             2
MISC--specifically,       2
Name: label, dtype: int64

## Approach 1

In [23]:
copy_1 = training_df.copy()

In [24]:
copy_1.head()

Unnamed: 0,text,label
0,minimum description length principle online se...,MISC
1,if underlying model class discrete then total ...,MISC
2,mdl general one only loss bounds finite but ex...,MISC
3,we show this even case if model class contains...,AIMX
4,we derive new upper bound prediction error cou...,OWNX


In [25]:
copy_1['text'] = [str(word_tokenize(entry)) for entry in copy_1['text']]

In [26]:
print(copy_1['text'].head())

0    ['minimum', 'description', 'length', 'principl...
1    ['if', 'underlying', 'model', 'class', 'discre...
2    ['mdl', 'general', 'one', 'only', 'loss', 'bou...
3    ['we', 'show', 'this', 'even', 'case', 'if', '...
4    ['we', 'derive', 'new', 'upper', 'bound', 'pre...
Name: text, dtype: object


In [27]:
Train_X_1, Test_X_1, Train_Y_1, Test_Y_1 = model_selection.train_test_split(copy_1['text'],copy_1['label'],test_size=0.3)

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [29]:
# To make the vectorizer => transformer => classifier easier to work with, 
# we will use Pipeline class in Scilkit-Learn that behaves like a compound classifier

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB())])

In [30]:
nb.fit(Train_X_1, Train_Y_1)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [31]:
from sklearn.metrics import classification_report
y_pred = nb.predict(Test_X_1)

In [32]:
print('accuracy %s' % accuracy_score(y_pred, Test_Y_1))

accuracy 0.7628205128205128


## Approach 2

In [33]:
from sklearn.linear_model import SGDClassifier
import logging

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None))])

In [34]:
sgd.fit(Train_X_1, Train_Y_1)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [35]:
y_pred = sgd.predict(Test_X_1)

In [36]:
print('accuracy %s' % accuracy_score(y_pred, Test_Y_1))

accuracy 0.8087606837606838
