In [1]:
import argparse
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import multiprocessing
import pickle

In [27]:
class WordsFeature:
    """
    Computes per-token features for sentence boundary detection.

    Tokens are read from the column labelled ``1`` of ``df`` and labels from
    the column labelled ``2``. Words are turned into integer ids through
    ``label_encoder`` (an object exposing ``fit`` and ``transform``).

    The encoder is NOT fitted by the constructor; call ``train_le()`` (or pass
    an already fitted encoder) before requesting encoded features.
    """

    def __init__(self, df, label_encoder):
        super().__init__()
        self.df = df
        self.label_encoder = label_encoder

    def get_next_word(self, index, orignal=False):
        """
        Return the token that follows position ``index``.

        params:
            index: int (positional row index)
            orignal: bool, when True return the raw token instead of its
                encoded id
        returns:
            the raw token (str) if ``orignal`` is True, otherwise its encoded
            id. "<END>" (raw or encoded) is used when ``index + 1`` is past the
            last row, and -1 is returned when the encoder rejects the token
            with a ValueError (unseen word).
        """
        try:
            word = self.df.iloc[index + 1][1]
            if orignal:
                return word
            return self.label_encoder.transform([word])[0]
        except IndexError:
            # No row after `index`: we ran off the end of the data.
            if orignal:
                return "<END>"
            return self.label_encoder.transform(["<END>"])[0]
        except ValueError:
            # Returning -1 for unseen words
            return -1

    def get_prev_word(self, index, orignal=False):
        """
        Return the token at position ``index`` with its trailing '.' removed.

        params:
            index: int (positional row index)
            orignal: bool, when True return raw text instead of an encoded id
        returns:
            - token ends with '.': the token without its last character
              (raw if ``orignal``, otherwise encoded).
            - token does not end with '.': the token itself if ``orignal``,
              otherwise the encoded "<NAP>" (not-a-period) marker.
            - IndexError while reading the token (e.g. ``index`` out of
              range): "<START>" (raw or encoded).
            - encoder raises ValueError (unseen word): -1.
        """
        try:
            word = self.df.iloc[index][1]
            if word[-1] == ".":
                stem = word[:-1]
                if orignal:
                    return stem
                return self.label_encoder.transform([stem])[0]

            # NOT A PERIOD: encode as the <NAP> token rather than the word
            # itself; the raw form is still returned untouched when asked for.
            if orignal:
                return word
            return self.label_encoder.transform(["<NAP>"])[0]
        except ValueError:
            # Returning -1 for unseen words
            return -1
        except IndexError:
            if orignal:
                return "<START>"
            return self.label_encoder.transform(["<START>"])[0]

    def lt_3(self, index):
        """
        Tell whether the word before '.' at ``index`` is shorter than 3 characters.

        params:
            index: int
        returns:
            is_short: bool
        """
        word = self.get_prev_word(index, orignal=True)
        return len(word) < 3

    def is_cap_word(self, word):
        """
        Tell whether ``word`` starts with an upper-case character.

        params:
            word: str
        returns:
            is_capital: bool (False for an empty string or a value that
            cannot be indexed / has no ``isupper``)
        """
        try:
            return word[0].isupper()
        except (LookupError, TypeError, AttributeError):
            # Empty string, or a non-string value such as None / NaN.
            return False

    def actual_len(self, index):
        """
        Returns the actual length of the previous word (the token at
        ``index`` without its trailing '.').
        """
        word = self.get_prev_word(index, orignal=True)
        return len(word)

    def gt_x(self, index, x=4):
        """
        Tell whether the word before '.' at ``index`` is longer than ``x`` characters.

        params:
            index: int
            x: int (length threshold, default 4)
        returns:
            is_long: bool
        """
        word = self.get_prev_word(index, orignal=True)
        return len(word) > x

    def get_average_len(self, index):
        """
        Returns the average length of the previous word and next word
        """
        prev_word = self.get_prev_word(index, orignal=True)
        next_word = self.get_next_word(index, orignal=True)
        return (len(prev_word) + len(next_word)) / 2

    def get_avg_len(self, index, window=4):
        """
        Average word length over a window of positions ending at ``index``.

        params:
            index: int
            window: int (default 4)
        returns:
            avg_len: float, or 0 when there are no positions to average
        """
        if index < window:
            # Not enough history for a full window. The guard uses `window`
            # (it used to be a hard-coded 4), so a larger window can no longer
            # produce negative positions that wrap around to the end of the
            # frame.
            # NOTE(review): this branch skips both position 0 and `index`
            # itself, unlike the full-window branch below - kept as-is,
            # confirm whether that is intended.
            positions = range(1, index)
        else:
            positions = (index - offset for offset in range(window))

        lengths = [len(self.get_prev_word(pos, orignal=True)) for pos in positions]
        if not lengths:
            return 0
        return sum(lengths) / len(lengths)

    def get_feature(self, index, nof=5):
        """
        Given an index, return corresponding features

        params:
            index: int
            nof: int, number of features to emit (default 5); features are
                added in the order listed below
        returns:
            feature: list (prev word, next word, left_word<3, left_is_cap,
            right_is_cap[, window avg len, prev/next avg len, left word len],
            label)
        """
        # Progress output every 1000th index.
        if index % 1000 == 0:
            print(index)
        feature = []
        # word to the left
        if nof > 0:
            feature.append(self.get_prev_word(index))
        # word to the right
        if nof > 1:
            feature.append(self.get_next_word(index))
        # length of word to the left < 3
        if nof > 2:
            feature.append(self.lt_3(index))
        # Left word is capitalized
        if nof > 3:
            feature.append(self.is_cap_word(self.get_prev_word(index, orignal=True)))
        # Right word is capitalized
        if nof > 4:
            feature.append(self.is_cap_word(self.get_next_word(index, orignal=True)))

        # Three more, length-based features
        # (gt_x with x = 4..7 was tried in place of get_avg_len)
        if nof > 5:
            feature.append(self.get_avg_len(index))
        if nof > 6:
            feature.append(self.get_average_len(index))
        if nof > 7:
            feature.append(self.actual_len(index))

        # Finally add label token
        feature.append(self.df[2][index])
        return feature

    def train_le(self):
        """
        Fit the label encoder on every value the word features can produce.

        returns:
            whatever ``label_encoder.fit`` returns (for scikit-learn's
            LabelEncoder that is the fitted encoder itself)
        """
        row_count = len(self.df)
        lisa = [self.get_prev_word(i, orignal=True) for i in range(row_count)]
        # The last row has no successor, so this list also contributes "<END>".
        lisb = [self.get_next_word(i, orignal=True) for i in range(row_count)]
        lis = lisa + lisb
        lis.append("<NAP>")
        lis.append("<START>")
        return self.label_encoder.fit(lis)

In [28]:
# Command-line handling is disabled while running as a notebook; the file
# names are hard-coded below instead.
# parser = argparse.ArgumentParser(description="Sentence Boundary Detection")
# parser.add_argument("train", help="Training file")
# parser.add_argument("test", help="Test file")

# args = parser.parse_args()

train_file = "SBD.train"
test_file = "SBD.test"
# train_file = args.train
# test_file = args.test

# Check if train file and test file exists
# if not os.path.isfile(train_file) or not os.path.isfile(test_file):
#     print("Training or Testing file does not exist")
#     sys.exit(1)

# Read the training file.
# A regex separator is only supported by pandas' python engine; requesting it
# explicitly keeps the parsing identical and avoids the fallback warning.
f = pd.read_csv(train_file, sep=r"\s", header=None, engine="python")
f = f.drop(0, axis=1)

# Resultant file
#        1    2
# 0     On  TOK
# 1   June  TOK
# 2      4  TOK
# 3      ,  TOK
# 4  after  TOK

wf = WordsFeature(f, label_encoder=LabelEncoder())
le = wf.train_le()

# Keep only the rows whose token matches <alphanumerics or quotes> followed by
# one or more periods; rows without a match come back as NaN and are dropped.
nf = f[1].str.extract(r'([a-zA-Z0-9"]+)([.]+)')
nf = nf.dropna()

ilst = list(nf[0].index)

# Build the feature rows in parallel. A single pool is created (previously a
# second, default-sized pool was created first and never closed) and the
# context manager releases the workers once map() has returned.
with multiprocessing.Pool(processes=4) as pool:
    lst = pool.map(wf.get_feature, ilst)




37000
20000
45000
12000
70000
148000
131000
209000
173000
193000
160000
200000
263000
266000
278000


In [29]:
# `lst` holds one row per candidate token: the feature values followed by the
# label in the last position. Split the rows into inputs and targets.
#
# Two ways forward for the labels: discard TOK and train on EOS / NEOS only,
# or keep TOK alongside EOS and NEOS. This is try 1: keep every label; fall
# back to the EOS / NEOS-only variant if this does not work.

features = 5  # only used to name the pickle file below

x = [row[:-1] for row in lst]  # everything except the trailing label
y = [row[-1] for row in lst]   # the trailing label

clf = tree.DecisionTreeClassifier().fit(x, y)

# Persist the fitted tree so it can be reloaded later.
with open("dtree{}.pkl".format(features), "wb") as dfile:
    pickle.dump(clf, dfile)


In [30]:
# Test phase

# Read the test file the same way as the training file. A regex separator is
# only supported by pandas' python engine; requesting it explicitly keeps the
# parsing identical and avoids the fallback warning.
td = pd.read_csv(test_file, sep=r"\s", header=None, engine="python")
td = td.drop(0, axis=1)

# Reuse the encoder returned by the training step so word ids match the model.
twf = WordsFeature(td, le)

# Candidate rows: tokens matching <alphanumerics or quotes> followed by periods.
tnf = td[1].str.extract(r'([a-zA-Z0-9"]+)([.]+)')
tnf = tnf.dropna()
tilst = list(tnf.index)

# A single pool is created (previously a second, default-sized pool was
# created first and never closed); the context manager releases the workers
# once map() has returned.
with multiprocessing.Pool(processes=4) as tpool:
    tlst = tpool.map(twf.get_feature, tilst)

# Split each row into its feature values and its trailing label.
xt = [row[:-1] for row in tlst]
yt = [row[-1] for row in tlst]

preds = clf.predict(xt)

print(accuracy_score(yt, preds))

  This is separate from the ipykernel package so we can avoid doing imports until


17000
51000
46000
91000
139000
140000
143000
116000
0.8721399730820996


In [31]:
# 5 features: 87.81%

In [32]:
# More features: 87.21%. Adding more features actually reduces accuracy...

In [33]:
# Removed NAP tokens... All features... I think this might increase accuracy as it will become aware of the word.
# Wrong hypothesis.
# It actually increases the accuracy for some reason?...
# I think label encoder is causing problem.

In [34]:
# Using get_avg_len and gt_x with x=6
# It might increase the accuracy
# Also, I think the accuracy is reduced because this is a 3-class classification instead of a 2-class one...
# 87.31%