In [590]:
import pandas as pd
import numpy as np
import os
import spacy
from spacy.tokens.doc import Doc
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import random
import json

In [591]:
# Allow up to 500 characters per cell when DataFrames are displayed.
pd.options.display.max_colwidth = 500

# Dataset

In [592]:
def concatenate_list_data(lst):
    """
    Join the elements of ``lst`` into one space-separated string.

    One random bit is drawn per element; when the bit is set, the first
    character of that element is lower-cased before joining.

    :param lst: iterable of sentences; each element is converted with ``str``.
    :return: the joined text with leading/trailing whitespace stripped
        (``''`` for an empty iterable).
    """
    pieces = []
    for element in lst:
        # Convert first so that non-string elements cannot fail on indexing.
        text = str(element)
        # Draw the bit for every element, including empty ones, so the number
        # of random draws per call does not depend on the content.
        lower_first = bool(random.getrandbits(1))
        if lower_first and text:
            text = text[0].lower() + text[1:]
        pieces.append(text)
    return ' '.join(pieces).strip()


In [593]:
def getData(data_path):
    """
    Read the tab-separated corpus and return one text per (col1, col2) group.

    Steps:
      1. read three unnamed columns as ``col1``, ``col2``, ``sentence``;
      2. normalise quotes in ``sentence`` (backtick -> ', then '' -> ");
      3. keep only rows whose sentence has more than 3 whitespace-separated tokens;
      4. merge the sentences of each (col1, col2) group with
         ``concatenate_list_data``.

    NOTE(review): the meaning of ``col1``/``col2`` is not visible here --
    presumably article and paragraph identifiers; confirm against the data file.

    :param data_path: path of the tab-separated input file.
    :return: DataFrame with a single ``sentence`` column.
    """
    raw = pd.read_csv(data_path, sep='\t', names=['col1', 'col2', 'sentence'])
    normalised = raw['sentence'].str.replace('`', '\'')
    normalised = normalised.str.replace('\'\'', '"')
    raw['sentence'] = normalised

    token_counts = raw['sentence'].map(lambda text: len(str(text).split()))
    long_enough = raw[token_counts > 3]

    merged = (
        long_enough
        .groupby(['col1', 'col2'])
        .agg({'sentence': concatenate_list_data})
        .reset_index()
    )
    return merged.drop(['col1', 'col2'], axis=1)

In [594]:
# Path of the tab-separated corpus that getData reads.
data_path = 'data/simple.txt'
df = getData(data_path)

In [305]:
# df_stat = df['col2'].value_counts().reset_index().drop('index', axis=1)
# df_stat['%col2'] = 100*df_stat['col2']/df_stat['col2'].sum()
# df_stat

In [595]:
# Number of rows, followed by a preview of the first rows.
print(len(df))
df.head()

163658


Unnamed: 0,sentence
0,"03 Bonnie & Clyde is a single by rapper Jay-Z featuring R&B singer BeyoncÃ . it was taken from his ninth studio album The BlueprintÂ : The Gift & the Curse and can also be found on the international editions of her solo debut album Dangerously in Love . released in 2002 , the song reached number four on the U.S. Billboard Hot 100 , becoming Jay-Z 's second top ten single and BeyoncÃ 's first top ten single as a solo artist ; it also reached number two in the UK . the single is notable for ca..."
1,"The music video for the song , directed by Chris Robinson , was nominated for "" Best Hip-Hop Video "" at the 2003 MTV Video Music Awards , but it lost out to Missy Elliott 's "" Work It . "" the name of the song is a reference to Eminem 's song "" '97 Bonnie and Clyde "" , and by extension the original Tupac song , in which he refers to himself and his girlfriend as the "" '96 Bonnie and Clyde "" ."
2,"There is an edited version of it , entitled "" Bonnie & Clyde ' 03 "" which features many of BeyoncÃ 's vocals rather than of Jay-Z . it is included in her "" Dangerously In Love "" CD ."
3,"' N Sync , sometimes called \* NSYNC was an American pop boy band formed in Orlando , Florida in 1995 . the five members of the group were Lance Bass , JC Chasez , Joey Fatone , Chris Kirkpatrick , and Justin Timberlake . Since the band broke up , Timberlake and Chasez have released solo albums ."
4,' N Sync 's album No Strings Attached holds the record for the most copies of an album sold in its first week after being released . The album sold 1.1 million copies its first day and 2.4 million in the first week .


# Get features

In [596]:
class WordTokenizer(object):
    """
    Custom tokenizer that splits text on single spaces.

    Calling an instance tokenizes ``text`` either with the supplied
    ``tokenizer`` object (via its ``tokenize(text)`` method) or, when none is
    given, with ``text.split(' ')``. The words are returned as a spaCy ``Doc``
    or as a plain list, depending on ``return_doc``.
    """
    def __init__(self, vocab=None, tokenizer=None, return_doc=True):
        """
        :param vocab: vocabulary handed to ``Doc``. When omitted, the global
            ``nlp.vocab`` is used.
        :param tokenizer: optional object exposing ``tokenize(text)``.
        :param return_doc: return a ``Doc`` when true, else the list of words.
        """
        if vocab is None:
            # Looked up here instead of in the signature: a default of
            # ``nlp.vocab`` is evaluated when the class statement runs and
            # raises NameError if ``nlp`` has not been created yet.
            vocab = nlp.vocab
        self.vocab = vocab
        self._word_tokenizer = tokenizer
        self.return_doc = return_doc

    def __call__(self, text):
        """Tokenize ``text`` and return a ``Doc`` or a list of words."""
        if self._word_tokenizer:
            words = self._word_tokenizer.tokenize(text)
        else:
            # Splitting on ' ' (not on arbitrary whitespace) yields empty
            # strings for consecutive spaces.
            words = text.split(' ')
        if not self.return_doc:
            return words
        # Every word, including the last, is marked as followed by a space.
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

In [602]:
# Feature column names: the same 13 token attributes for the previous,
# current (no prefix) and next token, in that order.
columns = [
    prefix + attribute
    for prefix in ('previous_', '', 'next_')
    for attribute in (
        'lemma', 'pos', 'ent_iob',
        'is_alpha', 'is_digit', 'is_lower', 'is_upper', 'is_title',
        'is_punct', 'is_left_punct', 'is_right_punct', 'is_bracket',
        'dep',
    )
]

In [606]:
# Tokens counted as sentence-final punctuation when deriving labels.
# A set is used on purpose: the substring test ``str(tok) in '.?!'`` is also
# true for the empty string and for tokens such as '?!'.
_SENTENCE_END_TOKENS = frozenset(['.', '?', '!'])

# Attributes read from each of the previous / current / next tokens, in the
# order in which the module-level ``columns`` list names them.
_TOKEN_ATTRS = (
    'lemma_', 'pos_', 'ent_iob',
    'is_alpha', 'is_digit', 'is_lower', 'is_upper', 'is_title',
    'is_punct', 'is_left_punct', 'is_right_punct', 'is_bracket',
    'dep_',
)


def _token_features(token):
    """Return the values of the ``_TOKEN_ATTRS`` attributes of ``token``."""
    return [getattr(token, attr) for attr in _TOKEN_ATTRS]


def get_features_from_sent(sentence, get_labels=True):
    """
    Build a per-token feature table for one text.

    The text is wrapped in the sentinel tokens ``s_start_s`` / ``e_end_e`` and
    run through the global ``nlp`` pipeline. Each output row describes one
    token together with its previous and next neighbours; the sentinels only
    ever appear as neighbours.

    With ``get_labels=True`` the tokens ``.``, ``?`` and ``!`` are removed and
    each remaining token gets a boolean label telling whether the token that
    followed it in the original text was one of them. The last three
    positions are forced to ``False``, so the token before the text's closing
    punctuation is never a positive example.

    :param sentence: text to featurize (tokens separated by single spaces).
    :param get_labels: when true, strip sentence-final punctuation and add a
        ``labels`` column; when false, keep all tokens and add no labels.
    :return: DataFrame with the columns named in the global ``columns`` list,
        plus ``labels`` when requested.
    """
    doc = nlp('s_start_s ' + sentence + ' e_end_e')

    labels = None
    if get_labels:
        # Labels
        # Entry k says whether token k+1 is sentence-final punctuation.
        next_is_end = [str(tok) in _SENTENCE_END_TOKENS for tok in doc[1:-2]]
        next_is_end += [False, False, False]
        kept = [
            (tok, flag)
            for tok, flag in zip(doc, next_is_end)
            if str(tok) not in _SENTENCE_END_TOKENS
        ]
        labels = [flag for _, flag in kept]
        doc = [tok for tok, _ in kept]

    # One row per token between the two sentinels.
    feature_list = [
        _token_features(doc[i - 1])
        + _token_features(doc[i])
        + _token_features(doc[i + 1])
        for i in range(1, len(doc) - 1)
    ]

    df = pd.DataFrame(feature_list, columns=columns)
    if get_labels:
        # Drop the labels of the two sentinel tokens.
        df['labels'] = labels[1:-1]

    return df


In [None]:
# Rewritten: earlier dict-based draft of get_features_from_sent, kept commented out for reference.
# def get_features_from_sent(sentence, get_labels=True):
#     data = {}
#     doc = nlp('s_start_s ' + sentence + ' e_end_e')
#         # Lables
#     if get_labels: 
#         labels = []
#         doc_1 = []
#         labels_with_punc = [str(i) in '.?!' for i in doc[1:-2]] + [False, False, False] 
#         for i, j in zip(doc, labels_with_punc):
#             if not(str(i) in '.?!'):
#                 labels.append(j)
#                 doc_1.append(i)
#         doc = doc_1
#     d_curr = doc[1:-1]
#     d_next = doc[2:]
#     data = {
#             # The current token
# #             'token' : [i.text for i in doc[:-1]],
#             'lemma' : [i.lemma_ for i in d_curr],
#             'pos' : [i.pos_ for i in d_curr],
#             'ent_iob' : [i.ent_iob for i in d_curr],
#             'is_alpha' : [i.is_alpha for i in d_curr],
#             'is_digit' : [i.is_digit for i in d_curr],
#             'is_lower' : [i.is_lower for i in d_curr],
#             'is_upper' : [i.is_upper for i in d_curr],
#             'is_title' : [i.is_title for i in d_curr],
#             'is_punct' : [i.is_punct for i in d_curr],
#             'is_left_punct' : [i.is_left_punct for i in d_curr],
#             'is_right_punct' : [i.is_right_punct for i in d_curr],
#             'is_bracket' : [i.is_bracket for i in d_curr],
#             'dep' : [i.dep_ for i in d_curr],  
#             # The next token
# #             'next_token' : [i.text for i in doc[1:]],
#             'next_lemma' : [i.lemma_ for i in d_next],
#             'next_pos' : [i.pos_ for i in d_next],
#             'next_ent_iob' : [i.ent_iob for i in d_next],
#             'next_is_alpha' : [i.is_alpha for i in d_next],
#             'next_is_digit' : [i.is_digit for i in d_next],
#             'next_is_lower' : [i.is_lower for i in d_next],
#             'next_is_upper' : [i.is_upper for i in d_next],
#             'next_is_title' : [i.is_title for i in d_next],
#             'next_is_punct' : [i.is_punct for i in d_next],
#             'next_is_left_punct' : [i.is_left_punct for i in d_next],
#             'next_is_right_punct' : [i.is_right_punct for i in d_next],
#             'next_is_bracket' : [i.is_bracket for i in d_next],
#             'next_dep' : [i.dep_ for i in d_next],         
#             # The previous token 
# #             'previous_token' :  [i.text for i in doc[:-2]],
#             'previous_lemma' :  [i.lemma_ for i in doc[:-2]],
#             'previous_pos' :  [i.pos_ for i in doc[:-2]],
#             'previous_ent_iob' : [i.ent_iob for i in doc[:-2]],
#             'previous_is_alpha' :  [i.is_alpha for i in doc[:-2]],
#             'previous_is_digit' : [i.is_digit for i in doc[:-2]],
#             'previous_is_lower' :  [i.is_lower for i in doc[:-2]],
#             'previous_is_upper' :  [i.is_upper for i in doc[:-2]],
#             'previous_is_title' :  [i.is_title for i in doc[:-2]],
#             'previous_is_punct' :  [i.is_punct for i in doc[:-2]],
#             'previous_is_left_punct' :  [i.is_left_punct for i in doc[:-2]],
#             'previous_is_right_punct' :  [i.is_right_punct for i in doc[:-2]],
#             'previous_is_bracket' :  [i.is_bracket for i in doc[:-2]],
#             'previous_dep' : [i.dep_ for i in doc[:-2]], 
#     }    
#     if get_labels:
#         data['labels'] = labels[1:-1]
    
#     df = pd.DataFrame.from_dict(data)
#     return df



In [604]:
# Load the spaCy English pipeline and replace its tokenizer with WordTokenizer.
# NOTE(review): get_features_from_sent reads the global `nlp`, so this cell
# has to run before any cell that calls it.
nlp = spacy.load("en")
nlp.tokenizer = WordTokenizer(nlp.vocab)

In [608]:
# Sanity check: build features for the first four rows and show the frame
# stored under index label 2.
df[:4]['sentence'].map(lambda x: get_features_from_sent(str(x), get_labels=True))[2]

Unnamed: 0,previous_lemma,previous_pos,previous_ent_iob,previous_is_alpha,previous_is_digit,previous_is_lower,previous_is_upper,previous_is_title,previous_is_punct,previous_is_left_punct,...,next_is_digit,next_is_lower,next_is_upper,next_is_title,next_is_punct,next_is_left_punct,next_is_right_punct,next_is_bracket,next_dep,labels
0,s_start_s,VERB,2,False,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,ROOT,False
1,there,ADV,2,True,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,det,False
2,be,VERB,2,True,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,amod,False
3,an,DET,2,True,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,attr,False
4,edit,VERB,2,True,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,prep,False
5,version,NOUN,2,True,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,pobj,False
6,of,ADP,2,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,punct,False
7,-PRON-,PRON,2,True,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,dep,False
8,",",PUNCT,2,False,False,False,False,False,True,False,...,False,False,False,False,True,True,True,False,punct,False
9,entitle,VERB,2,True,False,True,False,False,False,False,...,False,False,False,True,False,False,False,False,poss,False


In [609]:
# One labelled feature frame per text, for the first 10000 rows of df.
train_list = [
    get_features_from_sent(str(text), get_labels=True)
    for text in df['sentence'][:10000]
]

In [610]:
# Stack the per-text frames into a single table.
full_df = pd.concat(train_list)

In [611]:
# Separate the feature columns from the boolean target column.
X = full_df.drop('labels', axis=1)
y = full_df['labels']

In [612]:
# Hold out 33% of the token rows for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [613]:
# Fit the DictVectorizer on the training rows only and encode them.
# NOTE(review): X_train is rebound to the encoded matrix, so this cell is not
# safe to re-run on its own -- re-run the split cell first.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(X_train.to_dict('records'))

# Train

In [614]:
# Logistic-regression classifier (lbfgs solver, multinomial mode, fixed seed).
lrc = LogisticRegression(random_state=42, solver="lbfgs", multi_class="multinomial")

In [615]:
# Train on the encoded features; the label Series is passed as a flat array.
lrc.fit(X_train, y_train.values.ravel())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

# Test evaluation

In [616]:
# Encode the held-out rows with the already-fitted vectorizer (rebinds X_test).
X_test = vectorizer.transform(X_test.to_dict('records'))

In [617]:
# Predict a label for every held-out token row.
predicted = lrc.predict(X_test)

In [618]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99    167666
        True       0.86      0.77      0.82      6854

   micro avg       0.99      0.99      0.99    174520
   macro avg       0.93      0.88      0.90    174520
weighted avg       0.99      0.99      0.99    174520



# Final Evaluation

In [619]:
# Load the evaluation set from JSON.
# NOTE(review): later cells read word[0] as the token text and word[1] as its
# label -- presumably a list of sentences, each a list of [word, flag] pairs;
# confirm against the file.
with open('data/run-on-test.json') as file:
    data_eval = json.load(file)
#     data_eval = [[['<p>', False]] + i + [['</p>', False]] for i in data_eval]

In [620]:
# Flatten the second element (the label) of every pair across all sentences.
y_eval = list(np.concatenate([[word[1] for word in sent] for sent in data_eval]))
# Report the size of the list just built (the original printed `y_test_eval`,
# a name that is never defined in this notebook).
print(len(y_eval))

4697


In [621]:
# Rebuild each evaluation sentence as a space-separated string from the first
# element of every pair.
sent_eval = [' '.join([word[0] for word in sent]) for sent in data_eval]

In [622]:
# One unlabelled feature frame per evaluation sentence.
X_eval_list = [
    get_features_from_sent(text, get_labels=False)
    for text in sent_eval
]

In [623]:
# Stack the per-sentence feature frames into one table.
X_eval_df = pd.concat(X_eval_list)

In [624]:
# Encode the evaluation rows with the vectorizer fitted on the training rows.
X_eval = vectorizer.transform(X_eval_df.to_dict('records'))

In [625]:
# Predict a label for every evaluation token row.
predicted_eval = lrc.predict(X_eval)

In [626]:
# Per-class precision / recall / F1 on the evaluation set.
print(classification_report(y_true=y_eval, y_pred=predicted_eval))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99      4542
        True       0.75      0.75      0.75       155

   micro avg       0.98      0.98      0.98      4697
   macro avg       0.87      0.87      0.87      4697
weighted avg       0.98      0.98      0.98      4697

