# In [None]:
# TODO: also require sentences to be longer than 40 characters (nchar > 40)

import spacy
import os
import re
import pandas as pd

# Load the spaCy 'en_core_web_sm' pipeline once at module level; collect_sentences
# uses it through the global name `nlp`.
nlp = spacy.load('en_core_web_sm')

def validate_data(df, col):
    """Drop rows of ``df`` whose ``col`` text does not look like a full sentence.

    A row is kept only when its text

    * does not end in ``hon.`` (presumably a sentence that was split on the
      abbreviation -- confirm against the data),
    * starts with an uppercase ASCII letter, and
    * contains more than four whitespace-separated words.

    Rows whose value is missing or not a string are dropped.

    Args:
        df: DataFrame holding the sentences. It is not modified.
        col: Name of the text column to validate.

    Returns:
        A new DataFrame with the surviving rows and an added ``num_words``
        column holding the whitespace-separated word count.
    """
    text = df[col]

    # ``str.match`` is anchored at the start of the string, so with this
    # pattern it would only catch text that is exactly "hon."; ``str.contains``
    # lets the ``$`` anchor test the end of the sentence instead.
    ends_with_hon = text.str.contains(r'hon\.$', na=False)
    starts_upper = text.str.match(r'[A-Z]', na=False)

    # Copy before adding a column so the assignment never lands on a view of
    # the caller's frame.
    kept = df[~ends_with_hon & starts_upper].copy()
    kept['num_words'] = kept[col].str.split().str.len()
    return kept[kept['num_words'] > 4]

def syntax_check(doc):
    """Classify a parsed sentence as ``'valid'`` or ``'invalid'``.

    A sentence is ``'valid'`` when it contains at least one token tagged
    ``VERB`` and at least one token tagged ``NOUN`` or ``PRON``.

    Args:
        doc: Iterable of tokens exposing a ``pos_`` attribute (e.g. a spaCy
            ``Doc``).

    Returns:
        The string ``'valid'`` or ``'invalid'``.
    """
    pos_tags = {token.pos_ for token in doc}

    # Each tag is tested explicitly: a bare ``or 'PRON'`` is always truthy and
    # would mark every sentence as valid.
    has_verb = 'VERB' in pos_tags
    has_nominal = 'NOUN' in pos_tags or 'PRON' in pos_tags
    return 'valid' if has_verb and has_nominal else 'invalid'

def validate_syntax(df):
    """Keep only the rows whose parsed sentence passes ``syntax_check``.

    Args:
        df: DataFrame with a ``parsed_text`` column of parsed documents. It is
            not modified.

    Returns:
        A new DataFrame with the same columns as ``df``, restricted to rows
        for which ``syntax_check`` returns ``'valid'``.
    """
    # Filter with a boolean mask rather than a scratch column, so nothing is
    # written into the caller's frame (which may itself be a filtered slice).
    is_valid = df['parsed_text'].apply(syntax_check) == 'valid'
    return df[is_valid]

def sentence_check(doc):
    """Return a sentence-type label for a parsed sentence, or ``None``.

    The checks run in this order and the first applicable one decides:

    1. ``'interrogative_sent'`` -- the text starts with Which/What/Why/Where/
       When (case-insensitive), ends with ``?`` and the second token is a
       ``NOUN``. A wh-question whose second token is not a noun gets no label
       at all; the later checks are not tried for it.
    2. ``'comp_sent'`` -- the text contains at least one comma.
    3. ``'leftward_sent'`` -- the first token is neither ``NOUN`` nor ``PRON``
       and the second token is a ``VERB``.

    Args:
        doc: Parsed sentence supporting ``str()``, ``len()`` and integer
            indexing, whose tokens expose ``pos_`` (e.g. a spaCy ``Doc``).

    Returns:
        One of the three labels above, or ``None`` when no rule applies.
        Documents with fewer than two tokens never raise; they can only be
        labelled ``'comp_sent'``.
    """
    text = str(doc)
    has_second_token = len(doc) > 1

    if re.match(r'^Which (.*)\?$|^What (.*)\?$|^Why (.*)\?$|^Where (.*)\?$|^When (.*)\?$', text, re.IGNORECASE):
        if has_second_token and doc[1].pos_ == 'NOUN':
            return 'interrogative_sent'
        return None

    if ',' in text:
        return 'comp_sent'

    # The positional checks below need doc[0] and doc[1].
    if not has_second_token:
        return None

    if doc[0].pos_ not in ('NOUN', 'PRON') and doc[1].pos_ == 'VERB':
        return 'leftward_sent'
    return None

def tag_sentence(df):
    """Attach a sentence-type label to every row and discard the parsed docs.

    ``df`` is changed in place: a ``tag`` column holding the result of
    ``sentence_check`` for each ``parsed_text`` value is added, then the
    ``parsed_text`` column is removed. The same frame is returned.
    """
    labels = df['parsed_text'].apply(sentence_check)
    df['tag'] = labels
    # pop() deletes the column from the frame; its return value is not needed.
    df.pop('parsed_text')
    return df

def collect_sentences(df, col, export_dir='/users/sbuongiorno/sentence_eval', n_samples=1500):
    """Sample sentences of each tagged type from a CSV and export them.

    The CSV is read, reduced to ``col``, filtered with ``validate_data`` and
    ``validate_syntax``, and labelled with ``tag_sentence``. One CSV per label
    (``leftward_sent.csv``, ``interrogative_sent.csv``, ``comp_sent.csv``) and
    a ``random_sent.csv`` drawn from all remaining rows are then written to
    ``export_dir``.

    Args:
        df: Path of the input CSV file (passed to ``pandas.read_csv``).
        col: Name of the column holding the sentence text.
        export_dir: Directory for the output files; created if missing.
        n_samples: Maximum number of rows per output file. When fewer rows
            are available, all of them are written instead of raising.

    Returns:
        None. The results are written to disk.
    """
    df = pd.read_csv(df)
    df = df[[col]].copy()

    # Copy so the column assignment below does not target a filtered slice.
    df = validate_data(df, col).copy()

    # spaCy's entity recogniser is registered as 'ner'; an unknown name such
    # as 'ent' is ignored and leaves the component running. Entities are not
    # used by any later step, so the component is skipped.
    df['parsed_text'] = list(nlp.pipe(df[col], disable=['ner']))

    df = validate_syntax(df)
    df = tag_sentence(df)

    os.makedirs(export_dir, exist_ok=True)

    for label in ('leftward_sent', 'interrogative_sent', 'comp_sent'):
        subset = df[df['tag'] == label]
        # DataFrame.sample raises when asked for more rows than exist.
        subset = subset.sample(n=min(n_samples, len(subset)))
        subset.to_csv(os.path.join(export_dir, label + '.csv'))

    df = df.sample(n=min(n_samples, len(df)))
    df.to_csv(os.path.join(export_dir, 'random_sent.csv'))
    
# Run the export on the 'text' column of the Hansard CSV.
collect_sentences(
    df='/scratch/group/pract-txt-mine/hansard_justnine_12192019.csv',
    col='text',
)

# In [None]:
# TODO: also require sentences to be longer than 40 characters (nchar > 40)

import spacy
import re
import pandas as pd

# Load the spaCy 'en_core_web_sm' pipeline once at module level; the script
# section below parses the sentences through the global name `nlp`.
nlp = spacy.load('en_core_web_sm')

def validate_data(df):
    """Drop rows whose ``text`` value does not look like a full sentence.

    A row is kept only when its ``text`` does not end in ``hon.`` and starts
    with an uppercase ASCII letter. Rows whose value is missing or not a
    string are dropped.

    Args:
        df: DataFrame with a ``text`` column. It is not modified.

    Returns:
        The filtered DataFrame.
    """
    text = df['text']

    # ``str.match`` is anchored at the start of the string, so it cannot test
    # how a sentence ends; ``str.contains`` lets ``$`` do that. ``na=False``
    # keeps the masks boolean so that ``~`` does not fail on missing values.
    ends_with_hon = text.str.contains(r'hon\.$', na=False)
    starts_upper = text.str.match(r'[A-Z]', na=False)
    return df[~ends_with_hon & starts_upper]

def validate_triples_structure(doc):
    """Return ``doc`` when it has both a ``VERB`` and a ``NOUN`` token.

    Args:
        doc: Iterable of tokens exposing a ``pos_`` attribute (e.g. a spaCy
            ``Doc``).

    Returns:
        ``doc`` itself when at least one token is tagged ``VERB`` and at least
        one is tagged ``NOUN``; otherwise ``None``.
    """
    pos_tags = {token.pos_ for token in doc}

    # Both tags are tested separately: ``'VERB' and 'NOUN' in x`` only ever
    # evaluates the NOUN membership test.
    if 'VERB' in pos_tags and 'NOUN' in pos_tags:
        return doc
    return None

def leftward_sent(doc):
    """Label leftward sentences (no wh- movement).

    Returns ``'leftward_sent'`` when ``doc`` has more than 20 items, its first
    token is not tagged ``NOUN`` and its second token is tagged ``VERB``;
    otherwise returns ``None``.
    """
    if len(doc) <= 20:
        return None
    if doc[0].pos_ == 'NOUN' or doc[1].pos_ != 'VERB':
        return None
    return 'leftward_sent'

def interrogative_sent(doc):
    """Label interrogative sentences (with wh- movement).

    Args:
        doc: The sentence, as a spaCy ``Doc`` or a plain string. The pattern
            is applied to ``str(doc)``, because neither type has a ``match``
            method of its own.

    Returns:
        ``'interrogative_sent'`` when ``len(doc)`` exceeds 20 (tokens for a
        ``Doc``, characters for a string) and the text starts with "Which " or
        "What " (case-insensitive) and ends with ``?``; otherwise ``None``.
    """
    if len(doc) <= 20:
        return None
    if re.match(r'^Which (.*)\?$|^What (.*)\?$', str(doc), re.IGNORECASE):
        return 'interrogative_sent'
    return None

def comp_sent(doc):
    """Label compound, complex and compound-complex sentences.

    Args:
        doc: The sentence, as a spaCy ``Doc`` or a plain string. The pattern
            is applied to ``str(doc)``, because neither type has a ``match``
            method of its own.

    Returns:
        ``'comp_sent'`` when ``len(doc)`` exceeds 40 (tokens for a ``Doc``,
        characters for a string) and the text contains `` and,``, `` but,`` or
        `` or,`` anywhere; otherwise ``None``.
    """
    if len(doc) <= 40:
        return None
    # The conjunction can sit anywhere in the sentence, so search the whole
    # text rather than matching only at its start.
    if re.search(r' and,| but,| or,', str(doc)):
        return 'comp_sent'
    return None

def collect_sentences(df, col):
    """Read a CSV file and return only its sentence column.

    Args:
        df: Path of the CSV file (passed to ``pandas.read_csv``).
        col: Name of the column to keep.

    Returns:
        A new single-column DataFrame containing ``col``.

    Raises:
        KeyError: If the file has no column named ``col``.
    """
    frame = pd.read_csv(df)
    # Double brackets select the column by name and keep a DataFrame;
    # ``frame[frame[col]]`` would try to use the cell values as column labels.
    return frame[[col]].copy()


# Load the sentences and keep only the text column. Double brackets select the
# column by name; indexing with the Series itself would treat every sentence
# as a column label and raise KeyError.
hansard = pd.read_csv('~/hansard.csv')
hansard = hansard[['text']].copy()

# Copy so the column assignments below do not target a filtered slice.
hansard = validate_data(hansard).copy()

# spaCy's entity recogniser is registered as 'ner'; an unknown name such as
# 'ent' is ignored. Entities are not used below, so the component is skipped.
hansard['text'] = list(nlp.pipe(hansard['text'], disable=['ner']))

# Run every classifier and keep each label it produces. A later classifier
# overrides an earlier one only on rows it actually labels, so a non-matching
# classifier no longer wipes the tags set before it.
hansard['tag'] = None
for classify in (leftward_sent, interrogative_sent, comp_sent):
    labels = hansard['text'].apply(classify)
    hansard['tag'] = labels.where(labels.notna(), hansard['tag'])

sample_size = 1000
types = ['leftward_sent', 'interrogative_sent', 'comp_sent']
for label in types:
    out = hansard[hansard['tag'] == label]
    # DataFrame.sample raises when asked for more rows than exist.
    out = out.sample(n=min(sample_size, len(out)))
    # NOTE(review): the path used to be '/users/sbuongiornoz' + label, with no
    # separator; '/users/sbuongiorno/' is assumed to be the intended
    # directory -- confirm.
    out.to_csv('/users/sbuongiorno/' + label + '.csv')

hansard = hansard.sample(n=min(sample_size, len(hansard)))
hansard.to_csv('~/random_sent.csv')