In [1]:
import spacy
import textstat
import syllables
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split

!python3 -m spacy download en_core_web_md

In [2]:
# Read the training sentences and load spaCy's medium English pipeline
# (the model downloaded in the first cell). The two steps are independent.

df1 = pd.read_csv('WikiLarge_Train.csv')
nlp = spacy.load("en_core_web_md")

### Spacy NLP processing

In [3]:
# Tokenize and lemmatize every sentence with spaCy, leaving punctuation tokens out.
# The non-stopword subsets are kept as well, since those words/lemmas should be
# much more significant to the readability of a sentence.

tokens, lemmas, nonstop, nonstop_lem = [], [], [], []

for row in range(len(df1)):
    # every non-punctuation token of the sentence, then the non-stopword subset
    words = [tok for tok in nlp(df1['original_text'][row]) if not tok.is_punct]
    content_words = [tok for tok in words if not tok.is_stop]

    tokens.append([tok.text for tok in words])
    lemmas.append([tok.lemma_ for tok in words])
    nonstop.append([tok.text for tok in content_words])
    nonstop_lem.append([tok.lemma_ for tok in content_words])

for column, values in [
    ('tokens', tokens),                    # list of tokens
    ('lemmas', lemmas),                    # list of lemmas
    ('non_stop_tokens', nonstop),          # list of non-stopword tokens
    ('non_stop_lems', nonstop_lem),        # list of non-stopword lemmas
]:
    df1[column] = pd.Series(values)

In [4]:
# Count the alphabetic / digit / punctuation / stopword tokens of each sentence.
# (In the spaCy pipeline a punctuation mark is itself a token.)
#
# The counts are computed with sum() rather than with module-level counters, so
# this cell does not create a global named `alpha`: that name belongs to the
# `alpha(text)` helper defined later in the notebook, and re-running this cell
# must not replace that function with an integer.

alphas = []
digits = []
Punc = []
Stop = []

for i in range(len(df1)):
    doc = nlp(df1['original_text'][i])

    # the flags are independent: one token can count towards several totals
    alphas.append(sum(1 for token in doc if token.is_alpha))
    digits.append(sum(1 for token in doc if token.is_digit))
    Punc.append(sum(1 for token in doc if token.is_punct))
    Stop.append(sum(1 for token in doc if token.is_stop))

df1['alpha_count'] = pd.Series(alphas)         # alphabetic token count
df1['digit_count'] = pd.Series(digits)         # digit token count
df1['punc_count'] = pd.Series(Punc)            # punctuation token count
df1['stopword_count'] = pd.Series(Stop)        # stopword token count

In [5]:
# Use spaCy POS tags to collect, per sentence: common nouns (NOUN), proper nouns
# (PROPN), adjectives (ADJ) and the number of verbs (VERB).
# Note: the column named 'pronouns' holds the PROPN tokens, i.e. proper nouns.

pos_lst = ['PROPN','VERB','NOUN','ADJ']

nouns, prons, non_stop_N = [], [], []
adjs, non_stop_ADJ, vcount = [], [], []

for row in range(len(df1)):
    # keep only the tokens whose tag is one of the four tags of interest
    tagged = [tok for tok in nlp(df1['original_text'][row]) if tok.pos_ in pos_lst]

    noun_toks = [tok for tok in tagged if tok.pos_ == 'NOUN']
    adj_toks = [tok for tok in tagged if tok.pos_ == 'ADJ']

    nouns.append([tok.text for tok in noun_toks])
    prons.append([tok.text for tok in tagged if tok.pos_ == 'PROPN'])
    non_stop_N.append([tok.text for tok in noun_toks if not tok.is_stop])
    adjs.append([tok.text for tok in adj_toks])
    non_stop_ADJ.append([tok.text for tok in adj_toks if not tok.is_stop])
    vcount.append(sum(1 for tok in tagged if tok.pos_ == 'VERB'))

for column, values in [
    ('nouns', nouns),                      # list of noun tokens
    ('pronouns', prons),                   # list of PROPN (proper noun) tokens
    ('non_stop_nouns', non_stop_N),        # list of non-stopword nouns
    ('adjs', adjs),                        # list of adjectives
    ('non_stop_adj', non_stop_ADJ),        # list of non-stopword adjectives
    ('verb_count', vcount),                # count of verbs in the sentence
]:
    df1[column] = pd.Series(values)

In [6]:
# The spaCy pipeline also exposes the named entities and the noun chunks of a
# sentence; collect both, plus the number of distinct entity labels.

entity, entity_type, noun_chunk = [], [], []

for row in range(len(df1)):
    parsed = nlp(df1['original_text'][row])

    entity.append([ent.text for ent in parsed.ents])
    # number of different entity labels present in the sentence
    entity_type.append(len({ent.label_ for ent in parsed.ents}))
    noun_chunk.append([chunk.text for chunk in parsed.noun_chunks])

for column, values in [
    ('entities', entity),                  # list of entities
    ('entity_type', entity_type),          # how many different types of entity are in the sentence
    ('noun_chunks', noun_chunk),           # list of noun chunks
]:
    df1[column] = pd.Series(values)


# after all of that processing, let's see what the data looks like now:

df1.head()

Unnamed: 0,original_text,label,tokens,lemmas,non_stop_tokens,non_stop_lems,alpha_count,digit_count,punc_count,stopword_count,nouns,pronouns,non_stop_nouns,adjs,non_stop_adj,verb_count,entities,entity_type,noun_chunks
0,There is manuscript evidence that Austen conti...,1,"[There, is, manuscript, evidence, that, Austen...","[there, be, manuscript, evidence, that, Austen...","[manuscript, evidence, Austen, continued, work...","[manuscript, evidence, Austen, continue, work,...",35,3,5,18,"[evidence, pieces, period, niece, nephew, addi...","[Austen, Anna, James, Edward, Austen]","[evidence, pieces, period, niece, nephew, addi...","[manuscript, further]",[manuscript],4,"[Austen, the period 1809 â '' 11, Anna, James ...",2,"[manuscript evidence, Austen, these pieces, th..."
1,"In a remarkable comparative analysis , Mandaea...",1,"[In, a, remarkable, comparative, analysis, Man...","[in, a, remarkable, comparative, analysis, man...","[remarkable, comparative, analysis, Mandaean, ...","[remarkable, comparative, analysis, mandaean, ...",21,0,3,7,"[analysis, scholar, texts]","[Säve, Söderberg, Mani, Psalms, Thomas]","[analysis, scholar, texts]","[remarkable, comparative, Mandaean, related, M...","[remarkable, comparative, Mandaean, related, M...",1,"[Mandaean, Säve-Söderberg, Mani, Thomas, Manda...",2,"[a remarkable comparative analysis, Mandaean s..."
2,"Before Persephone was released to Hermes , who...",1,"[Before, Persephone, was, released, to, Hermes...","[before, Persephone, be, release, to, Hermes, ...","[Persephone, released, Hermes, sent, retrieve,...","[Persephone, release, Hermes, send, retrieve, ...",40,0,4,23,"[pomegranate, seeds, telling, underworld, peri...","[Persephone, Hermes, Hades, -LRB-, -RRB-]","[pomegranate, seeds, telling, underworld, peri...",[],[],8,"[Persephone, Hermes, six, three, a period each...",4,"[Persephone, Hermes, who, her, Hades, her, pom..."
3,Cogeneration plants are commonly found in dist...,1,"[Cogeneration, plants, are, commonly, found, i...","[cogeneration, plant, be, commonly, find, in, ...","[Cogeneration, plants, commonly, found, distri...","[cogeneration, plant, commonly, find, district...",32,0,7,5,"[Cogeneration, plants, district, heating, syst...",[],"[Cogeneration, plants, district, heating, syst...","[thermal, industrial, large]","[thermal, industrial, large]",2,[],0,"[Cogeneration plants, district heating systems..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1,"[Geneva, -LRB-, -RRB-, is, the, second, most, ...","[Geneva, -LRB-, -RRB-, be, the, second, most, ...","[Geneva, -LRB-, -RRB-, second, populous, city,...","[Geneva, -LRB-, -RRB-, second, populous, city,...",25,0,11,13,"[city, city, part]","[Geneva, -LRB-, -RRB-, Switzerland, -LRB-, Zür...","[city, city]","[populous, populous]","[populous, populous]",1,"[Geneva -LRB-, second, Switzerland, Zürich, Fr...",4,"[-RRB-, the second-most-populous city, Switzer..."


### Self-defined features

Start by defining some functions that can be useful.

In [7]:
def alpha(text):
    """Return how many characters of ``text`` are alphabetic (``str.isalpha``)."""
    return sum(1 for ch in text if ch.isalpha())

def num(text):
    """Return how many characters of ``text`` are numeric (``str.isnumeric``)."""
    return sum(1 for ch in text if ch.isnumeric())


def unique(list_):
    """Return the number of distinct values in ``list_``."""
    return len(frozenset(list_))

def count_sylla(list_):
    """Return the ``syllables.estimate`` value of every token in ``list_``.

    The result is a new list in the same order as the input; an empty input
    gives an empty list.
    """
    return [syllables.estimate(word) for word in list_]

In [8]:
def avg_sylla(list_):
    """Return the mean estimated syllable count of the tokens in ``list_``.

    The per-token estimates come from ``count_sylla``. For an empty list the
    result is NaN, returned explicitly so that NumPy does not emit a
    "Mean of empty slice" RuntimeWarning for every sentence that has no
    tokens of the requested kind.
    """
    counts = count_sylla(list_)
    if len(counts) == 0:
        return np.nan
    return np.mean(counts)

def count_sylla_over(list_,thres):
    """Count the tokens in ``list_`` whose estimated syllable count is >= ``thres``.

    Returns 0 for an empty list.
    """
    return sum(1 for word in list_ if syllables.estimate(word) >= thres)
        
def total_sylla(list_):
    """Return the summed ``syllables.estimate`` value over the tokens in ``list_``.

    Returns 0 for an empty list.
    """
    return sum(syllables.estimate(w) for w in list_)

In [9]:
# Sentence sizes: characters, non-punctuation tokens, and tokens incl. punctuation.
df1['raw_length'] = df1['original_text'].apply(len)
df1['token_count'] = df1['tokens'].apply(len)
df1['token_count_raw'] = df1['token_count'] + df1['punc_count']

# Token-type ratios (punctuation is measured against the count that includes it).
for ratio_col, count_col, denom_col in [
    ('alpha_ratio', 'alpha_count', 'token_count'),
    ('digit_ratio', 'digit_count', 'token_count'),
    ('punc_ratio', 'punc_count', 'token_count_raw'),
    ('stop_ratio', 'stopword_count', 'token_count'),
]:
    df1[ratio_col] = df1[count_col] / df1[denom_col]

# Counts of non-stopword tokens / lemmas and their share of all tokens.
for list_col, count_col, ratio_col in [
    ('non_stop_tokens', 'nonstop_count', 'nonstop_ratio'),
    ('non_stop_lems', 'nonstop_lem_count', 'nonstop_lem_ratio'),
]:
    df1[count_col] = df1[list_col].apply(len)
    df1[ratio_col] = df1[count_col] / df1['token_count']

# Distinct-value counts first, then their ratios (keeps the column order).
unique_specs = [
    ('tokens', 'unique_token_count', 'unique_word_ratio'),
    ('lemmas', 'unique_lemma_count', 'unique_lemma_ratio'),
    ('non_stop_tokens', 'unique_nonstop_token', 'unique_nonstop_ratio'),
    ('non_stop_lems', 'unique_nonstop_lem', 'unique_nonstoplem_ratio'),
]
for list_col, count_col, ratio_col in unique_specs:
    df1[count_col] = df1[list_col].apply(unique)
for list_col, count_col, ratio_col in unique_specs:
    df1[ratio_col] = df1[count_col] / df1['token_count']

# Sizes of the POS / entity / noun-chunk lists.
for list_col, count_col in [
    ('nouns', 'noun_count'),
    ('pronouns', 'pronoun_count'),
    ('non_stop_nouns', 'nonstop_noun_count'),
    ('non_stop_adj', 'nonstop_adj_count'),
    ('entities', 'entity_count'),
    ('noun_chunks', 'nounchunk_count'),
]:
    df1[count_col] = df1[list_col].apply(len)

# ...and their share of all tokens (no ratio is derived for entities).
for ratio_col, count_col in [
    ('noun_ratio', 'noun_count'),
    ('pronoun_ratio', 'pronoun_count'),
    ('nonstop_noun_ratio', 'nonstop_noun_count'),
    ('nonstop_adj_ratio', 'nonstop_adj_count'),
    ('nounchunk_ratio', 'nounchunk_count'),
]:
    df1[ratio_col] = df1[count_col] / df1['token_count']

# Average estimated syllables per item of each list (NaN when the list is empty).
for list_col, avg_col in [
    ('tokens', 'token_avg_sylla'),
    ('lemmas', 'lemma_avg_sylla'),
    ('non_stop_tokens', 'nonstop_avg_sylla'),
    ('adjs', 'adj_avg_sylla'),
    ('entities', 'entity_avg_sylla'),
]:
    df1[avg_col] = df1[list_col].apply(avg_sylla)

df1.head(3)

Unnamed: 0,original_text,label,tokens,lemmas,non_stop_tokens,non_stop_lems,alpha_count,digit_count,punc_count,stopword_count,...,noun_ratio,pronoun_ratio,nonstop_noun_ratio,nonstop_adj_ratio,nounchunk_ratio,token_avg_sylla,lemma_avg_sylla,nonstop_avg_sylla,adj_avg_sylla,entity_avg_sylla
0,There is manuscript evidence that Austen conti...,1,"[There, is, manuscript, evidence, that, Austen...","[there, be, manuscript, evidence, that, Austen...","[manuscript, evidence, Austen, continued, work...","[manuscript, evidence, Austen, continue, work,...",35,3,5,18,...,0.157895,0.131579,0.157895,0.026316,0.236842,1.5,1.5,1.8,2.5,3.0
1,"In a remarkable comparative analysis , Mandaea...",1,"[In, a, remarkable, comparative, analysis, Man...","[in, a, remarkable, comparative, analysis, man...","[remarkable, comparative, analysis, Mandaean, ...","[remarkable, comparative, analysis, mandaean, ...",21,0,3,7,...,0.136364,0.227273,0.136364,0.227273,0.227273,2.090909,2.045455,2.533333,3.2,2.2
2,"Before Persephone was released to Hermes , who...",1,"[Before, Persephone, was, released, to, Hermes...","[before, Persephone, be, release, to, Hermes, ...","[Persephone, released, Hermes, sent, retrieve,...","[Persephone, release, Hermes, send, retrieve, ...",40,0,4,23,...,0.142857,0.119048,0.142857,0.0,0.309524,1.619048,1.547619,2.210526,,2.6


### Textstat features

The Textstat package lets us compute a range of text statistics and readability indexes very quickly.

In [10]:
# Text statistics computed by textstat on the raw sentence.
# Note: textstat's polysyllabcount counts words with 3 or more syllables and
# monosyllabcount counts words with exactly one syllable.
text_statistics = [
    ('char_per_word', textstat.avg_character_per_word),
    ('sylla_count', textstat.syllable_count),
    ('lexicon_count', textstat.lexicon_count),
    ('char_count', textstat.char_count),
    ('letter_count', textstat.letter_count),
    ('poly_sylla', textstat.polysyllabcount),
    ('mono_sylla', textstat.monosyllabcount),
]

# Readability indexes; 'text_standard' is a string such as "16th and 17th grade".
readability_indexes = [
    ('flesch_ease', textstat.flesch_reading_ease),
    ('smog_index', textstat.smog_index),
    ('flesch_grad', textstat.flesch_kincaid_grade),
    ('coleman_index', textstat.coleman_liau_index),
    ('automated_index', textstat.automated_readability_index),
    ('dalechall_score', textstat.dale_chall_readability_score),
    ('difficult_words', textstat.difficult_words),
    ('linsear_formula', textstat.linsear_write_formula),
    ('gunning_fog', textstat.gunning_fog),
    ('text_standard', textstat.text_standard),
]

for column, metric in text_statistics + readability_indexes:
    df1[column] = df1['original_text'].apply(metric)

In [11]:
# Turn the 'text_standard' string (e.g. "16th and 17th grade") into a numeric
# feature: the lower grade of the range.
#
# The first word is the grade followed by a two-letter ordinal suffix
# ("9th", "16th", "-1th"), so dropping the last two characters leaves the
# number whatever its width or sign.
#
# The local names deliberately avoid `num`: that is the name of the helper
# function defined earlier in the notebook, and rebinding it to an integer
# here makes the later `num(text)` call fail.

grade_values = []

for standard in df1.text_standard:
    lower_grade = standard.split(' ')[0]
    grade_values.append(int(lower_grade[:-2]))

df1['grade'] = pd.Series(grade_values)
df1.head(3)

Unnamed: 0,original_text,label,tokens,lemmas,non_stop_tokens,non_stop_lems,alpha_count,digit_count,punc_count,stopword_count,...,smog_index,flesch_grad,coleman_index,automated_index,dalechall_score,difficult_words,linsear_formula,gunning_fog,text_standard,grade
0,There is manuscript evidence that Austen conti...,1,"[There, is, manuscript, evidence, that, Austen...","[there, be, manuscript, evidence, that, Austen...","[manuscript, evidence, Austen, continued, work...","[manuscript, evidence, Austen, continue, work,...",35,3,5,18,...,0.0,16.9,8.77,19.0,12.59,7,26.5,20.46,16th and 17th grade,16
1,"In a remarkable comparative analysis , Mandaea...",1,"[In, a, remarkable, comparative, analysis, Man...","[in, a, remarkable, comparative, analysis, man...","[remarkable, comparative, analysis, Mandaean, ...","[remarkable, comparative, analysis, mandaean, ...",21,0,3,7,...,0.0,15.0,18.1,18.7,15.96,9,16.5,17.92,17th and 18th grade,17
2,"Before Persephone was released to Hermes , who...",1,"[Before, Persephone, was, released, to, Hermes...","[before, Persephone, be, release, to, Hermes, ...","[Persephone, released, Hermes, sent, retrieve,...","[Persephone, release, Hermes, send, retrieve, ...",40,0,4,23,...,0.0,18.5,10.52,22.3,11.73,9,27.0,20.61,11th and 12th grade,11


### Incorporate External Resource

Start from the dale_chall basic English words list.

In [12]:
# Load the Dale-Chall list of basic English words (lowercase, one per line).
# The file is read inside a `with` block so the handle is always closed.
with open('dale_chall.txt', 'r') as f:
    text = f.read()

# Strip every entry: some lines carry trailing whitespace (e.g. 'above '), and
# an unstripped entry can never equal a token. Blank lines are dropped.
basic = [word.strip() for word in text.split('\n') if word.strip()]
basic[0:5]

# a list of basic words in lowercase

['a', 'able', 'aboard', 'about', 'above ']

In [13]:
# Flag the tokens of each sentence that are not in the Dale-Chall basic word list.
# The list is turned into a set once so every membership test is O(1) instead of
# a scan of the whole list. Entries are stripped because the raw file contains
# words with trailing whitespace, which would otherwise never match a token.
basic_words = {word.strip() for word in basic}

lst1 = []
lst2 = []

for i in range(len(df1)):
    # tokens are compared in lowercase because the word list is lowercase
    non_basic = [w for w in df1.tokens[i] if w.lower() not in basic_words]
    lst1.append(len(non_basic))
    lst2.append(non_basic)

df1['non_basic_tokens'] = pd.Series(lst2)
df1['non_basic_count'] = pd.Series(lst1)
df1['non_basic_ratio'] = df1['non_basic_count'] / df1['token_count']

In [14]:
# average estimated syllables of the non-basic tokens
non_basic_lists = df1['non_basic_tokens']
df1['nonbasic_avg_sylla'] = non_basic_lists.apply(avg_sylla)

# how many non-basic tokens have 5 or more syllables, and their share of all tokens
df1['5sylla_nonbasic'] = non_basic_lists.apply(lambda toks: count_sylla_over(toks, 5))
df1['nonbasic5_ratio'] = df1['5sylla_nonbasic'] / df1['token_count']

# average syllables of non-basic / non-stop words relative to the all-token average
for ratio_col, avg_col in [
    ('sylla_ratio1', 'nonbasic_avg_sylla'),
    ('sylla_ratio2', 'nonstop_avg_sylla'),
]:
    df1[ratio_col] = df1[avg_col] / df1['token_avg_sylla']

# tokens that ARE in the basic word list
df1['basic_count'] = df1['token_count'] - df1['non_basic_count']
df1['basic_ratio'] = df1['basic_count'] / df1['token_count']

# textstat's difficult-word count relative to all tokens and to the basic tokens
for ratio_col, denom_col in [
    ('difficult_ratio', 'token_count'),
    ('diff-basic_ratio', 'basic_count'),
]:
    df1[ratio_col] = df1['difficult_words'] / df1[denom_col]

Concreteness Ratings:

In [15]:
# unique lemmas of each sentence, used for the concreteness lookups
df1['lemma_set'] = df1['lemmas'].apply(set)

# concreteness ratings table; the lookup columns are kept as parallel lists
dff = pd.read_csv('Concreteness_ratings.csv')

word_lst, conc_lst, known_lst, sub_lst = (
    list(dff[column].values)
    for column in ('Word', 'Conc.M', 'Percent_known', 'SUBTLEX')
)

dff.head(2)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0


In [16]:
# Look up every unique lemma of each sentence in the concreteness table and
# collect its concreteness rating, known-percentage and SUBTLEX value.
#
# A word -> position dict is built once, keeping the first occurrence of each
# word (the same position list.index would return), so each lookup is a hash
# probe instead of two linear scans of the whole word list per lemma.
word_position = {}
for position, word in enumerate(word_lst):
    word_position.setdefault(word, position)

lst = []
lst1 = []
lst2 = []
lst3 = []

for i in range(len(df1)):
    basic_lemma = []
    ratings = []
    knowns = []
    subs = []

    for lemma in df1.lemma_set[i]:
        ind = word_position.get(lemma)
        if ind is not None:
            basic_lemma.append(lemma)
            ratings.append(conc_lst[ind])
            knowns.append(known_lst[ind])
            subs.append(sub_lst[ind])

    lst.append(basic_lemma)
    lst1.append(ratings)
    lst2.append(knowns)
    lst3.append(subs)

df1['basic_lemmas'] = pd.Series(lst)               # list of unique lemmas that are listed in the concreteness file
df1['lemma_rating'] = pd.Series(lst1)              # list of concreteness rating values of the above basic lemmas
df1['known_pct'] = pd.Series(lst2)                 # list of known-percentage values of the above basic lemmas
df1['lemma_subtlex'] = pd.Series(lst3)             # list of subtlex values of the above basic lemmas

In [17]:
# number of unique lemmas found in the concreteness file
rated_lemma_counts = df1['basic_lemmas'].apply(len)
df1['basic_lemma_count'] = rated_lemma_counts

# their share of the unique lemmas and of all tokens
df1['basic_lemma_ratio'] = rated_lemma_counts / df1['unique_lemma_count']
df1['basic_lemma_ratio1'] = rated_lemma_counts / df1['token_count']

# the complementary count: unique lemmas NOT found in the concreteness file
unrated_lemma_counts = df1['unique_lemma_count'] - rated_lemma_counts
df1['nonbasic_lemma_count'] = unrated_lemma_counts
df1['nonbasic_lemma_ratio'] = unrated_lemma_counts / df1['unique_lemma_count']
df1['nonbasic_lemma_ratio1'] = unrated_lemma_counts / df1['token_count']

df1.head(3)

Unnamed: 0,original_text,label,tokens,lemmas,non_stop_tokens,non_stop_lems,alpha_count,digit_count,punc_count,stopword_count,...,basic_lemmas,lemma_rating,known_pct,lemma_subtlex,basic_lemma_count,basic_lemma_ratio,basic_lemma_ratio1,nonbasic_lemma_count,nonbasic_lemma_ratio,nonbasic_lemma_ratio1
0,There is manuscript evidence that Austen conti...,1,"[There, is, manuscript, evidence, that, Austen...","[there, be, manuscript, evidence, that, Austen...","[manuscript, evidence, Austen, continued, work...","[manuscript, evidence, Austen, continue, work,...",35,3,5,18,...,"[continue, the, addition, to, work, make, piec...","[2.36, 1.43, 2.89, 1.55, 3.48, 2.67, 4.14, 2.6...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.97, 1.0, 1.0,...","[2527, 1501908, 395, 1156570, 40699, 70775, 63...",22,0.733333,0.578947,8,0.266667,0.210526
1,"In a remarkable comparative analysis , Mandaea...",1,"[In, a, remarkable, comparative, analysis, Man...","[in, a, remarkable, comparative, analysis, man...","[remarkable, comparative, analysis, Mandaean, ...","[remarkable, comparative, analysis, mandaean, ...",21,0,3,7,...,"[related, to, of, comparative, remarkable, ana...","[2.56, 1.55, 1.67, 1.74, 1.89, 2.56, 1.46, 4.9...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.93, 1.0, 1.0,...","[643, 1156570, 590439, 41, 641, 563, 1041179, ...",14,0.666667,0.636364,7,0.333333,0.318182
2,"Before Persephone was released to Hermes , who...",1,"[Before, Persephone, was, released, to, Hermes...","[before, Persephone, be, release, to, Hermes, ...","[Persephone, released, Hermes, sent, retrieve,...","[Persephone, release, Hermes, send, retrieve, ...",40,0,4,23,...,"[underworld, who, the, six, accord, telling, t...","[2.96, 1.74, 1.43, 3.43, 1.57, 2.45, 1.55, 3.3...","[1.0, 1.0, 1.0, 1.0, 0.88, 1.0, 1.0, 0.97, 1.0...","[181, 113370, 1501908, 10176, 83, 9730, 115657...",29,0.852941,0.690476,5,0.147059,0.119048


Age of Acquisition :

In [18]:
# Age-of-acquisition table; the lookup columns are kept as parallel lists.
ddf = pd.read_csv('AoA_words.csv',encoding= 'unicode_escape')

word_lst1, aoa_lst, known_lst1 = (
    list(ddf[column].values)
    for column in ('Word', 'AoA_Kup_lem', 'Perc_known_lem')
)

ddf.head(2)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
0,a,a,20415.27,Article,1,1,1,a,2.89,1.0,2.89,1.0,3.16,,,
1,aardvark,aardvark,0.41,Noun,8,7,2,aardvark,9.89,1.0,9.89,1.0,,,,


In [19]:
# Look up every token of each sentence (lowercased) in the age-of-acquisition
# table and collect its AoA score and known-percentage.
#
# A word -> position dict is built once, keeping the first occurrence of each
# word (the same position list.index would return), so each lookup is a hash
# probe instead of two linear scans of the whole word list per token.
aoa_position = {}
for position, word in enumerate(word_lst1):
    aoa_position.setdefault(word, position)

lst = []
lst1 = []
lst2 = []

for i in range(len(df1)):
    aoa_words = []
    ratings = []
    knowns = []

    for w in df1.tokens[i]:
        ind = aoa_position.get(w.lower())
        if ind is not None:
            aoa_words.append(w)            # the token is stored in its original case
            ratings.append(aoa_lst[ind])
            knowns.append(known_lst1[ind])

    lst.append(aoa_words)
    lst1.append(ratings)
    lst2.append(knowns)

df1['word_in_AOA'] = pd.Series(lst)                       # words listed in the age of acquisition file
df1['word_aoa_scores'] = pd.Series(lst1)                  # list of AoA scores of the above words
df1['aoa_known_pct'] = pd.Series(lst2)                    # list of known-percentage values of the above words

df1['aoa_count'] = df1['word_in_AOA'].apply(len)
df1['aoa_ratio'] = df1['aoa_count'] / df1['token_count']

### Calculate some summary statistics from the information we extracted above

A lot of the columns are just lists of values for now. 

In [20]:
# Per sentence: (mean, std, max - min) of the lemma concreteness ratings;
# a sentence with no rated lemma gets zeros.
conc_stats = [
    (np.mean(ratings), np.std(ratings), max(ratings) - min(ratings))
    if len(ratings) > 0 else (0, 0, 0)
    for ratings in df1['lemma_rating']
]

df1['avg_lemma_conc'] = pd.Series([stats[0] for stats in conc_stats])    # the average concreteness rating for the basic lemmas
df1['lemma_conc_std'] = pd.Series([stats[1] for stats in conc_stats])    # the std of the basic lemmas concreteness ratings
df1['conc_max-min'] = pd.Series([stats[2] for stats in conc_stats])      # the max-min difference for lemma concreteness ratings

In [21]:
# Per sentence: (mean, std, min) of the lemma known-percentage values;
# a sentence with no rated lemma gets zeros.
known_stats = [
    (np.mean(values), np.std(values), min(values))
    if len(values) > 0 else (0, 0, 0)
    for values in df1['known_pct']
]

df1['avg_lemma_known'] = pd.Series([stats[0] for stats in known_stats])   # the average known percentage for the basic lemmas
df1['lemma_known_std'] = pd.Series([stats[1] for stats in known_stats])   # the std of the basic lemmas known percentage values
df1['lemma_min_known'] = pd.Series([stats[2] for stats in known_stats])   # the minimum value of the lemma known percentages

In [22]:
# Per sentence: (max, min) of the lemma SUBTLEX values;
# a sentence with no rated lemma gets zeros.
subtlex_extremes = [
    (max(values), min(values)) if len(values) > 0 else (0, 0)
    for values in df1['lemma_subtlex']
]

df1['lemma_subtlex_max'] = pd.Series([pair[0] for pair in subtlex_extremes])   # the max subtlex value for the basic lemmas
df1['lemma_subtlex_min'] = pd.Series([pair[1] for pair in subtlex_extremes])   # the min subtlex value for the basic lemmas

In [23]:
# Per sentence: (mean, std, max) of the words' age-of-acquisition scores;
# a sentence with no word in the AoA file gets zeros.
aoa_stats = [
    (np.mean(scores), np.std(scores), max(scores))
    if len(scores) > 0 else (0, 0, 0)
    for scores in df1['word_aoa_scores']
]

df1['avg_word_aoa'] = pd.Series([stats[0] for stats in aoa_stats])    # the average age of acquisition score of the words
df1['word_aoa_std'] = pd.Series([stats[1] for stats in aoa_stats])    # the std of the words' AoA scores
df1['word_aoa_max'] = pd.Series([stats[2] for stats in aoa_stats])    # the maximum word AoA score in the sentence

In [24]:
# Per sentence: (mean, std, min) of the AoA known-percentage values;
# a sentence with no word in the AoA file gets zeros.
aoa_known_stats = [
    (np.mean(values), np.std(values), min(values))
    if len(values) > 0 else (0, 0, 0)
    for values in df1['aoa_known_pct']
]

df1['aoa_known_avg'] = pd.Series([stats[0] for stats in aoa_known_stats])   # the words' average known percentage
df1['aoa_known_std'] = pd.Series([stats[1] for stats in aoa_known_stats])   # std of the words' known percentage values
df1['aoa_known_min'] = pd.Series([stats[2] for stats in aoa_known_stats])   # the minimum of words' known percentage

In [26]:
# Preview the first three rows of df1 to check what the data looks like now.

df1.head(3)

Unnamed: 0,original_text,label,tokens,lemmas,non_stop_tokens,non_stop_lems,alpha_count,digit_count,punc_count,stopword_count,...,lemma_known_std,lemma_min_known,lemma_subtlex_max,lemma_subtlex_min,avg_word_aoa,word_aoa_std,word_aoa_max,aoa_known_avg,aoa_known_std,aoa_known_min
0,There is manuscript evidence that Austen conti...,1,"[There, is, manuscript, evidence, that, Austen...","[there, be, manuscript, evidence, that, Austen...","[manuscript, evidence, Austen, continued, work...","[manuscript, evidence, Austen, continue, work,...",35,3,5,18,...,0.024427,0.89,1501908,137,5.923793,1.743145,12.12,0.992759,0.018175,0.94
1,"In a remarkable comparative analysis , Mandaea...",1,"[In, a, remarkable, comparative, analysis, Man...","[in, a, remarkable, comparative, analysis, man...","[remarkable, comparative, analysis, Mandaean, ...","[remarkable, comparative, analysis, mandaean, ...",21,0,3,7,...,0.020064,0.93,1156570,41,7.499286,3.002557,11.94,1.0,0.0,1.0
2,"Before Persephone was released to Hermes , who...",1,"[Before, Persephone, was, released, to, Hermes...","[before, Persephone, be, release, to, Hermes, ...","[Persephone, released, Hermes, sent, retrieve,...","[Persephone, release, Hermes, send, retrieve, ...",40,0,4,23,...,0.028515,0.88,1501908,13,5.322162,1.80071,11.17,0.995946,0.013648,0.95


### Other self-defined features 

These features were devised a bit later in the project and then added to the training data.

In [27]:
# Character-level composition of the raw sentence: alphabetic characters,
# numeric characters, and everything else.
lst1 = []
lst2 = []

for text in (df1['original_text'][row] for row in range(len(df1))):
    lst1.append(alpha(text))
    lst2.append(num(text))

df1['alpha_length'] = pd.Series(lst1)
df1['num_length'] = pd.Series(lst2)
df1['other_length'] = df1['raw_length'] - df1['alpha_length'] - df1['num_length']

# NOTE(review): 'alpha_ratio' already exists at this point as a token-level ratio
# (alpha_count / token_count); the assignment below replaces it with the
# character-level ratio. Confirm that the overwrite is intended.
for ratio_col, length_col in [
    ('alpha_ratio', 'alpha_length'),
    ('num_ratio', 'num_length'),
    ('other_ratio', 'other_length'),
]:
    df1[ratio_col] = df1[length_col] / df1['raw_length']

In [28]:
# Tokens with 5 or more estimated syllables: counted over all tokens and over
# the non-stopword tokens, each also expressed as a share of all tokens.
for list_col, count_col, ratio_col in [
    ('tokens', '5_sylla_token', '5sylla_ratio'),
    ('non_stop_tokens', '5_sylla_nonstop', '5sylla_nonstop_ratio'),
]:
    df1[count_col] = df1[list_col].apply(lambda toks: count_sylla_over(toks, 5))
    df1[ratio_col] = df1[count_col] / df1['token_count']

In [29]:
# textstat's polysyllabic (3+ syllables) and monosyllabic (1 syllable) word
# counts, relative to the spaCy token count
for ratio_col, count_col in [('poly_ratio', 'poly_sylla'), ('mono_ratio', 'mono_sylla')]:
    df1[ratio_col] = df1[count_col] / df1['token_count']

# total estimated syllables of: non-stopwords / PROPN tokens ('pronouns' column) /
# non-stop nouns / non-stop adjectives / entities / noun chunks / non-basic words
syllable_specs = [
    ('non_stop_tokens', 'nonstop_syllables', 'nonstop_sylla_ratio'),
    ('pronouns', 'pron_syllables', 'pron_sylla_ratio'),
    ('non_stop_nouns', 'nonstop_n_syllables', 'nonstopN_sylla_ratio'),
    ('non_stop_adj', 'nonstop_adj_syllables', 'nonstopadj_sylla_ratio'),
    ('entities', 'entity_syllables', 'entity_sylla_ratio'),
    ('noun_chunks', 'nounchunk_syllables', 'nounchunk_sylla_ratio'),
    ('non_basic_tokens', 'nonbasic_syllables', 'nonbasic_sylla_ratio'),
]

# all totals first, then all ratios, so the column order stays the same
for list_col, total_col, ratio_col in syllable_specs:
    df1[total_col] = df1[list_col].apply(total_sylla)

# each total as a share of textstat's syllable count for the whole sentence
for list_col, total_col, ratio_col in syllable_specs:
    df1[ratio_col] = df1[total_col] / df1['sylla_count']

### Now we're done creating features, save the values as new training data.

In [30]:
# Columns that are not numeric features: the raw text, the label, and the
# intermediate list/string columns the features were derived from.
non_feature_columns = [
    'original_text', 'label', 'tokens', 'lemmas', 'non_stop_tokens',
    'non_stop_lems', 'nouns', 'pronouns', 'non_stop_nouns', 'adjs', 'non_stop_adj',
    'entities', 'noun_chunks', 'text_standard', 'non_basic_tokens', 'lemma_set',
    'basic_lemmas', 'lemma_rating', 'known_pct', 'lemma_subtlex', 'word_in_AOA',
    'word_aoa_scores', 'aoa_known_pct',
]

# Ratios with a zero denominator produce inf/NaN; both are written out as 0.
# NOTE(review): the 'Unnamed: 0' column read from the CSV is not in the drop
# list, so it is saved together with the features; confirm that is intended.
df2 = (
    df1.drop(columns=non_feature_columns)
       .replace([np.inf, -np.inf], np.nan)
       .fillna(0)
)

# all the features (self-defined + textstat), 116 features
df2.to_csv('train_data.csv',index=False)

In [31]:
# columns produced by textstat (statistics, readability indexes, parsed grade)
textstat_cols = [
    'char_per_word', 'sylla_count', 'lexicon_count', 'char_count', 'letter_count',
    'poly_sylla', 'mono_sylla', 'flesch_ease', 'smog_index', 'flesch_grad',
    'coleman_index', 'automated_index', 'dalechall_score', 'difficult_words',
    'linsear_formula', 'gunning_fog', 'grade',
]

# the self-defined features only: everything except the textstat columns
df3 = df2.drop(columns=textstat_cols)
df3.to_csv('train_self_feature.csv',index=False)

In [32]:
# columns produced by textstat (statistics, readability indexes, parsed grade)
textstat_cols = [
    'char_per_word', 'sylla_count', 'lexicon_count', 'char_count', 'letter_count',
    'poly_sylla', 'mono_sylla', 'flesch_ease', 'smog_index', 'flesch_grad',
    'coleman_index', 'automated_index', 'dalechall_score', 'difficult_words',
    'linsear_formula', 'gunning_fog', 'grade',
]

# the textstat features only, in the order listed above
df4 = df2.loc[:, textstat_cols]
df4.to_csv('textstat_value.csv',index=False)