### Preprocessing ASAP-AES

1. Preprocessing
2. Calculating NLP features from Uto et al. (2020)

In [2]:
import numpy as np
import pandas as pd

In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")

set3 = pd.read_csv('set3_features.csv')
# set3['lemmatized'] = set3['essay'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

In [2]:
data = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [3]:
set3 = data[data['essay_set'] == 3]
set4 = data[data['essay_set'] == 4]
set5 = data[data['essay_set'] == 5]
set6 = data[data['essay_set'] == 6]

In [4]:
import re
import string
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy
from spellchecker import SpellChecker
import textstat

nlp = spacy.load('en_core_web_sm')

spell = SpellChecker()
english = English()
tokenizer = Tokenizer(english.vocab)

def count_commas(text):
  count = 0  
  for i in range (0, len(text)):   
    if text[i] == ',':  
        count = count + 1
  return count

def count_exclamation_marks(text):
  count = 0 
  for i in range (0, len(text)):   
    if text[i] == '!':  
        count = count + 1
  return count

def count_question_marks(text):
    count = 0
    for i in range (0, len(text)):
        if text[i] == '?':  
            count = count + 1
    return count

def lemmatize(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

def average_word_length(text):
    sentences = [sent.text for sent in nlp(text).sents]
    length_words = 0
    total_words = 0
    for sentence in sentences:
        words = tokenizer(sentence)
        for word in words:
            length_words += len(word)
            total_words += 1
    return length_words / total_words

def average_sentence_length(text):
    sentences = [sent.text for sent in nlp(text).sents]
    length_sentences = 0
    total_sentences = 0
    for sentence in sentences:
        length_sentences += len(tokenizer(sentence))
        total_sentences += 1
    return length_sentences / total_sentences

def number_of_nouns(text):
    # find number of nouns in text
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    return pos.count('NOUN') + pos.count('PROPN')

def number_of_verbs(text):
    # find number of verbs in text
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    return pos.count('VERB')

def number_of_adverbs(text):
    # find number of adverbs in text
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    return pos.count('ADV')

def number_of_adjectives(text):
    # find number of adjectives in text
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    return pos.count('ADJ')

def number_of_conjunctions(text):
    # find number of conjunctions in text
    doc = nlp(text)
    pos = [token.pos_ for token in doc]
    return pos.count('CCONJ')

def number_of_spelling_errors(text):
    misspelled = spell.unknown([token.text for token in tokenizer(text)])
    return len(misspelled)

def num_stopwords(text):
    # find number of stopwords in text
    doc = nlp(text)
    stop_words = [token.text for token in doc if token.is_stop]
    return len(stop_words)

### Features

<img src="./features.jpeg" width="600">

In [73]:
def generate_features(frame):
    data = frame.copy()
    # length-based features
    print('Calculating number of words...')
    data['num_words'] = data['essay'].apply(lambda x: len(x.split()))
    print('Calculating number of sentences...')
    data['num_sentences'] = data['essay'].apply(lambda x: len(list(nlp(x).sents)))
    print('Calculating number of lemmas...')
    data['num_lemmas'] = data['essay'].apply(lambda x: len(lemmatize(x)))
    print('Calculating number of commas...')
    data['num_commas'] = data['essay'].apply(lambda x: count_commas(x))
    print('Calculating number of exclamation marks...')
    data['num_exclamation_marks'] = data['essay'].apply(lambda x: count_exclamation_marks(x))
    print('Calculating number of question marks...')
    data['num_question_marks'] = data['essay'].apply(lambda x: count_question_marks(x))
    print('Calculating average word length...')
    data['average_word_length'] = data['essay'].apply(lambda x: average_word_length(x))
    print('Calculating average sentence length...')
    data['average_sentence_length'] = data['essay'].apply(lambda x: average_sentence_length(x))

    # synctatic features
    print('Calculating number of nouns...')
    data['num_nouns'] = data['essay'].apply(lambda x: number_of_nouns(x))
    print('Calculating number of verbs...')
    data['num_verbs'] = data['essay'].apply(lambda x: number_of_verbs(x))
    print('Calculating number of adjectives...')
    data['num_adjectives'] = data['essay'].apply(lambda x: number_of_adjectives(x))
    print('Calculating number of adverbs...')
    data['num_adverbs'] = data['essay'].apply(lambda x: number_of_adverbs(x))
    print('Calculating number of conjunctions...')
    data['num_conjunctions'] = data['essay'].apply(lambda x: number_of_conjunctions(x))

    # word-based features
    print('Calculating number of spelling errors...')
    data['num_spelling_errors'] = data['essay'].apply(lambda x: number_of_spelling_errors(x))
    print('Calculating number of stopwords...')
    data['num_stopwords'] = data['essay'].apply(lambda x: num_stopwords(x))

    # readability features
    print('Calculating readability features...')
    data['automated_readability_index'] = data['essay'].apply(lambda x: textstat.automated_readability_index(x))
    data['coleman_liau_index'] = data['essay'].apply(lambda x: textstat.coleman_liau_index(x))
    data['dale_chall_index'] = data['essay'].apply(lambda x: textstat.dale_chall_readability_score(x))
    data['difficult_word_count'] = data['essay'].apply(lambda x: textstat.difficult_words(x))
    data['flesch_kincaid_grade'] = data['essay'].apply(lambda x: textstat.flesch_kincaid_grade(x))
    data['gunning_fog'] = data['essay'].apply(lambda x: textstat.gunning_fog(x))
    data['linsear_write_formula'] = data['essay'].apply(lambda x: textstat.linsear_write_formula(x))
    data['smog_index'] = data['essay'].apply(lambda x: textstat.smog_index(x))
    data['syllables_count'] = data['essay'].apply(lambda x: textstat.syllable_count(x))
    
    print('done')
    return data

### Split into sets

In [74]:
def split_in_sets(data):
    essay_sets = []
    min_scores = []
    max_scores = []
    for s in range(1,9):
        essay_set = data[data["essay_set"] == s]
        essay_set.dropna(axis=1, inplace=True)
        n, d = essay_set.shape
        set_scores = essay_set["domain1_score"]
        print ("Set", s, ": Essays = ", n , "\t Attributes = ", d)
        min_scores.append(set_scores.min())
        max_scores.append(set_scores.max())
        essay_sets.append(essay_set)
    return (essay_sets, min_scores, max_scores)

In [75]:
essay_sets, data_min_scores, data_max_scores = split_in_sets(data)
set1, set2, set3, set4, set5, set6, set7, set8 = tuple(essay_sets)

Set 1 : Essays =  1783 	 Attributes =  7
Set 2 : Essays =  1800 	 Attributes =  10
Set 3 : Essays =  1726 	 Attributes =  7
Set 4 : Essays =  1770 	 Attributes =  7
Set 5 : Essays =  1805 	 Attributes =  7
Set 6 : Essays =  1800 	 Attributes =  7
Set 7 : Essays =  1569 	 Attributes =  15
Set 8 : Essays =  723 	 Attributes =  19


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [76]:
set3 = generate_features(set3)
set4 = generate_features(set4)
set5 = generate_features(set5)
set6 = generate_features(set6)

Calculating number of words...
Calculating number of sentences...
Calculating number of lemmas...
Calculating number of commas...
Calculating number of exclamation marks...
Calculating number of question marks...
Calculating average word length...
Calculating average sentence length...
Calculating number of nouns...
Calculating number of verbs...
Calculating number of adjectives...
Calculating number of adverbs...
Calculating number of conjunctions...
Calculating number of spelling errors...
Calculating number of stopwords...
Calculating readability features...
done
Calculating number of words...
Calculating number of sentences...
Calculating number of lemmas...
Calculating number of commas...
Calculating number of exclamation marks...
Calculating number of question marks...
Calculating average word length...
Calculating average sentence length...
Calculating number of nouns...
Calculating number of verbs...
Calculating number of adjectives...
Calculating number of adverbs...
Calculati

In [77]:
set3.to_csv('set3_features.csv', index=False)
set4.to_csv('set4_features.csv', index=False)
set5.to_csv('set5_features.csv', index=False)
set6.to_csv('set6_features.csv', index=False)

### Keyword Selection

In [4]:
def get_quotes(text):
    quotes = re.findall(r'"([^"]*)"', text)
    return quotes

In [105]:
set3_keywords = set()
for i in set3['essay']:
    unicode_converted = i.replace("\x93", '"').replace("\x94", '"')
    quotes = get_quotes(unicode_converted)
    if len(quotes) > 0:
        print(quotes, "\n")
        set3_keywords.update(quotes)

['I was traveling through the high deserts of California in June.', 'brackish water faling somewhere in the neighborhood of two hundred degrees', 'The sun was beginning to beat down', ' and the growing realization that I could drop from heatstroke on a gorgous day in June', 'About forty miles into the pedal, I arrived at the first \x91town\x92 but on that morning it fit the traditional definition of a ghost town.'] 

['town', 'One ramshackle sheds several rusty pumps and a coral that couldnt hold the lamesh mule.', 'In the story the speeker says ', ' that are very hot and dry, and he says '] 

['Rough Road Ahead: Do Not Exceed Posted Speed Limit', 'The water bottles contained only a few tantalizing sips.'] 

['Rough Road Ahead: Do Not Exceed Posted Speed Limit', 'the sun was beginning to beat down.', 'The water bottle contained only dehydration which can cause death. Joe Kurmaskie\x92s story, '] 

[' shortcut', ' Flat road was repalced by short rolling hills.', ' I wiped the sweat from

In [107]:
len(set3_keywords)

1512

In [1]:
set3quotes = ['traveling through the high deserts of California in June', 'brackish water faling somewhere in the neighborhood of two hundred degrees', 'sun was beginning to beat down', 'growing realization that I could drop from heatstroke on a gorgous day in June', 'fit the traditional definition of a ghost town', 'trying to keep my balance in my dehydrated state', 'flat road was replaced by short rolling hills', 'water bottle contained only a few tantalizing sips', 'tarlike substance followed by brackish water', 'no one in sight, not a building, car, or structure of any kind', 'wide rings of dried sweat circled my shirt']

set3essayquotes = ['enjoyed the serenity of an early-summer evening', 'thriving little spot at one time', 'hitting my water bottles pretty regularly', 'high deserts of California', 'somewhere in the neighborhood of two hundred degrees', 'flat road was replaced by short rolling hills', 'ROUGH ROAD AHEAD: DO NOT EXCEED POSTED SPEED LIMIT', 'water bottles contained only a few tantalizing sips', 'Wide rings of dried sweat circled my shirt', 'drop from heatstroke on a gorgeous day', 'no one in sight, not a building, car, or structure of any kind', 'long, crippling hill', 'checked my water supply', 'birds would pick me clean']

set3_keywords = set(set3quotes + set3essayquotes)

In [5]:
list(set3_keywords)

['trying to keep my balance in my dehydrated state',
 'no one in sight, not a building, car, or structure of any kind',
 'flat road was replaced by short rolling hills',
 'hitting my water bottles pretty regularly',
 'somewhere in the neighborhood of two hundred degrees',
 'sun was beginning to beat down',
 'drop from heatstroke on a gorgeous day',
 'water bottles contained only a few tantalizing sips',
 'high deserts of California',
 'enjoyed the serenity of an early-summer evening',
 'traveling through the high deserts of California in June',
 'ROUGH ROAD AHEAD: DO NOT EXCEED POSTED SPEED LIMIT',
 'long, crippling hill',
 'tarlike substance followed by brackish water',
 'thriving little spot at one time',
 'fit the traditional definition of a ghost town',
 'Wide rings of dried sweat circled my shirt',
 'growing realization that I could drop from heatstroke on a gorgous day in June',
 'water bottle contained only a few tantalizing sips',
 'wide rings of dried sweat circled my shirt',


In [5]:
set4 = pd.read_csv('set4_features.csv')

In [6]:
set4_keywords = set()

for i in set4['essay']:
    unicode_converted = i.replace("\x93", '"').replace("\x94", '"').replace("\x85",  '')
    quotes = get_quotes(unicode_converted)
    if len(quotes) > 0:
        print(quotes, "\n")
        set4_keywords.update(quotes)

set4_keywords

['Winter Hibiscus'] 

['when they come back, saeng vowed silently to herself, in the spring, when the snows melt and the geese return and this hibiscus is budding, then I will take that test again.'] 

['Winter Hibiscus,', 'many of the things that she had thought of as strange before had become, almost familiar to her now.'] 

['Almost reluctantly, she realized that many of the things that she had thought of as strange before had become, through the quiet repetition of season upon season, almost familiar to her now.', 'familiar'] 

['Not like the kind we had before.'] 

['I-I failed the test.', 'take that test again.', ' And that rich sweet scent- that was familiar, too, ', 'take the test again.', 'the quiet repetition of season opon season, almost familiar to her now. Like the geese.', ' when the snow melts and the geese return.'] 

[' Winter Hibiscus,', ' When they come back, saeng vowed silently to herself, in the spring when the snow melt and the geese return and this hibiscus is b

{'',
 ' ',
 '  Winter Hibiscus ',
 '  because  the  author  started  out  the  story  after  Saeng  failed  her  driving  test  .  Saeng  is  reminded  from  the  Hibiscus  about  her  homeland.  After  Saeng  wanted  the  Hibiscus  in  the  garden  ,',
 ' (Paragraph Minfong Ho), @CAPS1 examples ',
 ' ... I will take the test again.',
 ' ...and this hibiscus is budding, then I will take that test again.',
 ' @CAPS1  of loss so deep ',
 ' @CAPS1 facing. Na where did you gott that',
 ' @CAPS1 then she will become more familiar. For example, the author says, ',
 ' A wave of loss so deep and strong that it stung Saeng\x92s eye now swept over her.',
 ' A work so deep and strong that it stung Saeng\x92s eyes now swept over her.',
 ' Almost reluctantly, she realized that many of the things that she had thought of as strange before had become, through the quite repetition of season upon season, almost familiar to her now.',
 ' And that rich sweet scent- that was familiar, too, ',
 ' And the re

In [30]:
set4essay_quotes = ['will take that test again', 'many of the things that she had thought of as strange', 'not like the kind we had before', 'I failed the test', 'rich sweet scent', 'when the snow melts', 'geese return', 'hibiscus is budding', 'gentle grandmother', 'distinctive V was etched against the evening sky', 'familar fragrance filled her lungs', 'could almost feel light strands of her grandmother long gray hair', 'attitude towards her new country and her driving test', 'hibiscus plant in the winter is not as beautiful in the bitter cold', 'adapts and survives', 'returns to its beautiful state in the spring', 'bitter about her new country and driving test', 'new start or new opportunity', 'memories of home', 'overcoming her obstacles', 'noticed tear stains on her a daughter cheeks and her puffy eyes', 'symbolize change and adoption', 'make it through the winter into the spring', 'life is blooming', 'she was still too shaky to say the words at home', 'bitter melon']

In [29]:
set4[set4['domain1_score'] == 3]['essay'].sample(1).iloc[0]

'The author concludes the story with this last paragraph because it shows by spring she will be ready to retake her test. It shows that like the geese who care and come back, she will have time to study and pass her test. In the story the author includes this last paragraph to truly show her dedication to eventually grow to be comfortable in her new home and to pass her drivers test all in good time. Saeng wants to be comfortable with her new life but isn\x92t yet because of what one misses. But she knows that after this she can go back and try again just like the geese do often our winter. This was a very effective way to end the story because it really communicated Saeng\x92s message of dedication and longing to overcome her obstacles, her new home and driving.  '

In [23]:
len(set4essay_quotes)

21

In [31]:
set5 = pd.read_csv('set5_features.csv')

In [32]:
set5_keywords = set()

for i in set5['essay']:
    unicode_converted = i.replace("\x93", '"').replace("\x94", '"').replace("\x85",  '')
    quotes = get_quotes(unicode_converted)
    if len(quotes) > 0:
        print(quotes, "\n")
        set5_keywords.update(quotes)

set5_keywords

[' family'] 

['despite customs elsewhere, all of these cultures came together in solidarity and friendship', 'as many immagrants do, to give their children a better life', 'Mother and father had come to this country with such courage, without any knowledge of the language or the culture'] 

['family'] 

['I will never forget how my parents turned this simple house into a home', 'I will be grateful to my parents for their love and sacrifice', 'My mother and father came to this country with such courage, without any knowledge of the language or the culture'] 

['Home: The Blueprints of Our Lives', 'Here, the innocence of childhood, the congregation of family and friends, and endless celebrations that encompassed both, formed the backdrop to life in our warm houses.', "It was here where I learned the real definition of 'family'... I will never forget how my parents turned the simple house into a home.", 'Home: The Blueprints of Our Lives'] 

['I will never forget how my parents turned th

{'',
 'your home',
 'my young parents created our traditional Cuban home... Passionate Cuban music (which I adore to this day) filled the air mixing with the aromas of the kitchen.',
 'a time when overt racism was the norm and segregation prevailed in the United States. In our neighborhood, despite customs elsewhere, all of these cultures came together in great solidarity and friendship. It was a close \x96 knit community of honest, hardworking immigrants who extended a hand to people who, while not necessarily their own kind, were clearly in need',
 'But, in reality, there is no way to express my gratitude for the spirit of generosity impressed upon me at such an early age and demonstration of how important family and friends are.',
 'the endless celebrations',
 'in reality, there is no way to express my gratitude for the spirit of generousity impressed upon me at such an early age and the demonstration of how important family and friends are.',
 'Home: The Blueprints of Our Lives',
 

In [33]:
set5essay_quotes = ['always be grateful to my parents for their love and sacrifice', 'rich culinary skills', 'love of cooking', 'passionate Cuban music', 'aromas of the kitchen', 'innocence of childhood', 'congregation of family and friends', 'endless celebrations', 'our warm home', 'came together in great solidarity and friendship', 'close-knit community of honest, hardworking immigrants', 'kept their arms and door open to the many people we considered family', 'came selflessly', 'struggled both personally and financially', 'facing cultural hardships', 'overt racism was the norm', 'drove them to endure these hard times', 'their strength and perseverance', 'love and sacrifice', 'spirit of generosity impressed upon me at such an early age', 'demonstration of how important family and friends are', 'teachings have been basis of my life', 'warmth of the kitchen', 'humble house', 'not just scent and music but life and love', 'definition of family', 'never forget how my parents turned this simple house into a home']

In [36]:
set6 = pd.read_csv('set6_features.csv')
set6['num_words'].mean()

153.29833333333335

In [43]:
set6_keywords = set()

for i in set6[set6['domain1_score'] == 4]['essay']:
    unicode_converted = i.replace("\x93", '"').replace("\x94", '"').replace("\x85",  '')
    quotes = get_quotes(unicode_converted)
    if len(quotes) > 0:
        print(quotes, "\n")
        set6_keywords.update(quotes)

set6_keywords

['When the German dirigible Hindenburg was destroyed by fire in Lakehurst, New Jersey, on May 6, 1937, the owners of the Empire State Building realized how much worse that accident could have been if it had taken place over a densely populated area such as downtown New York.', 'Even if the dirigible were tethered to the mooring mast, the back of the ship which swivel around and around the mooring mast.', 'This law would make it illegal for a ship to ever tie up to a building or even approach the area.', 'By the late 1930s, the idea of using the mooring mast for dirigibles and their passengers had quietly disappeared.'] 

["The steel frame of the Empire State Building would have to be modified and strengthened to accommodate this new situation. Over @MONEY1 worth of modifications had to be made to the building's framework.", 'dangling high above pedestrians on the street'] 

["The stress of the dirigible's load and wind pressure would have to be transmitted all the way to the building's

{' (@CAPS2 @NUM2).',
 ' (paragraph @NUM1). Another obstacle was the existing law against low-flying airships in urban areas. This law makes is so that it is ',
 ' (paragraph @NUM2). Another obstacle was that most dirigibles outside of the @LOCATION2 use hydrogen, which is very flammable. When the ',
 ' (paragraph @NUM2). The greatest obstacle of all however, was mother nature. The winds that high in the sky were ',
 ' (paragraph @NUM3). The most significant obstacle was ',
 ' (paragraph @NUM4) in open areas, dirigibles could be weighted down by lead weights, but this poses a threat to pedestrians on the street, as the weights would be dangling above them. Lastly, there was ',
 ' (paragraph @NUM4). Due to violent air currents, ',
 ' @CAPS1 obstacles did that bring? Well, hydrogen is highly flammable. When a German dirigible was destroyed by a fire in New Jersey, the owners of the ',
 ' @CAPS1 the wind would make the dirigible swivel around and around the mooring mast. An idea came into 

In [42]:
set6[set6['domain1_score'] == 4]['essay'].sample(1).iloc[0]

"The dream for the Empire State Building was to allow dirigibles to dock On the top of it. It sounded like a great idea but the builders faced many obstacles trying to reach the goal of having them dock there. The first problem they faced was that dirigible's load and the wind pressure would have to be transmitted to the buildings foundation so it wouldn't stress the building's frame. To do this over sixty thousand dollars worth of modifications had been made. A glass and chrome nickel steel tower was made with a illuminated inside. The building was now 102 floors. Even though all the hard work the building was never used for docking purposes because of safety. One of the safety obstacles was that dirigibles wer highly flamable, after one caught on fire in New Jersey the owners of the Empire State building didn't want the same to happy in the highly populated New York City. Also another big obstacle was caused by the terrible winds. Even though there was a landing sight the dirigible c

In [44]:
set6essay_quotes = ['one of safety', 'dirigibles from outside of the United States used hydrogen instead of helium', 'nature itself', 'winds on top of the building were constantly shifting', 'violent air currents', 'law against airships flying too low over urban areas', 'moored in open landing fields', 'could be weighted down in the back with lead weights', 'dangling high above pedestrians on the street was neither practical nor safe', 'swivel around and around the mooring mast', 'how much worse that accident could have been', 'could not simply drop a mooring mast on top of the empire state building flat roof', 'stress of the dirigible load', 'mooring air ships to a fixed mast', 'neither practical nor safe']

In [45]:
max([len(i.split()) for i in set6essay_quotes])

16

In [3]:
import pandas as pd
set3 = pd.read_csv('set3_features.csv')
set4 = pd.read_csv('set4_features.csv')

In [2]:
set3['domain1_score'].value_counts()

2    657
1    607
3    423
0     39
Name: domain1_score, dtype: int64

In [4]:
set4['domain1_score'].value_counts()

1    636
2    570
0    311
3    253
Name: domain1_score, dtype: int64