In [None]:
import numpy as np
import pandas as pd
import re

# Feature Extraction

In [None]:
# Hand-copied sample used to try out the feature functions below.
# sample_text is a *string* holding a Python-literal list of token lists; create_row parses it
# with ast.literal_eval.
# sample_tree is a dict with a 'score' and a bracketed 'tree' string whose leaves are written
# as "(text N)" (presumably N indexes the segments of sample_text — TODO confirm).
sample_text = "[['Street', 'signs', 'and', 'markings', 'are', 'all', 'around', 'us', '.'], ['It', 'is', 'important', 'to', 'pay', 'attention', 'to', 'them', '.'], ['They', 'help', 'keep', 'us', 'safe', '.'], ['Learn', 'about', 'some', 'of', 'them', 'here', '.'], ['Safe', 'to', 'Cross'], ['Crosswalks', 'tell', 'us', 'where', 'to', 'cross', 'the', 'street', '.'], ['Wait', 'Your', 'Turn'], ['An', 'orange', 'hand', 'tells', 'walkers', 'to', 'stop', '.'], ['It', 'is', 'not', 'safe', 'to', 'cross', 'a', 'street'], ['when', 'you', 'see', 'this', 'sign', '.'], ['Time', 'to', 'Go'], ['This', 'sign', 'means'], ['it', 'is', 'safe', 'to', 'cross', '.'], ['Cars', 'are', 'supposed', 'to', 'wait', '.'], ['But', 'you', 'should', 'always', 'look', 'both', 'ways'], ['before', 'crossing', 'a', 'street', '.'], ['A', 'Sign'], ['You', 'Can', 'Feel'], ['Have', 'you', 'ever', 'noticed', 'a', 'bumpy', 'strip', '?'], ['It', 'warns'], ['that', 'the', 'pavement', 'is', 'changing', 'or', 'ending', '.'], ['It', 'helps', 'people'], ['who', 'have', 'trouble', 'seeing', '.'], ['Did', 'You', 'Know', '?'], ['A', 'person'], ['who', 'is', 'walking', 'is', 'called', 'a', 'pedestrian', '.'], ['Some', 'street', 'signs', 'use', 'that', 'word', '.'], ['See', 'if', 'you', 'can', 'spot', 'the', 'word', 'pedestrian', 'next', 'time'], ['you', 'are', 'out', '!']]"
sample_tree = {'score': -44.713087843545416, 'tree': '(ROOT (nucleus:span (nucleus:span (nucleus:span (nucleus:span (nucleus:span (text 0)) (satellite:elaboration (nucleus:span (text 1)) (satellite:elaboration (nucleus:span (text 2)) (satellite:elaboration (nucleus:span (text 3)) (satellite:elaboration (text 4)))))) (satellite:elaboration (nucleus:span (text 5)) (satellite:elaboration (nucleus:span (text 6)) (satellite:elaboration (nucleus:span (nucleus:span (text 7)) (satellite:elaboration (nucleus:span (nucleus:span (text 8)) (satellite:background (text 9))) (satellite:elaboration (text 10)))) (satellite:elaboration (satellite:contrast (nucleus:span (satellite:attribution (text 11)) (nucleus:span (text 12))) (satellite:elaboration (text 13))) (nucleus:span (nucleus:span (text 14)) (satellite:elaboration (text 15)))))))) (satellite:elaboration (nucleus:span (text 16)) (satellite:elaboration (text 17)))) (satellite:elaboration (nucleus:span (text 18)) (satellite:elaboration (nucleus:span (satellite:attribution (text 19)) (nucleus:span (text 20))) (satellite:elaboration (nucleus:span (text 21)) (satellite:elaboration (text 22)))))) (satellite:elaboration (satellite:attribution (text 23)) (nucleus:span (nucleus:span (text 24)) (satellite:elaboration (nucleus:span (nucleus:span (text 25)) (satellite:elaboration (text 26))) (satellite:elaboration (nucleus:span (text 27)) (satellite:elaboration (text 28)))))))'}


In [None]:
# Display the raw bracketed tree string that the regex-based helpers below operate on.
sample_tree['tree']

'(ROOT (nucleus:span (nucleus:span (nucleus:span (nucleus:span (nucleus:span (text 0)) (satellite:elaboration (nucleus:span (text 1)) (satellite:elaboration (nucleus:span (text 2)) (satellite:elaboration (nucleus:span (text 3)) (satellite:elaboration (text 4)))))) (satellite:elaboration (nucleus:span (text 5)) (satellite:elaboration (nucleus:span (text 6)) (satellite:elaboration (nucleus:span (nucleus:span (text 7)) (satellite:elaboration (nucleus:span (nucleus:span (text 8)) (satellite:background (text 9))) (satellite:elaboration (text 10)))) (satellite:elaboration (satellite:contrast (nucleus:span (satellite:attribution (text 11)) (nucleus:span (text 12))) (satellite:elaboration (text 13))) (nucleus:span (nucleus:span (text 14)) (satellite:elaboration (text 15)))))))) (satellite:elaboration (nucleus:span (text 16)) (satellite:elaboration (text 17)))) (satellite:elaboration (nucleus:span (text 18)) (satellite:elaboration (nucleus:span (satellite:attribution (text 19)) (nucleus:span (t

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def count_occurances(inp_str, sub_str):
  """
  Count the positions in inp_str at which sub_str begins.

  Every start index is tested in turn, so overlapping occurrences are each
  counted (unlike str.count, which skips past a match before searching again).
  """

  hits = 0
  for start in range(len(inp_str)):
    if inp_str.startswith(sub_str, start):
      hits += 1
  return hits

def get_max_depth(inp_str):
  """
  Heuristic depth measure for a bracketed tree string.

  Every stretch of text that starts right after a '(' and runs up to the next
  ':' is extracted, and the number of ')' characters inside each stretch is
  counted. The largest such count is returned; 0 is returned when the string
  contains no such stretch.

  NOTE(review): this measures how many brackets close between one labelled
  node and the next, not the nesting level of '(' itself — confirm this is
  the intended notion of "tree depth".
  """

  pat = r'(?<=\()(.*?)(?=\:)'
  closing_counts = [chunk.count(')') for chunk in re.findall(pat, inp_str)]

  # default=0 handles the "no match" case explicitly instead of relying on a
  # catch-all except, which would also hide unrelated errors.
  return max(closing_counts, default=0)

def get_sat_nuc_types(inp_str, show_dist = False):

  """
  Extract the relation labels from a tree string.

  A label is the text between a ':' and the next whitespace character
  (e.g. 'span' in 'nucleus:span '). Labels are returned in order of
  appearance, duplicates included.

  When show_dist is true, a pie chart of the label frequencies is drawn
  with matplotlib before the list is returned.
  """

  relation_labels = re.findall(r'(?<=\:)(.*?)(?=\s)', inp_str)

  if not show_dist:
    return relation_labels

  # Counter keeps first-seen order, so slices and names line up.
  tally = Counter(relation_labels)
  names = list(tally.keys())
  slice_sizes = list(tally.values())

  plt.pie(slice_sizes, labels=names, autopct='%1.0f%%', pctdistance=1.1, labeldistance=1.2)
  plt.axis('equal')
  plt.show()

  return relation_labels

def avg_words_per_sent(inp_str):
  """
  Average number of tokens per sentence in a segmented document.

  inp_str is an iterable of token sequences (despite the name, the caller
  passes the already-parsed list of token lists). A sentence is counted for
  every token equal to '.', '?' or '!'. The numerator is the total token
  count, punctuation tokens included.

  When no sentence-ending token is present, the whole input is treated as a
  single sentence instead of dividing by zero; an empty input yields 0.0.
  """
  sentence_enders = ('.', '?', '!')

  punct_count = 0
  total_sent_length = 0
  for segment in inp_str:
    for token in segment:
      total_sent_length += 1
      if token in sentence_enders:
        punct_count += 1

  # Guard against ZeroDivisionError for text without terminal punctuation.
  return total_sent_length / max(punct_count, 1)


In [None]:
import ast

def create_row(segmented_str, tree_str, unique_types = None):
  
  """
  Build one row (a dict) of numeric features for a single document.

  segmented_str: string containing a Python-literal list of token lists
                 (one inner list per segment); parsed with ast.literal_eval.
  tree_str:      bracketed tree string for the same document.
  unique_types:  currently unused; kept so existing callers keep working.

  The returned dict holds the raw count of every relation label found in
  tree_str (including 'span') plus the named summary features below.
  Note that 'number_of_unique_satellite_types' counts every distinct label,
  'span' included.

  Ratio features fall back to 0.0 when their denominator is zero (no tokens,
  or no label other than 'span') instead of raising ZeroDivisionError.
  """

  segments = ast.literal_eval(segmented_str)

  # Basic stats from tree
  num_satellites = count_occurances(tree_str, 'satellite')
  num_nuclei = count_occurances(tree_str, 'nucleus')
  max_depth = get_max_depth(tree_str)

  # Segmented text stats
  segment_lengths = [len(segment) for segment in segments]
  avg_edu_length = np.mean(segment_lengths)
  std_edu_length = np.std(segment_lengths)

  total_sent_length = sum(segment_lengths)
  avg_words = avg_words_per_sent(segments)

  # Normalized stats for tree (per token)
  if total_sent_length:
    norm_sats = num_satellites/total_sent_length
    norm_nuclei = num_nuclei/total_sent_length
  else:
    norm_sats = 0.0
    norm_nuclei = 0.0

  # Share of 'elaboration' among all labels other than 'span'
  labels = get_sat_nuc_types(tree_str)
  counts = Counter(labels)
  non_span_labels = len(labels) - counts['span']
  if non_span_labels:
    proportion_of_elaboration = counts['elaboration']/non_span_labels
  else:
    proportion_of_elaboration = 0.0

  ini_dict = {'satellite_count': num_satellites,
              'nuclei_count' : num_nuclei,
              'tree_depth': max_depth,
              'average_edu_length': avg_edu_length,
              'std_deviation_of_length': std_edu_length,
              'normalized_satellite_count': norm_sats,
              'normalized_nuclei_count': norm_nuclei,
              'elaboration_proportion': proportion_of_elaboration,
              'total_sentence_length': total_sent_length,
              'average_words_per_sentence': avg_words,
              'number_of_unique_satellite_types': len(set(labels))}

  # Raw label counts first, then the summary features (same key order as before,
  # which determines the column order of a DataFrame built from these rows).
  req_dict = dict(counts)
  req_dict.update(ini_dict)

  return req_dict 

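# TODO: add the number of EDUs (segments) as a feature.
# TODO: add raw counts for satellite labels (raw counts of all relation labels, 'span' included, are already merged into the row).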


In [None]:
# Smoke test: build the feature row for the hand-copied sample document.
create_row(sample_text, sample_tree['tree'], unique_types = None)

{'attribution': 3,
 'average_edu_length': 5.655172413793103,
 'average_words_per_sentence': 9.647058823529411,
 'background': 1,
 'contrast': 1,
 'elaboration': 23,
 'elaboration_proportion': 0.8214285714285714,
 'normalized_nuclei_count': 0.17073170731707318,
 'normalized_satellite_count': 0.17073170731707318,
 'nuclei_count': 28,
 'number_of_unique_satellite_types': 5,
 'satellite_count': 28,
 'span': 28,
 'std_deviation_of_length': 2.4533701250063116,
 'total_sentence_length': 164,
 'tree_depth': 8}

In [None]:
# Pick the corpus to process. Only one file is read: loading both in sequence
# just discards the first frame and fails needlessly when that file is absent.
DATASET_PATH = 'times.xlsx'  # alternative: 'DOGO.xlsx'
inp_df = pd.read_excel(DATASET_PATH)
inp_df

Unnamed: 0.1,Unnamed: 0,edu_tokens,grade_level,scored_rst_trees
0,0,"[['Street', 'signs', 'and', 'markings', 'are',...",1,"[{'score': -44.713087843545416, 'tree': '(ROOT..."
1,1,"[['How', 'do', 'roads', 'appear', '?'], ['It',...",1,"[{'score': -32.326246146386005, 'tree': '(ROOT..."
2,2,"[['Meet', 'Samuel', 'Ramsey', '.'], ['His', 'n...",1,"[{'score': -41.83879991844082, 'tree': '(ROOT ..."
3,3,"[['It', 'is', 'hard', 'for', 'Dr.', 'Sammy', '...",1,"[{'score': -27.379387204041947, 'tree': '(ROOT..."
4,4,"[['Rows', 'of', 'Books'], ['Libraries', 'let',...",1,"[{'score': -46.198203768371215, 'tree': '(ROOT..."
...,...,...,...,...
1437,1437,"[['Rising', 'temperatures', '.'], ['Intense', ...",3,"[{'score': -30.111647738918084, 'tree': '(ROOT..."
1438,1438,"[['Jim', 'Henson', ',', 'the', 'creator', 'of'...",3,"[{'score': -37.0389493424066, 'tree': '(ROOT (..."
1439,1439,"[['Rodney', 'Robinson', 'will', 'tell', 'you',...",3,"[{'score': -113.73556262834782, 'tree': '(ROOT..."
1440,1440,"[['The', 'iconic', 'Notre-Dame', ',', 'in', 'P...",3,"[{'score': -39.74116747859713, 'tree': '(ROOT ..."


In [None]:
def combine_str(st):
  """
  Turn a string holding a Python-literal list of token lists into one
  plain-text string: all tokens, in order, joined by single spaces.
  """
  segments = ast.literal_eval(st)
  return ' '.join(token for segment in segments for token in segment)

# Add a plain-text 'document' column by flattening each row's 'edu_tokens' literal with combine_str.
# Note: this adds the column to inp_df in place.
inp_df['document'] = inp_df['edu_tokens'].apply(combine_str)
inp_df

Unnamed: 0.1,Unnamed: 0,edu_tokens,grade_level,scored_rst_trees,document
0,0,"[['Street', 'signs', 'and', 'markings', 'are',...",1,"[{'score': -44.713087843545416, 'tree': '(ROOT...",Street signs and markings are all around us . ...
1,1,"[['How', 'do', 'roads', 'appear', '?'], ['It',...",1,"[{'score': -32.326246146386005, 'tree': '(ROOT...",How do roads appear ? It is not by magic . A t...
2,2,"[['Meet', 'Samuel', 'Ramsey', '.'], ['His', 'n...",1,"[{'score': -41.83879991844082, 'tree': '(ROOT ...",Meet Samuel Ramsey . His nickname is Dr. Sammy...
3,3,"[['It', 'is', 'hard', 'for', 'Dr.', 'Sammy', '...",1,"[{'score': -27.379387204041947, 'tree': '(ROOT...",It is hard for Dr. Sammy to pick which insect ...
4,4,"[['Rows', 'of', 'Books'], ['Libraries', 'let',...",1,"[{'score': -46.198203768371215, 'tree': '(ROOT...",Rows of Books Libraries let you borrow books ....
...,...,...,...,...,...
1437,1437,"[['Rising', 'temperatures', '.'], ['Intense', ...",3,"[{'score': -30.111647738918084, 'tree': '(ROOT...",Rising temperatures . Intense droughts . Melti...
1438,1438,"[['Jim', 'Henson', ',', 'the', 'creator', 'of'...",3,"[{'score': -37.0389493424066, 'tree': '(ROOT (...","Jim Henson , the creator of the Muppets , rele..."
1439,1439,"[['Rodney', 'Robinson', 'will', 'tell', 'you',...",3,"[{'score': -113.73556262834782, 'tree': '(ROOT...",Rodney Robinson will tell you his students are...
1440,1440,"[['The', 'iconic', 'Notre-Dame', ',', 'in', 'P...",3,"[{'score': -39.74116747859713, 'tree': '(ROOT ...","The iconic Notre-Dame , in Paris , France , wa..."


In [None]:
# Run this only for the TIMES dataset
# Removes the row at position 561 (NOTE(review): the reason is not recorded here —
# presumably that row breaks feature extraction; confirm).
# The remaining rows keep their original index labels, so inp_df's index has a gap
# afterwards, and re-running this cell removes one more row each time.
inp_df = inp_df.drop(inp_df.index[[561]])

In [None]:
# Spot-check: show one document's raw segmented tokens next to a computed feature dict.
# NOTE(review): the tokens shown are from row label 3 but the features are computed
# from row label 2 — confirm whether this mismatch is intended.
inp_df.edu_tokens[3], create_row(inp_df.edu_tokens[2], ast.literal_eval(inp_df.scored_rst_trees[2])[0]['tree'])

("[['It', 'is', 'hard', 'for', 'Dr.', 'Sammy', 'to', 'pick'], ['which', 'insect'], ['he', 'likes', 'best', '.'], ['He', 'has', 'so', 'many', 'favorites', '!'], ['Here', 'are', 'a', 'few', 'of', 'them', '.'], ['Honeybee'], ['Honeybees', 'sip', 'nectar', 'from', 'flowers', '.'], ['Bees', 'live', 'in', 'colonies', '.'], ['They', 'communicate', 'with', 'one', 'another', 'by', 'dancing', '.'], ['Preying', 'Mantis'], ['Praying', 'mantises', 'are', 'predators', '.'], ['They', 'eat', 'other', 'insects', '.'], ['Sometimes', ',', 'they', 'eat', 'bigger', 'creatures', ',', 'such', 'as', 'mice', 'and', 'snakes', '.'], ['Beetles'], ['There', 'are', 'thousands', 'of', 'different', 'kinds', 'of', 'beetles', '.'], ['These', 'insects', 'have', 'a', 'hard', 'outer', 'shell', '.'], ['It', 'helps', 'protect', 'them', '.']]",
 {'attribution': 3,
  'average_edu_length': 6.666666666666667,
  'average_words_per_sentence': 8.421052631578947,
  'elaboration': 20,
  'elaboration_proportion': 0.8695652173913043,


In [None]:
from tqdm import tqdm
# One feature dict and one label list per document, in inp_df row order.
features = []
sats = []
# iterrows() is a generator without a length, so tqdm is given the total
# explicitly in order to render a real progress bar.
for i, row in tqdm(inp_df.iterrows(), total=len(inp_df)):
  # 'scored_rst_trees' is a string literal of a list of {'score', 'tree'} dicts;
  # only the first entry's tree is used.
  extracted_tree_str = ast.literal_eval(row['scored_rst_trees'])[0]['tree']
  features.append(create_row(row['edu_tokens'], extracted_tree_str))
  sats.append(get_sat_nuc_types(extracted_tree_str, show_dist = False))

1441it [00:03, 372.36it/s]


In [None]:
# All distinct relation labels seen anywhere in the corpus.
{label for doc_labels in sats for label in doc_labels}


{'attribution',
 'background',
 'cause',
 'comparison',
 'condition',
 'contrast',
 'elaboration',
 'enablement',
 'explanation',
 'joint',
 'mannermeans',
 'same-unit',
 'span',
 'summary',
 'temporal',
 'textualorganization'}

In [None]:
# One row per document; a label that never occurs in a document becomes NaN
# when the dicts are combined, so it is filled with a count of 0.
feature_df = pd.DataFrame(features).fillna(0)
feature_df

Unnamed: 0,span,elaboration,background,contrast,attribution,satellite_count,nuclei_count,tree_depth,average_edu_length,std_deviation_of_length,...,joint,condition,explanation,mannermeans,same-unit,summary,cause,textualorganization,temporal,comparison
0,28,23,1.0,1.0,3.0,28,28,8,5.655172,2.453370,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21,19,0.0,0.0,2.0,21,21,8,6.136364,2.159450,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23,20,0.0,0.0,3.0,23,23,5,6.666667,2.939199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,14,0.0,0.0,2.0,16,16,6,5.588235,3.049760,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,25,25,0.0,0.0,0.0,25,25,8,5.807692,2.094301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,17,14,0.0,0.0,0.0,15,17,7,7.000000,4.537426,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1437,23,18,1.0,0.0,4.0,23,33,5,7.068966,3.814049,...,6.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1438,73,58,0.0,1.0,10.0,73,85,26,8.687500,5.622708,...,6.0,1.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0
1439,27,22,0.0,0.0,4.0,27,29,8,9.068966,4.274471,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
final_feature_df = feature_df.copy()
# feature_df carries a fresh 0..n-1 index (it is built from a list of dicts), while
# inp_df may have a gap in its index after a row has been dropped. Assigning the
# Series directly would align on index labels and attach grade levels to the wrong
# rows (and insert a NaN), so the values are assigned by position instead.
# A length mismatch between the two frames raises ValueError rather than passing silently.
final_feature_df['grade_level'] = inp_df['grade_level'].to_numpy()

#final_feature_df.to_csv('dogo_features.csv')
#final_feature_df.to_csv('times_features.csv')

In [None]:
# Inspect the first feature row (all columns) as a sanity check.
final_feature_df.iloc[0]

span                                 28.000000
elaboration                          23.000000
background                            1.000000
contrast                              1.000000
attribution                           3.000000
satellite_count                      28.000000
nuclei_count                         28.000000
tree_depth                            8.000000
average_edu_length                    5.655172
std_deviation_of_length               2.453370
normalized_satellite_count            0.170732
normalized_nuclei_count               0.170732
elaboration_proportion                0.821429
total_sentence_length               164.000000
average_words_per_sentence            9.647059
number_of_unique_satellite_types      5.000000
enablement                            0.000000
joint                                 0.000000
condition                             0.000000
explanation                           0.000000
mannermeans                           0.000000
same-unit    

In [None]:
!pip install textstat



In [None]:
import textstat
# Classic readability baselines computed by textstat on the plain-text 'document' column.
# Note: both columns are added to inp_df in place.
inp_df['flesch_reading_score'] = inp_df['document'].apply(textstat.flesch_reading_ease)
inp_df['flesch_grade'] = inp_df['document'].apply(textstat.flesch_kincaid_grade)


In [None]:
# final_feature_df has a plain 0..n-1 index, while inp_df may have a gap in its index
# after a row has been dropped. Assigning the Series directly would align on index
# labels and attach scores to the wrong rows, so the values are assigned by position.
# A length mismatch between the two frames raises ValueError rather than passing silently.
final_feature_df['flesch_reading_score'] = inp_df['flesch_reading_score'].to_numpy()
final_feature_df['flesch_grade'] = inp_df['flesch_grade'].to_numpy()
final_feature_df.to_csv('features+flesch_grades.csv')

In [None]:
# Display the final feature table (tree features, grade level and Flesch scores).
final_feature_df

Unnamed: 0,span,elaboration,background,contrast,attribution,satellite_count,nuclei_count,tree_depth,average_edu_length,std_deviation_of_length,...,mannermeans,same-unit,summary,cause,textualorganization,temporal,comparison,grade_level,flesch_reading_score,flesch_grade
0,28,23,1.0,1.0,3.0,28,28,8,5.655172,2.453370,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,96.59,1.9
1,21,19,0.0,0.0,2.0,21,21,8,6.136364,2.159450,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,73.03,4.8
2,23,20,0.0,0.0,3.0,23,23,5,6.666667,2.939199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,82.00,3.4
3,16,14,0.0,0.0,2.0,16,16,6,5.588235,3.049760,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,82.10,3.3
4,25,25,0.0,0.0,0.0,25,25,8,5.807692,2.094301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,90.26,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436,17,14,0.0,0.0,0.0,15,17,7,7.000000,4.537426,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,77.64,5.1
1437,23,18,1.0,0.0,4.0,23,33,5,7.068966,3.814049,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,78.14,4.9
1438,73,58,0.0,1.0,10.0,73,85,26,8.687500,5.622708,...,0.0,6.0,1.0,0.0,0.0,0.0,0.0,3.0,75.50,5.9
1439,27,22,0.0,0.0,4.0,27,29,8,9.068966,4.274471,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,76.52,5.5
