<a href="https://colab.research.google.com/github/qinwenshuo/CVAE-linguistic-transfer/blob/main/3_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up

In [None]:
# Features removed:

# All evidentiality-related features (10)
# All polarity-related features (10)
# All politeness-related features (14)
# All clusivity-related features (10)
# All auxiliary-verb aspect-related features (9)
# All voice distribution features (6)

In [None]:
# Mount Google Drive at /content/drive; the project paths built below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, random, statistics, math, re, shutil, csv
import networkx as nx
from tqdm import tqdm

In [None]:
# Languages offered at the prompt below, mapped to two-letter codes
# (the codes look like ISO 639-1 — TODO confirm; only the keys are used in this cell).
explanation_dict = {
    'Arabic': 'ar',
    'English': 'en',
    'Spanish': 'es',
    'Croatian': 'hr',
    'Czech': 'cs',
    'German': 'de',
    'Italian': 'it',
    'Norwegian': 'no',
    'Portuguese': 'pt',
    'Latvian': 'lv',
    'Icelandic': 'is',
    'Finnish': 'fi',
    'Chinese': 'zh',
    'Korean': 'ko'
}


# Interactive selection of language / nativeness / corpus.
# NOTE(review): the answers are not validated; a typo surfaces as an error from os.listdir below.
language = input(f"Please choose one language from the list: {list(explanation_dict.keys())}\nEnter your choice: ")
nativeness = input("Native or Learner?\nEnter your choice: ")
input_directory = f'/content/drive/MyDrive/cvae_project/3_pred_format/{language}/{nativeness}/'
# Offer every .zip archive found for the chosen language/nativeness, without the extension.
subdirectories = os.listdir(input_directory)
subdirectories = [i.replace('.zip', '') for i in subdirectories if i.endswith('.zip')]
corpus = input(f"Please choose one input corpus from the list: {subdirectories}\nEnter your choice: ")
input_directory = os.path.join(input_directory, corpus+'.zip')
# Drop the last three '_'-separated parts, e.g. 'Gachon_machamp_mbert_1' -> 'Gachon'.
corpus_name = corpus.rsplit('_', 3)[0]
raw_text_input_directory = f'/content/drive/MyDrive/cvae_project/1_extract_txt_format/{language}/{nativeness}/{corpus_name}.zip'
output_directory = f'/content/drive/MyDrive/cvae_project/4_dep_features/{language}/{nativeness}/'
os.makedirs(output_directory, exist_ok=True)
# From here on `output_directory` is the path of the output .txt file, not a directory.
output_directory += f'{corpus}.txt'


print()
print(f"Input directory: {input_directory}")
print(f"Input raw text directory: {raw_text_input_directory}")
print(f"Output directory: {output_directory}")


Please choose one language from the list: ['Arabic', 'English', 'Spanish', 'Croatian', 'Czech', 'German', 'Italian', 'Norwegian', 'Portuguese', 'Latvian', 'Icelandic', 'Finnish', 'Chinese', 'Korean']
Enter your choice: English
Native or Learner?
Enter your choice: Learner
Please choose one input corpus from the list: ['ArabCC_machamp_mbert_1', 'ICNALE_machamp_mbert_1', 'WriCLE_machamp_mbert_1', 'ICLE_machamp_mbert_1', 'PELIC_machamp_mbert_1', 'BAWE_machamp_mbert_1', 'TOEFL_machamp_mbert_1', 'Gachon_machamp_mbert_1', 'CLC_machamp_mbert_1', 'WriCLE-informal_machamp_mbert_1']
Enter your choice: Gachon_machamp_mbert_1

Input directory: /content/drive/MyDrive/cvae_project/3_pred_format/English/Learner/Gachon_machamp_mbert_1.zip
Input raw text directory: /content/drive/MyDrive/cvae_project/1_extract_txt_format/English/Learner/Gachon.zip
Output directory: /content/drive/MyDrive/cvae_project/4_dep_features/English/Learner/Gachon_machamp_mbert_1.txt


In [None]:
# First-language labels that `language_distance` excludes via its `not in` check.
first_language_filtered = [
    'Unknown',
    'Filipino',
    'Other',
    'Bosnian-croatian-serbian',
    'Pakistan',
    'Malay',
    'Philippines',
    'Taiwanese',
]

# Morphological feature analysis

In [None]:
# POS tags whose tokens are counted as function words.
function_POS = [
    'ADP', 'AUX', 'CCONJ', 'DET',
    'NUM', 'PART', 'PRON', 'SCONJ',
]
# Dependency relations treated as (subordinate) clauses.
clauses = [
    'csubj', 'ccomp', 'xcomp', 'advcl', 'acl',
]
# Names of the verbal feature categories.
verb_features = [
    'Mood', 'Number', 'Person', 'Tense', 'VerbForm',
]

# Value inventories per feature category; each one seeds a zeroed counter
# for VERB (and, except aspect, AUX) tokens in morphological_feature_analysis.
verb_mood_features = [
    'Ind', 'Imp', 'Cnd', 'Pot', 'Sub', 'Jus', 'Prp',
    'Qot', 'Opt', 'Des', 'Nec', 'Irr', 'Adm',
]
verb_number_features = [
    'Sing', 'Plur', 'Dual', 'Tri', 'Pau', 'Grpa',
    'Grpl', 'Inv', 'Count', 'Ptan', 'Coll',
]
verb_tense_features = [
    'Past', 'Pres', 'Fut', 'Imp', 'Pqp',
]
verb_form_features = [
    'Fin', 'Inf', 'Sup', 'Part',
    'Conv', 'Gdv', 'Ger', 'Vnoun',
]
verb_person_features = [
    '0', '1', '2', '3', '4',
]
verb_voice_features = [
    'Act', 'Mid', 'Rcp', 'Pass', 'Antip',
    'Lfoc', 'Bfoc', 'Dir', 'Inv', 'Cau',
]
verb_aspect_features = [
    'Imp', 'Perf', 'Prosp', 'Prog', 'Hab', 'Iter',
]

In [None]:
# Reads every sentence from a CoNLL-U style file, checks that each token line
# has 10 tab-separated columns, and returns a list of sentences, where each
# sentence is a list of token rows (each row is the list of column strings).

# I wrote this new method because the previous version, written by Zoey, seems
# to have an error (it was not able to read the last line of a file).

def read_conllu_file(file_path):
    """
    Read a tab-separated, blank-line-delimited annotation file into sentences.

    Lines starting with '#' are skipped. Token lines whose first column
    contains '-' or '.' are reported with a print and left out of the result.

    Args:
        file_path (str): Path of the file to read (opened as UTF-8).

    Returns:
        list: One entry per sentence; each sentence is a list of token rows,
        and each row is the list of the 10 column strings.

    Raises:
        ValueError: If a token line does not split into 10 columns.
    """
    sentences = []
    current = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # A blank line closes the sentence collected so far (if any).
            if not stripped:
                if current:
                    sentences.append(current)
                    current = []
                continue

            if stripped.startswith('#'):
                continue

            fields = stripped.split('\t')
            if len(fields) not in (10, 0):
                raise ValueError (f"Error: Line has {len(fields)} elements instead of 10 or 0: {stripped}")
            if '-' in fields[0] or '.' in fields[0]:
                print(f'Wired line: {fields}')
                continue
            current.append(fields)

    # The last sentence may not be followed by a blank line.
    if current:
        sentences.append(current)

    return sentences

In [None]:
def distribution(data_dict):
    """
    Calculate entropy, standard deviation and value range of the relative
    frequencies of the non-zero values in a dictionary.

    Args:
        data_dict (dict): Maps any key to a numeric value (typically a count).
            Entries whose value is 0 are ignored.

    Returns:
        tuple: ``(entropy, std_dev, value_range)``.
            With two or more non-zero entries: the base-2 entropy of the
            relative frequencies, their sample standard deviation, and the
            difference between the largest and smallest relative frequency.
            With exactly one non-zero entry: ``(0, '', '')``.
            With no non-zero entry: ``('', '', '')``.
    """
    non_zero_values = [value for value in data_dict.values() if value != 0]

    if not non_zero_values:
        return '', '', ''
    if len(non_zero_values) == 1:
        return 0, '', ''

    # Normalise by the sum of exactly the values that are turned into
    # probabilities, so the probabilities always add up to 1 (the previous
    # int-only total broke for float-valued input).
    total = sum(non_zero_values)

    prob_list = []
    H = 0
    for value in non_zero_values:
        prob = value / total
        prob_list.append(prob)
        H -= prob * math.log2(prob)

    std = statistics.stdev(prob_list)
    value_range = max(prob_list) - min(prob_list)
    return H, std, value_range

In [None]:
# checks if a given sentence (sent) contains a subject for a given verb index (verb_index).
def has_subj(verb_index, sent):
    """
    Find the subject of a verb.

    Returns the first token row in ``sent`` whose relation (column 7) is
    'nsubj' and whose head (column 6) equals ``verb_index``, or None when
    there is none. The head is compared as-is, without type conversion.
    """
    candidates = (tok for tok in sent if tok[7] == 'nsubj' and tok[6] == verb_index)
    return next(candidates, None)

In [None]:

def has_obj(verb_index, sent):
    """
    Find the direct object of a verb.

    Returns the first token row in ``sent`` whose relation (column 7) is
    'obj' and whose head (column 6) equals ``verb_index``, or None when
    there is none. The head is compared as-is, without type conversion.
    """
    candidates = (tok for tok in sent if tok[7] == 'obj' and tok[6] == verb_index)
    return next(candidates, None)

In [None]:
# if a given sentence (sent) contains an indirect object (iobj) for a given verb index (verb_index)
def has_iobj(verb_index, sent):
    """
    Find the indirect object of a verb.

    Returns the first token row in ``sent`` whose relation (column 7) is
    'iobj' and whose head (column 6) equals ``verb_index``, or None when
    there is none. The head is compared as-is, without type conversion.
    """
    candidates = (tok for tok in sent if tok[7] == 'iobj' and tok[6] == verb_index)
    return next(candidates, None)

In [None]:
# determines the word order pattern around a given verb index (verb_index) in a given sentence (sent).
def word_order(verb_index, sent):
    """
    Describe the linear order of a verb and its core arguments.

    Looks up the verb's subject, object and indirect object (first match
    each) and returns their labels sorted by token position, e.g.
    ``['s', 'v', 'o']``. Labels are 's', 'o', 'io' for the arguments and 'v'
    for the verb itself; missing arguments are simply absent.

    Args:
        verb_index: Index of the verb as stored in column 0 of its token row.
        sent (list): Token rows of the sentence; row ``i - 1`` is expected to
            hold the token with index ``i``.

    Returns:
        list: The labels in surface order.
    """
    arguments = (
        has_subj(verb_index, sent),
        has_obj(verb_index, sent),
        has_iobj(verb_index, sent),
    )
    deprel_map = {'nsubj': 's', 'obj': 'o', 'iobj': 'io', 'VERB': 'v'}

    positions = [int(verb_index)]
    for argument in arguments:
        if argument is not None:
            positions.append(int(argument[0]))

    order = []
    for position in sorted(positions):
        tok = sent[position - 1]
        order.append('v' if tok[0] == verb_index else deprel_map[tok[7]])

    return order

In [None]:
# identifies subordinate clauses related to a given verb index (verb_index) in a given sentence (sent).
def has_surbordinate(verb_index, sent):
    """
    Collect the clausal dependents of a verb.

    Returns every token row in ``sent`` whose relation (column 7) is listed
    in the module-level ``clauses`` and whose head (column 6) equals
    ``verb_index``. The list is empty when there is none.
    """
    return [tok for tok in sent if tok[7] in clauses and tok[6] == verb_index]

In [None]:
### Get the syntactic dependents of a token ###

def dependents(index, sentence):
    """
    Return the indices (column 0) of all tokens whose head (column 6) equals
    ``index``, in sentence order, or None when the token has no dependents.
    """
    children = [tok[0] for tok in sentence if tok[6] == index]
    return children if children else None

In [None]:
####### Get the subtree of a syntactic head ######

def subtree_generate(index, sentence):
    """
    Return the contiguous span of tokens covered by the subtree of ``index``.

    Walks all descendants of ``index`` and records the smallest and largest
    token index reached. The result is the slice of the sentence between
    those two positions (inclusive), so it may include tokens that are not
    descendants when the subtree is discontinuous.

    Args:
        index: Index of the subtree head, as stored in column 0.
        sentence (list): Token rows of the sentence.

    Returns:
        tuple: ``(subtree, subtree_idx)`` where ``subtree`` holds shallow
        copies of the covered token rows and ``subtree_idx`` their 0-based
        positions in ``sentence``.
    """
    pending = [index]
    lowest = len(sentence)
    highest = 0

    # Depth-first walk over the descendants, tracking the extreme positions.
    while pending:
        current = pending.pop()
        position = int(current)
        lowest = min(lowest, position)
        highest = max(highest, position)

        children = dependents(current, sentence)
        if children is not None:
            pending.extend(children)

    # Copy the rows so callers can re-index them without touching the sentence.
    subtree = [row[:] for row in sentence[lowest - 1 : highest]]
    subtree_idx = list(range(lowest - 1, highest))

    return subtree, subtree_idx

In [None]:
def verbal_inflection(index, sentence):
    """
    Read the verbal inflection features of one token.

    The feature string in column 5 is split on '|' into ``Name=Value`` pairs.
    Only pairs whose name is exactly Mood, Number, Person, Tense, VerbForm,
    Aspect or Voice are used. Layered variants such as ``Number[psor]`` or
    ``Person[psor]`` are ignored, so they cannot overwrite the token's own
    Number/Person value.

    Args:
        index: 1-based token index (anything ``int()`` accepts).
        sentence (list): Token rows of the sentence.

    Returns:
        tuple: ``(mood, number, person, tense, form, aspect, voice)``; each
        entry is the feature value or '' when the feature is absent.
    """
    tok = sentence[int(index) - 1]
    features = tok[5]

    # Order of this tuple is the order of the returned values.
    wanted = ('Mood', 'Number', 'Person', 'Tense', 'VerbForm', 'Aspect', 'Voice')
    values = dict.fromkeys(wanted, '')

    if features != 'None':
        for feature in features.split('|'):
            name, separator, value = feature.partition('=')
            # Exact name match; entries without '=' (e.g. the '_' placeholder) are skipped.
            if separator and name in values:
                values[name] = value

    return tuple(values[name] for name in wanted)


In [None]:
# calculates the ratio of each value in a given dictionary to the total sum of all values.
def dictionary_ratio(dictionary):
    """
    Replace every value of ``dictionary`` by its share of the total, in place.

    When the values sum to 0 the dictionary is left unchanged.

    Returns:
        dict: The same dictionary object that was passed in.
    """
    total = 0
    for amount in dictionary.values():
        total += amount

    if total == 0:
        return dictionary

    for key in dictionary:
        dictionary[key] = dictionary[key] / total

    return dictionary

In [None]:
# performs an analysis on a graph g representing a dependency tree.
def analyze(g):
    """
    Measure non-projectivity and depth of a dependency graph.

    Args:
        g: networkx graph whose edges run from head index to dependent
            index, with node 0 as the source all paths start from.

    Returns:
        tuple: ``(flag, n, max_depth)`` where ``n`` is the number of
        non-projective word ids, ``flag`` is 1 if ``n > 0`` else 0, and
        ``max_depth`` is derived from the longest path from node 0
        (1 when that path has two nodes, otherwise its node count minus 2).
    """
    # For every node, the chain of nodes from 0 down to it (its dominators).
    doms = {node: nx.shortest_path(g, source = 0, target = node) for node in g}

    # A word strictly between a head u and its dependent v that is not
    # dominated by u makes the edge (u, v) non-projective.
    np_words = set()
    for u, v in g.edges():
        step = -1 if u > v else 1
        for between in range(u + step, v, step):
            if u not in doms[between]:
                np_words.add(between)

    n = len(np_words)

    # Longest dominator chain, counted in nodes.
    temp_max_depth = max({len(path) for path in doms.values()})
    max_depth = 1 if temp_max_depth == 2 else temp_max_depth - 2

    return int(n > 0), n, max_depth


In [None]:
def has_cycle(sent):
    """
    Build the head -> dependent graph of a sentence and check it for cycles.

    Returns:
        ``True`` when the graph contains a cycle; otherwise the
        ``nx.DiGraph`` that was built (callers reuse it for analysis).
    """
    dependency_graph = nx.DiGraph()
    for tok in sent:
        head, index = int(tok[6]), int(tok[0])
        dependency_graph.add_edge(head, index)

    try:
        nx.find_cycle(dependency_graph, orientation='original')
    except nx.NetworkXNoCycle:
        return dependency_graph
    else:
        return True

In [None]:
def update_dictionary(dictionary, key, increment=1, new_item=True):
    """
    Add ``increment`` to ``dictionary[key]`` in place.

    A missing key is created with value ``increment`` only when ``new_item``
    equals True; otherwise a missing key is silently ignored.
    """
    if key in dictionary:
        dictionary[key] += increment
        return
    if new_item == True:
        dictionary[key] = increment


def morphological_feature_analysis(file_path):
    """
    Compute the lexical, morphological and syntactic feature vector of one parsed file.

    Sentences are read with ``read_conllu_file``; sentences with fewer than
    five tokens are ignored. For the remaining sentences the function counts
    words, lemmas, POS tags, dependency relations, verbal/auxiliary inflection
    values, word-order patterns, subordinate clauses, and tree statistics
    (non-projectivity and depth), then turns the counts into ratios and
    distribution statistics.

    Args:
        file_path (str): Path of the parsed file to analyse.

    Returns:
        tuple: ``(lg, all_info)``. ``lg`` is the last column of the first
        token of the counted sentences (treated as the native-language label).
        ``all_info`` is the flat list of feature values; entries produced by
        ``distribution`` may be the placeholder ''. ``(None, None)`` is
        returned when no sentence was counted.

    Raises:
        ValueError: If counted sentences carry different labels in their last
            column, or if a dependency tree contains a cycle.
    """

    def dictionary_initialization(feature_name, features_category_list):
        """Return a counter with one zeroed ``feature_name + category`` key per category."""
        return {feature_name + feature: 0 for feature in features_category_list}

    def sum_dictionary_values(dictionary):
        """Sum the integer values of ``dictionary``."""
        return sum(value for value in dictionary.values() if isinstance(value, int))

    num_sent, num_word, num_function_word = 0, 0, 0

    word_list, word_len_list, lemma_list, lemma_len_list = [], [], [], []
    deplen_list = []
    clause_len_list = []
    verb_word_dict, verb_lemma_dict = {}, {}
    aux_word_dict, aux_lemma_dict = {}, {}
    function_word_dict, function_lemma_dict = {}, {}
    pos_dict = {}
    deprel_dict = {}
    headedness_dict = {'final': 0, 'initial': 0}
    subordinate_deprel_dict, all_subordinate_order_dict = {}, {}
    all_word_order_dict = {}
    valency_dict = {'intransitive': 0, 'transitive': 0, 'ditransitive': 0, 'v_io': 0}

    # Key order matters for the next two: their ratios are appended to the
    # output in insertion order.
    word_order_dict = {'s_v': 0, 'v_s': 0, 'v_o': 0, 'o_v': 0}
    subordinate_order_dictionary = {'subordinate_head_initial': 0, 'subordinate_head_final': 0}

    verb_mood_dict = dictionary_initialization('verb_mood_', verb_mood_features)
    verb_number_dict = dictionary_initialization('verb_number_', verb_number_features)
    verb_tense_dict = dictionary_initialization('verb_tense_', verb_tense_features)
    verb_aspect_dict = dictionary_initialization('verb_aspect_', verb_aspect_features)
    verb_voice_dict = dictionary_initialization('verb_voice_', verb_voice_features)
    verb_form_dict = dictionary_initialization('verb_form_', verb_form_features)
    verb_person_dict = dictionary_initialization('verb_person_', verb_person_features)

    aux_mood_dict = dictionary_initialization('aux_mood_', verb_mood_features)
    aux_number_dict = dictionary_initialization('aux_number_', verb_number_features)
    aux_tense_dict = dictionary_initialization('aux_tense_', verb_tense_features)
    aux_voice_dict = dictionary_initialization('aux_voice_', verb_voice_features)
    aux_form_dict = dictionary_initialization('aux_form_', verb_form_features)
    aux_person_dict = dictionary_initialization('aux_person_', verb_person_features)

    non_projective_sent, non_projective_word, total_depth = 0, 0, 0
    subordinate_non_projective_sent, subordinate_non_projective_word, subordinate_total_depth = 0, 0, 0

    lg = ''

    sentences = read_conllu_file(file_path)

    for sent in sentences:
        # Very short sentences are left out of every statistic.
        if len(sent) < 5:
            continue

        # Every counted sentence must carry the same label in its last column.
        if lg != '' and lg != sent[0][-1]:
            raise ValueError(f'Inconsistant native language {lg} != {sent[0][-1]} in file: {file_path}')
        elif lg == '':
            lg = sent[0][-1]

        # has_cycle returns True for a cyclic tree, otherwise the head -> dependent
        # graph it built (6: head; 0: word index), which is reused for analysis.
        G = has_cycle(sent)
        if G is True:
            raise ValueError('cyclic dependency tree')

        ns, nw, max_depth = 0, 0, 0
        try:
            ns, nw, max_depth = analyze(G)
        except Exception:
            # Best effort: the sentence still counts, with zero tree statistics.
            print(len(sent))
            print(file_path, ' '.join(t[1] for t in sent), 'cannot analyze!!')
        non_projective_sent += ns
        non_projective_word += nw
        total_depth += max_depth

        num_sent += 1
        num_word += len(sent)

        for tok in sent:
            word_list.append(tok[1])
            word_len_list.append(len(tok[1]))
            lemma_list.append(tok[2])
            lemma_len_list.append(len(tok[2]))
            update_dictionary(pos_dict, tok[3])
            update_dictionary(deprel_dict, tok[7])

            # Dependency length and direction; tokens attached to 0 are skipped.
            if int(tok[6]) != 0:
                deplen_list.append(abs(int(tok[6]) - int(tok[0])))
                if int(tok[6]) > int(tok[0]):
                    update_dictionary(headedness_dict, 'final')
                else:
                    update_dictionary(headedness_dict, 'initial')

            ### collecting functional words using pos tags
            if tok[3] in function_POS:
                num_function_word += 1
                update_dictionary(function_word_dict, tok[1])
                update_dictionary(function_lemma_dict, tok[2])

            ### collecting verbal inflectional features
            if tok[3] == 'VERB':
                update_dictionary(verb_word_dict, tok[1])
                update_dictionary(verb_lemma_dict, tok[2])
                mood_feature, number_feature, person_feature, tense_feature, form_feature, aspect_feature, voice_feature = verbal_inflection(tok[0], sent)

                # new_item=False: values outside the predefined inventories are ignored.
                update_dictionary(verb_mood_dict, 'verb_mood_' + mood_feature, new_item=False)
                update_dictionary(verb_number_dict, 'verb_number_' + number_feature, new_item=False)
                update_dictionary(verb_person_dict, 'verb_person_' + person_feature, new_item=False)
                update_dictionary(verb_tense_dict, 'verb_tense_' + tense_feature, new_item=False)
                update_dictionary(verb_form_dict, 'verb_form_' + form_feature, new_item=False)
                update_dictionary(verb_aspect_dict, 'verb_aspect_' + aspect_feature, new_item=False)
                update_dictionary(verb_voice_dict, 'verb_voice_' + voice_feature, new_item=False)

                ### collecting valency features
                order = word_order(tok[0], sent)
                has_direct_object = 'o' in order
                has_indirect_object = 'io' in order
                if has_direct_object and has_indirect_object:
                    valency_dict['ditransitive'] += 1
                elif has_direct_object:
                    valency_dict['transitive'] += 1
                elif has_indirect_object:
                    valency_dict['v_io'] += 1
                else:
                    valency_dict['intransitive'] += 1

                if 's' in order and 'v' in order:
                    if order.index('s') < order.index('v'):
                        word_order_dict['s_v'] += 1
                    elif order.index('s') > order.index('v'):
                        word_order_dict['v_s'] += 1

                if 'o' in order and 'v' in order:
                    if order.index('o') < order.index('v'):
                        word_order_dict['o_v'] += 1
                    elif order.index('o') > order.index('v'):
                        word_order_dict['v_o'] += 1
                update_dictionary(all_word_order_dict, '_'.join(order))

            if tok[3] == 'AUX':
                update_dictionary(aux_word_dict, tok[1])
                update_dictionary(aux_lemma_dict, tok[2])
                mood_feature, number_feature, person_feature, tense_feature, form_feature, aspect_feature, voice_feature = verbal_inflection(tok[0], sent)

                # Auxiliary aspect is intentionally not collected.
                update_dictionary(aux_mood_dict, 'aux_mood_' + mood_feature, new_item=False)
                update_dictionary(aux_number_dict, 'aux_number_' + number_feature, new_item=False)
                update_dictionary(aux_person_dict, 'aux_person_' + person_feature, new_item=False)
                update_dictionary(aux_tense_dict, 'aux_tense_' + tense_feature, new_item=False)
                update_dictionary(aux_form_dict, 'aux_form_' + form_feature, new_item=False)
                update_dictionary(aux_voice_dict, 'aux_voice_' + voice_feature, new_item=False)

            if tok[7] in clauses:
                update_dictionary(subordinate_deprel_dict, tok[7])

                # Position of the clause relative to its head.
                head = sent[int(tok[6]) - 1]
                if int(tok[6]) > int(tok[0]):
                    update_dictionary(all_subordinate_order_dict, tok[7] + '_' + head[7])
                    subordinate_order_dictionary['subordinate_head_final'] += 1
                else:
                    update_dictionary(all_subordinate_order_dict, head[7] + '_' + tok[7])
                    subordinate_order_dictionary['subordinate_head_initial'] += 1

                clause_subtree, clause_subtree_idx = subtree_generate(tok[0], sent)

                ##### re-indexing the clause dependency index
                # Shift indices so the clause starts at 1; heads that fall
                # outside the clause span are re-attached to 0.
                root = int(clause_subtree[0][0]) - 1
                lim = len(clause_subtree)
                for row in clause_subtree:
                    row[0] = str(int(row[0]) - root)
                    row[6] = str(max(0, int(row[6]) - root))
                    if int(row[6]) > lim:
                        row[6] = '0'
                #####

                clause_len_list.append(len(clause_subtree))
                subordinate_G = nx.DiGraph()
                for row in clause_subtree:
                    subordinate_G.add_edge(int(row[6]), int(row[0]))

                subordinate_ns, subordinate_nw, subordinate_max_depth = 0, 0, 0
                try:
                    subordinate_ns, subordinate_nw, subordinate_max_depth = analyze(subordinate_G)
                except Exception:
                    # Best effort: the clause still counts, with zero tree statistics.
                    print(file_path, ' '.join(t[1] for t in sent), 'cannot analyze clause')
                subordinate_non_projective_sent += subordinate_ns
                subordinate_non_projective_word += subordinate_nw
                subordinate_total_depth += subordinate_max_depth

    if num_word == 0:
        return None, None

    num_word_type = len(set(word_list))
    function_word_type = len(function_word_dict) / num_word_type
    lexical_density = (num_word - num_function_word) / num_word
    function_word_H, function_word_std, function_word_range = distribution(function_word_dict)
    function_lemma_H, function_lemma_std, function_lemma_range = distribution(function_lemma_dict)
    num_lemma_type = len(set(lemma_list))
    function_lemma_type = len(function_lemma_dict) / num_lemma_type
    num_pos_type = len(pos_dict) / num_word
    pos_H, pos_std, pos_range = distribution(pos_dict)
    ttr_word = num_word_type / num_word
    ttr_lemma = num_lemma_type / num_word
    ave_sent_len = num_word / num_sent
    ave_word_len = sum(word_len_list) / num_word
    ave_lemma_len = sum(lemma_len_list) / num_word

    verb_word_H, verb_word_std, verb_word_range = distribution(verb_word_dict)
    verb_lemma_H, verb_lemma_std, verb_lemma_range = distribution(verb_lemma_dict)
    verb_mood_H, verb_mood_std, verb_mood_range = distribution(verb_mood_dict)
    verb_number_H, verb_number_std, verb_number_range = distribution(verb_number_dict)
    verb_person_H, verb_person_std, verb_person_range = distribution(verb_person_dict)
    verb_tense_H, verb_tense_std, verb_tense_range = distribution(verb_tense_dict)
    verb_form_H, verb_form_std, verb_form_range = distribution(verb_form_dict)
    verb_aspect_H, verb_aspect_std, verb_aspect_range = distribution(verb_aspect_dict)

    verb_valency_H, verb_valency_std, verb_valency_range = distribution(valency_dict)

    aux_word_H, aux_word_std, aux_word_range = distribution(aux_word_dict)
    aux_lemma_H, aux_lemma_std, aux_lemma_range = distribution(aux_lemma_dict)
    aux_mood_H, aux_mood_std, aux_mood_range = distribution(aux_mood_dict)
    aux_number_H, aux_number_std, aux_number_range = distribution(aux_number_dict)
    aux_person_H, aux_person_std, aux_person_range = distribution(aux_person_dict)
    aux_tense_H, aux_tense_std, aux_tense_range = distribution(aux_tense_dict)
    aux_form_H, aux_form_std, aux_form_range = distribution(aux_form_dict)

    deprel_H, deprel_std, deprel_range = distribution(deprel_dict)

    subordinate_deprel_H, subordinate_deprel_std, subordinate_deprel_range = distribution(subordinate_deprel_dict)
    subordinate_order_H, subordinate_order_std, subordinate_order_range = distribution(all_subordinate_order_dict)

    word_order_H, word_order_std, word_order_range = distribution(all_word_order_dict)

    # Both are 0 in the degenerate case where no token has a non-zero head,
    # matching the 0 fallback used for the clause statistics below.
    num_dependencies = headedness_dict['final'] + headedness_dict['initial']
    head_finality = headedness_dict['final'] / num_dependencies if num_dependencies else 0
    ave_dep_len = sum(deplen_list) / len(deplen_list) if deplen_list else 0

    non_projective_sent_ratio = non_projective_sent / num_sent
    non_projective_word_ratio = non_projective_word / num_word
    ave_tree_depth = total_depth / num_sent

    # Clause-level statistics fall back to 0 when the file has no clause.
    num_clauses = len(clause_len_list)
    num_clause_words = sum(clause_len_list)
    ave_clause_len = num_clause_words / num_clauses if num_clauses > 0 else 0
    subordinate_non_projective_sent_ratio = subordinate_non_projective_sent / num_clauses if num_clauses > 0 else 0
    subordinate_non_projective_word_ratio = subordinate_non_projective_word / num_clause_words if num_clause_words > 0 else 0
    subordinate_ave_tree_depth = subordinate_total_depth / num_clauses if num_clauses > 0 else 0

    verb_ratio = sum_dictionary_values(verb_word_dict) / num_sent

    verb_feature_dicts = [verb_mood_dict, verb_number_dict, verb_person_dict, verb_tense_dict, verb_form_dict, verb_aspect_dict, verb_voice_dict]
    aux_feature_dicts = [aux_mood_dict, aux_number_dict, aux_person_dict, aux_tense_dict, aux_form_dict, aux_voice_dict]

    # NOTE: the verb_valency_* triple appears twice on purpose of layout
    # compatibility; removing the repeat would shift every later column.
    all_info = [ave_sent_len, ttr_word, ave_word_len,
                function_word_type, function_word_H, function_word_std, function_word_range,
                lexical_density, ttr_lemma, ave_lemma_len, function_lemma_type,
                function_lemma_H, function_lemma_std, function_lemma_range,
                num_pos_type, pos_H, pos_std, pos_range,
                verb_word_H, verb_word_std, verb_word_range,
                verb_lemma_H, verb_lemma_std, verb_lemma_range,
                verb_mood_H, verb_mood_std, verb_mood_range,
                verb_number_H, verb_number_std, verb_number_range,
                verb_person_H, verb_person_std, verb_person_range,
                verb_tense_H, verb_tense_std, verb_tense_range,
                verb_form_H, verb_form_std, verb_form_range,
                verb_valency_H, verb_valency_std, verb_valency_range,
                verb_aspect_H, verb_aspect_std, verb_aspect_range,
                aux_word_H, aux_word_std, aux_word_range,
                aux_lemma_H, aux_lemma_std, aux_lemma_range,
                aux_mood_H, aux_mood_std, aux_mood_range,
                aux_number_H, aux_number_std, aux_number_range,
                aux_person_H, aux_person_std, aux_person_range,
                aux_tense_H, aux_tense_std, aux_tense_range,
                aux_form_H, aux_form_std, aux_form_range,
                deprel_H, deprel_std, deprel_range,
                subordinate_deprel_H, subordinate_deprel_std, subordinate_deprel_range,
                subordinate_order_H, subordinate_order_std, subordinate_order_range,
                head_finality, verb_valency_H, verb_valency_std, verb_valency_range,
                word_order_H, word_order_std, word_order_range, ave_dep_len, ave_clause_len,
                non_projective_sent_ratio, non_projective_word_ratio, ave_tree_depth,
                # Ratio, not the raw count: consistent with non_projective_sent_ratio above.
                subordinate_non_projective_sent_ratio, subordinate_non_projective_word_ratio, subordinate_ave_tree_depth,
                verb_ratio]

    # Append the relative frequencies of every remaining counter, in key order.
    all_info.extend(dictionary_ratio(word_order_dict).values())
    all_info.extend(dictionary_ratio(subordinate_order_dictionary).values())

    for feature_dictionary in verb_feature_dicts:
        all_info.extend(dictionary_ratio(feature_dictionary).values())

    for feature_dictionary in aux_feature_dicts:
        all_info.extend(dictionary_ratio(feature_dictionary).values())

    return lg, all_info

# doc2vec topic embedding (only for English)

In [None]:
# Installs PySpark 3.3.0 and Spark NLP 5.0.0 through the John Snow Labs Colab setup script.
# NOTE(review): this pipes a remotely hosted script straight into bash — review/pin the script before trusting it.
!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.3.0 -s 5.0.0 -g

--2023-08-31 18:26:44--  https://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2023-08-31 18:26:44--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’

-                     0%[                    ]       0  --.-KB/s               Installing PySpark 3.3.0 and Spark NLP 5.0.0
setup Colab for PySpark 3.3.0 

In [None]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

import pandas as pd
import numpy as np
import json

import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [None]:
# Start the Spark session used by the doc2vec pipeline and report the versions in use.
spark = sparknlp.start()# for GPU training >> sparknlp.start(gpu = True)

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Last expression of the cell: displays the session object.
spark

Spark NLP version 5.0.0
Apache Spark version: 3.3.0


In [None]:
# Stages of the doc2vec pipeline; each stage reads the output column of the
# stage before it: text -> document -> token -> normalized -> cleanedToken
# -> sentence_embeddings.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

token = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

norm = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
)

stops = (
    StopWordsCleaner.pretrained()
    .setInputCols("normalized")
    .setOutputCol("cleanedToken")
)

doc2Vec = (
    Doc2VecModel.pretrained("doc2vec_gigaword_wiki_300", "en")
    .setInputCols("cleanedToken")
    .setOutputCol("sentence_embeddings")
)

nlpPipeline = Pipeline(stages=[document, token, norm, stops, doc2Vec])

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[OK!]
doc2vec_gigaword_wiki_300 download started this may take some time.
Approximate size to download 312.3 MB
[OK!]


In [None]:
# Unpack the raw-text archive selected in the setup cell into a local working
# directory; extract_topic_embedding looks files up under this path.
raw_text_input_extraction_path = f'/content/unzipped_inputs/raw_text/{corpus_name}'
shutil.unpack_archive(raw_text_input_directory, raw_text_input_extraction_path, 'zip')


def extract_topic_embedding(file_handle):
    """
    Compute the doc2vec embedding of the raw text that belongs to a parsed file.

    The raw-text file has the same name as ``file_handle`` with '.pred'
    replaced by '.txt'. For 'Native' corpora it is looked up directly under
    ``raw_text_input_extraction_path``; for 'Learner' corpora it is looked up
    in each sub-directory of that path (presumably one per first language —
    TODO confirm) and the first hit is used.

    Args:
        file_handle (str): File name of the parsed ('.pred') file.

    Returns:
        The ``embeddings`` value of the last row produced by the pipeline
        (the pipeline is expected to yield a single row per file).

    Raises:
        ValueError: If the raw-text file cannot be found, or the pipeline
            produced no embedding for it.
    """
    file_name = file_handle.replace('.pred', '.txt')

    candidate_paths = []
    if nativeness == 'Native':
        candidate_paths.append(os.path.join(raw_text_input_extraction_path, file_name))
    elif nativeness == 'Learner':
        for sub_directory in os.listdir(raw_text_input_extraction_path):
            candidate_paths.append(os.path.join(raw_text_input_extraction_path, sub_directory, file_name))

    # Resolve the path before using it, so a miss always raises the documented
    # ValueError instead of failing on an unbound variable.
    file_path = next((path for path in candidate_paths if os.path.exists(path)), None)
    if file_path is None:
        raise ValueError(f'Input file: {file_name} not found under {raw_text_input_extraction_path}')

    # Explicit encoding, consistent with read_conllu_file.
    with open(file_path, 'r', encoding='utf-8') as text_file:
        file_content = [text_file.read()]

    df = spark.createDataFrame(file_content, StringType()).toDF("text")
    result = nlpPipeline.fit(df).transform(df)
    new_df = result.select(F.explode(F.arrays_zip(result.document.result,
                                                  result.sentence_embeddings.embeddings)).alias("cols")) \
                   .select(F.expr("cols['0']").alias("document"),
                           F.expr("cols['1']").alias("embeddings"))

    # Collect the rows from the DataFrame as a list of Row objects
    rows = new_df.collect()
    if not rows:
        raise ValueError(f'No embedding produced for input file: {file_path}')
    return rows[-1].embeddings  # only 1 row is expected

# lang2vec

In [None]:
# Provides `languages`, used by get_lg_code to look up language codes by name.
# NOTE(review): installed unpinned (the recorded run resolved to iso-639 0.4.5) — consider pinning the version.
!pip install iso-639
from iso639 import languages

def loading_packages():
    !git clone https://github.com/antonisa/lang2vec
    directory = '/content/lang2vec/'
    os.chdir(directory)
    !python3 /content/lang2vec/setup.py install
    import lang2vec.lang2vec as l2v

Collecting iso-639
  Downloading iso-639-0.4.5.tar.gz (167 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.4/167.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: iso-639
  Building wheel for iso-639 (setup.py) ... [?25l[?25hdone
  Created wheel for iso-639: filename=iso_639-0.4.5-py3-none-any.whl size=168842 sha256=6715ae64ff6a1ada28ce61da4ae13a4fbacb3708d7d49e352469262c4b2c66ca
  Stored in directory: /root/.cache/pip/wheels/d8/78/cc/5478ca3b1c3f602eae6f8cdbd78f909c0a0bfa0bbcb5c7771f
Successfully built iso-639
Installing collected packages: iso-639
Successfully installed iso-639-0.4.5


In [None]:
# delete unnecessary temporary directory
def delete_directory(directory):
    """Recursively delete ``directory`` together with everything inside it.

    Args:
        directory: Path of the directory tree to remove.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    # shutil.rmtree replaces the hand-written bottom-up walk. Unlike os.rmdir
    # it unlinks symlinks to directories instead of failing on them, and it
    # never touches the contents of a link target.
    shutil.rmtree(directory)


def unzip_read(zip_file):
    """Count the entries of every top-level folder inside a zip archive.

    The archive is unpacked into a fresh throw-away directory that is removed
    again before returning, even when unpacking or listing fails, so leftovers
    from an earlier interrupted run can never leak into the counts.

    Args:
        zip_file: Path to the ``.zip`` archive to inspect.

    Returns:
        dict mapping each top-level directory name found in the archive to the
        number of entries directly inside that directory. Top-level files are
        ignored.
    """
    import tempfile  # local import: only this helper needs it

    l1s = {}
    with tempfile.TemporaryDirectory(prefix='temp_unzipped_') as input_extraction_path:
        shutil.unpack_archive(zip_file, input_extraction_path, 'zip')
        for entry in os.listdir(input_extraction_path):
            entry_path = os.path.join(input_extraction_path, entry)
            if os.path.isdir(entry_path):
                # Directory names are unique within one listing, so a plain
                # assignment is enough (no accumulation needed).
                l1s[entry] = len(os.listdir(entry_path))
    return l1s

In [None]:
def get_lg_code(lg_name):
    """Look up the ISO 639-3 code (``part3``) for a language name.

    Args:
        lg_name: Language name passed to ``languages.get(name=...)``.

    Returns:
        The ``part3`` attribute of the matching entry. When the lookup raises
        ``KeyError`` no exception escapes; instead a message string starting
        with ``'Not found ISO-639-3 language code for '`` is returned.
    """
    try:
        entry = languages.get(name=lg_name)
    except KeyError:
        return f'Not found ISO-639-3 language code for {lg_name}'
    else:
        return entry.part3

In [None]:
# ISO 639-3 code for the language selected at the top of the notebook.
# NOTE(review): get_lg_code returns an error-message string instead of raising
# when the name is unknown, so L2_code is not validated here.
L2_code = get_lg_code(language)

In [None]:
# First-language labels that are excluded from processing.
first_language_filtered = [
    'Unknown',
    'Filipino',
    'Other',
    'Bosnian-croatian-serbian',
    'Pakistan',
    'Malay',
    'Philippines',
    'Taiwanese',
]

In [None]:
def language_distance(first_language):
    """Return the lang2vec distances between a first language and ``L2_code``.

    Args:
        first_language: First-language label to look up.

    Returns:
        ``[featural, phonological, syntactic]`` as returned by
        ``l2v.distance`` against the module-level ``L2_code``, or ``None``
        when the label is listed in ``first_language_filtered`` or when no
        ISO 639-3 code is found for it (a message is printed in that case).
    """
    if first_language in first_language_filtered:
        return None

    # Labels that need a different spelling for the ISO 639 name lookup.
    iso_name_for_label = {
        "Chinese-Mandarin": 'Mandarin Chinese',
        "Chinese-Cantonese": 'Yue Chinese',
        'Russian, Ukrainian': 'Ukrainian',
        'Aromanian (Vlach)': 'Aromanian',
        'Greek': 'Modern Greek (1453-)',
        'Farsi': 'Persian',
        'Swahili': 'Swahili (macrolanguage)',
    }
    first_language = iso_name_for_label.get(first_language, first_language)

    L1_code = get_lg_code(first_language)
    if L1_code.startswith('Not found ISO-639-3 language code for '):
        print(f'{first_language}: {L1_code}, essays with this L1 were skipped')
        return None

    return [
        l2v.distance(distance_kind, L1_code, L2_code)
        for distance_kind in ('featural', 'phonological', 'syntactic')
    ]

In [None]:
pre_computed_lang2vec = f'/content/drive/MyDrive/cvae_project/codes/lang2vec/{language}.csv'
# The look-up table is built only when the CSV does not exist yet.
if not os.path.exists(pre_computed_lang2vec):
    loading_packages()
    print(f'No pre-computed lang2vec values prepared for {language}\nPreparing a look up csv file to speed up the codes...')
    corpus_dir = f'/content/drive/MyDrive/cvae_project/1_extract_txt_format/{language}/Learner/'

    # For every corpus archive: top-level folder name -> number of entries.
    corpus_l1s = {}
    for zip_file in os.listdir(corpus_dir):
        if zip_file.endswith('.zip'):
            zip_path = os.path.join(corpus_dir, zip_file)
            l1s = unzip_read(zip_path)
            corpus_l1s[zip_file.replace('.zip', '')] = l1s

    # Distinct folder names across all archives, in first-seen order.
    language_list = []
    for zip_file, l1s in corpus_l1s.items():
        print(f'{zip_file}: {l1s}')
        for l1 in l1s:
            if l1 not in language_list:
                language_list.append(l1)

    os.makedirs(os.path.dirname(pre_computed_lang2vec), exist_ok=True)
    # Write to a sibling temp file and move it into place only when complete:
    # otherwise an interrupted run would leave a truncated CSV that the
    # os.path.exists() guard above would treat as a finished table.
    partial_csv = pre_computed_lang2vec + '.tmp'
    with open(partial_csv, mode='w', newline='') as csv_file:
        fieldnames = ['L1 languages', 'featural scores', 'phonological scores', 'syntactic scores']
        writer = csv.writer(csv_file)
        writer.writerow(fieldnames)

        for lg in tqdm(language_list):
            scores = language_distance(lg)
            # Languages without scores get no row at all.
            if scores is not None:
                new_data = [lg, scores[0], scores[1], scores[2]]
                writer.writerow(new_data)
    os.replace(partial_csv, pre_computed_lang2vec)
    print("\nCSV file created.")


In [None]:
# Load the pre-computed table: L1 name -> [featural, phonological, syntactic].
language_distance_scores = {}

with open(pre_computed_lang2vec, mode='r') as file:
    reader = csv.DictReader(file)

    for row in reader:
        # Deliberately not named `language`: that notebook-level variable holds
        # the language chosen by the user and must not be overwritten here.
        l1_name = row['L1 languages']
        scores = [float(row['featural scores']), float(row['phonological scores']), float(row['syntactic scores'])]
        language_distance_scores[l1_name] = scores

In [None]:
language_distance_scores

{'Arabic': [0.6, 0.5687, 0.57],
 'Tswana': [1.0, 0.5687, 1.0],
 'Greek': [0.5, 0.195, 0.52],
 'Japanese': [0.6, 0.5032, 0.66],
 'Polish': [0.5, 0.2804, 0.59],
 'Chinese': [0.6, 0.5687, 0.57],
 'Persian': [0.6, 0.5687, 0.57],
 'Swedish': [0.5, 0.5687, 0.42],
 'Russian': [0.5, 0.2804, 0.49],
 'Macedonian': [0.6, 0.5687, 0.57],
 'Korean': [0.5, 0.4638, 0.62],
 'Czech': [0.6, 0.5687, 0.66],
 'Aromanian (Vlach)': [0.6, 0.5687, 0.57],
 'Dutch': [0.5, 0.5687, 0.49],
 'Spanish': [0.5, 0.3433, 0.4],
 'Punjabi': [0.6, 0.5687, 0.65],
 'Norwegian': [0.8, 0.5687, 0.59],
 'Bulgarian': [0.5, 0.2804, 0.48],
 'Italian': [0.5, 0.5687, 0.51],
 'Hungarian': [0.5, 0.3433, 0.6],
 'Chinese-Mandarin': [0.6, 0.39, 0.55],
 'Lithuanian': [0.5, 0.3498, 0.68],
 'Urdu': [0.6, 0.5687, 0.67],
 'German': [0.4, 0.3277, 0.42],
 'Chinese-Cantonese': [0.5, 0.5348, 0.59],
 'Turkish': [0.6, 0.3433, 0.7],
 'Serbian': [0.8, 0.8632, 0.62],
 'French': [0.5, 0.427, 0.46],
 'Portuguese': [0.5, 0.5687, 0.47],
 'Bosnian': [0.8, 0.5

# Putting together

In [None]:
# Column names for the feature part of every output row.
# The writer cell raises if a feature vector does not have exactly as many
# entries as this list, so entries must not be added or removed casually.
# NOTE(review): presumably the order mirrors the vector returned by
# morphological_feature_analysis — confirm against that function.
morph_header = [
 'ave_sent_len', 'ttr_word', 'ave_word_len',
 'function_word_type', 'function_word_H', 'function_word_std', 'function_word_range',
 'lexical_density', 'ttr_lemma', 'ave_lemma_len',
 'function_lemma_type', 'function_lemma_H', 'function_lemma_std', 'function_lemma_range',
 'num_pos_type', 'pos_H', 'pos_std', 'pos_range',
 'verb_word_H', 'verb_word_std', 'verb_word_range',
 'verb_lemma_H', 'verb_lemma_std', 'verb_lemma_range',
 'verb_mood_H', 'verb_mood_std', 'verb_mood_range',
 'verb_number_H', 'verb_number_std', 'verb_number_range',
 'verb_person_H', 'verb_person_std', 'verb_person_range',
 'verb_tense_H', 'verb_tense_std', 'verb_tense_range',
 'verb_form_H', 'verb_form_std', 'verb_form_range',
 'verb_valency_H', 'verb_valency_std', 'verb_valency_range',
 'verb_aspect_H', 'verb_aspect_std', 'verb_aspect_range',
 'aux_word_H', 'aux_word_std', 'aux_word_range',
 'aux_lemma_H', 'aux_lemma_std', 'aux_lemma_range',
 'aux_mood_H', 'aux_mood_std', 'aux_mood_range',
 'aux_number_H', 'aux_number_std', 'aux_number_range',
 'aux_person_H', 'aux_person_std', 'aux_person_range',
 'aux_tense_H', 'aux_tense_std', 'aux_tense_range',
 'aux_form_H', 'aux_form_std', 'aux_form_range',
 'deprel_H' , 'deprel_std', 'deprel_range',
 'subordinate_deprel_H', 'subordinate_deprel_std', 'subordinate_deprel_range',
 'subordinate_order_H', 'subordinate_order_std', 'subordinate_order_range','head_finality',
 # NOTE(review): the three verb_valency_* names below already appear earlier
 # in this list, so the header has duplicate column names — confirm intended.
 'verb_valency_H', 'verb_valency_std', 'verb_valency_range',
 'word_order_H', 'word_order_std', 'word_order_range',
 'ave_dep_len', 'ave_clause_len',
 'non_projective_sent_ratio', 'non_projective_word_ratio', 'ave_tree_depth',
 'subordinate_non_projective_sent', 'subordinate_non_projective_word_ratio',
 # NOTE(review): 'verb_raio' looks like a typo for 'verb_ratio'; left as-is
 # because the name is written to the output file header.
 'subordinate_ave_tree_depth', 'verb_raio',
 's_v', 'v_s', 'v_o', 'o_v',
 'subordinate_head_initial', 'subordinate_head_final',
 'verb_mood_Ind', 'verb_mood_Imp', 'verb_mood_Cnd', 'verb_mood_Pot', 'verb_mood_Sub',
 'verb_mood_Jus',  'verb_mood_Prp', 'verb_mood_Qot', 'verb_mood_Opt', 'verb_mood_Des',
 'verb_mood_Nec', 'verb_mood_Irr', 'verb_mood_Adm',
 'verb_number_Sing', 'verb_number_Plur', 'verb_number_Dual', 'verb_number_Tri',
 'verb_number_Pau', 'verb_number_Grpa', 'verb_number_Grpl', 'verb_number_Inv',
 'verb_number_Count', 'verb_number_Ptan', 'verb_number_Coll',
 'verb_person_0', 'verb_person_1', 'verb_person_2', 'verb_person_3', 'verb_person_4',
 'verb_tense_Past', 'verb_tense_Pres', 'verb_tense_Fut', 'verb_tense_Imp', 'verb_tense_Pqp',
 'verb_form_Fin', 'verb_form_Inf', 'verb_form_Sup', 'verb_form_Part', 'verb_form_Conv',
 'verb_form_Gdv', 'verb_form_Ger', 'verb_form_Vnoun',
 'verb_aspect_Imp', 'verb_aspect_Perf', 'verb_aspect_Prosp', 'verb_aspect_Prog',
 'verb_aspect_Hab', 'verb_aspect_Iter',
 'verb_voice_Act', 'verb_voice_Mid', 'verb_voice_Rcp', 'verb_voice_Pass', 'verb_voice_Antip',
 'verb_voice_Lfoc', 'verb_voice_Bfoc', 'verb_voice_Dir', 'verb_voice_Inv', 'verb_voice_Cau',
 'aux_mood_Ind', 'aux_mood_Imp', 'aux_mood_Cnd', 'aux_mood_Pot', 'aux_mood_Sub',
 'aux_mood_Jus', 'aux_mood_Prp', 'aux_mood_Qot', 'aux_mood_Opt', 'aux_mood_Des',
 'aux_mood_Nec', 'aux_mood_Irr', 'aux_mood_Adm',
 'aux_number_Sing', 'aux_number_Plur', 'aux_number_Dual', 'aux_number_Tri', 'aux_number_Pau',
 'aux_number_Grpa', 'aux_number_Grpl', 'aux_number_Inv', 'aux_number_Count', 'aux_number_Ptan',
 'aux_number_Coll',
 'aux_person_0', 'aux_person_1', 'aux_person_2', 'aux_person_3', 'aux_person_4',
 'aux_tense_Past', 'aux_tense_Pres', 'aux_tense_Fut', 'aux_tense_Imp', 'aux_tense_Pqp',
 'aux_form_Fin', 'aux_form_Inf', 'aux_form_Sup', 'aux_form_Part', 'aux_form_Conv',
 'aux_form_Gdv', 'aux_form_Ger', 'aux_form_Vnoun',
 'aux_voice_Act', 'aux_voice_Mid', 'aux_voice_Rcp', 'aux_voice_Pass', 'aux_voice_Antip',
 'aux_voice_Lfoc', 'aux_voice_Bfoc', 'aux_voice_Dir', 'aux_voice_Inv', 'aux_voice_Cau']

# Embedding column names: doc2vec_1 ... doc2vec_300.
doc2vec_header = [f'doc2vec_{i}' for i in range(1, 301)]

lang2vec_header = 'featural, phonological, syntactic'
# Four sections separated by '|'; columns inside a section separated by ', '.
header_sections = [', '.join(morph_header), 'L1', ', '.join(doc2vec_header), lang2vec_header]
header = '|'.join(header_sections) + '\n'
print(header)

ave_sent_len, ttr_word, ave_word_len, function_word_type, function_word_H, function_word_std, function_word_range, lexical_density, ttr_lemma, ave_lemma_len, function_lemma_type, function_lemma_H, function_lemma_std, function_lemma_range, num_pos_type, pos_H, pos_std, pos_range, verb_word_H, verb_word_std, verb_word_range, verb_lemma_H, verb_lemma_std, verb_lemma_range, verb_mood_H, verb_mood_std, verb_mood_range, verb_number_H, verb_number_std, verb_number_range, verb_person_H, verb_person_std, verb_person_range, verb_tense_H, verb_tense_std, verb_tense_range, verb_form_H, verb_form_std, verb_form_range, verb_valency_H, verb_valency_std, verb_valency_range, verb_aspect_H, verb_aspect_std, verb_aspect_range, aux_word_H, aux_word_std, aux_word_range, aux_lemma_H, aux_lemma_std, aux_lemma_range, aux_mood_H, aux_mood_std, aux_mood_range, aux_number_H, aux_number_std, aux_number_range, aux_person_H, aux_person_std, aux_person_range, aux_tense_H, aux_tense_std, aux_tense_range, aux_form_H, 

In [None]:
# Unpack the selected input archive into a per-corpus working folder.
input_extraction_path = f'/content/unzipped_inputs/{corpus}'
shutil.unpack_archive(
    input_directory,
    extract_dir=input_extraction_path,
    format='zip',
)

In [None]:
# Only '.pred' files are inputs; they are processed in sorted file-name order.
file_list = [filename for filename in os.listdir(input_extraction_path) if filename.endswith('.pred')]
file_list.sort()

header_temp = header.split(', ')

# Number of feature columns declared in the header (loop-invariant).
expected_feature_count = len(header.split('|')[0].split(', '))

with open(output_directory, 'w', encoding='utf-8') as outfile:
    outfile.write(header)
    for file_name in tqdm(file_list):
        lg, all_info = morphological_feature_analysis(os.path.join(input_extraction_path, file_name))
        if lg in first_language_filtered or all_info is None:
            continue
        if len(all_info) != expected_feature_count:
            raise ValueError ('feature lenth not compatible')

        if nativeness == 'Native':
            lang2vec = [0, 0, 0]
        else:
            try:
                lang2vec = language_distance_scores[lg]
            except KeyError:
                print(f'essay having {lg} as its first language was skipped')
                # Really skip the essay: falling through would write this row
                # with the previous essay's scores (or fail with a NameError
                # if this is the first row).
                continue

        doc2vec = extract_topic_embedding(file_name)
        # One row: features | first language | doc2vec embedding | lang2vec scores
        outfile.write(', '.join(str(w) for w in all_info) + '|')
        outfile.write(lg + '|')
        outfile.write(', '.join(str(w) for w in doc2vec) + '|')
        outfile.write(', '.join(str(w) for w in lang2vec) + '\n')


  1%|          | 167/25072 [02:14<5:33:37,  1.24it/s]

In [None]:
# Call the function to clear the /content/ directory after using the unzipped files
def clear_content_directory(content_dir='/content/', exempt_folders=('drive', 'sample_data')):
    """Delete everything directly under ``content_dir`` except exempt folders.

    Args:
        content_dir: Directory to clean. Defaults to ``'/content/'``.
        exempt_folders: Names of top-level directories that are left
            untouched. Defaults to ``('drive', 'sample_data')``.

    Top-level regular files are always removed; exemption applies to
    directories only. A symlink that points to a directory is unlinked
    without touching the contents of its target.
    """
    for item in os.listdir(content_dir):
        item_path = os.path.join(content_dir, item)
        if os.path.isdir(item_path):
            if item in exempt_folders:
                continue
            if os.path.islink(item_path):
                # Remove the link itself, never what it points to.
                os.remove(item_path)
            else:
                shutil.rmtree(item_path)
        elif os.path.isfile(item_path):
            os.remove(item_path)

# clear temporary /content/ dir
clear_content_directory()