## Put this .ipynb file anywhere on your computer. In the same directory, create a new folder and put the two text files 'words-surf.txt' and 'words-SIG.txt' into this new folder

## Notes
### words-SIG.txt
1. If there is no ';' (semicolon) in a parse (meaning there's no morphology in this word): remove this parse and its corresponding word in 'words-surf.txt'

### words-surf.txt
2. '%0%WORD%' (e.g., '%0%клубдаяй%' in Lezgi-QusarDialect): remove this word
> reason: no template? Ask Beth
3. '1.1' (e.g., 'Билесувардиз1.1яр' in Lezgi-QusarDialect): remove this word
> reason: no root? Ask Beth
4. '{}' (denoting a missing gloss, e.g., 'áíhapi̱siyhobáchilííchi̱ki̱li{}ho̱' in Choctaw): remove this word
5. '}' in Sena3: remove this word
> reason: ?
6. ',' (caused by the messy data in the database, e.g., 'i̱-, imchhobáchi' in Choctaw): remove this word
7. '∅': u'\u2205'
> (representing a zero affix in Takwane): w = w.replace(u'\u2205', '')
8. 'ø': u'\u00F8'
> (representing a zero affix in Takwane): w = w.replace(u'\u00F8', '')
9. '^0' (representing a zero affix in Sena3): w = w.replace('^0', '')
10. 'alt: ' in Sena3, e.g., 'pyalt: nkhundu-nkhundu': w = w.replace('alt: ', '')
11. Replace space(s) in the word with '\~': w = w.replace(' ', '\~')

In [1]:
# Post-process the data

# Code-point sequences that must stay together as ONE symbol when a word is
# split into space-separated characters.  They are written with escapes so the
# two-code-point (decomposed) spelling cannot be silently changed by an editor
# that normalises text.
# NOTE(review): the earlier code described its literals as "encoded as two
# characters"; confirm these are the intended decomposed sequences.
_MULTI_CODEPOINT_SYMBOLS = (
    '\u0438\u0306',  # Cyrillic 'и' + combining breve (Lezgi)
    'a\u0301',       # Latin 'a' + combining acute accent (Sena)
)

# Substrings that cause a word/parse pair to be discarded entirely.
_REJECT_MARKERS = ('1.1', '{', '}', ',')

# Substrings deleted from a kept pair, in this order: the two zero-affix
# signs, the '^0' zero-affix marker, and the 'alt: ' prefix.
_REMOVED_SUBSTRINGS = ('\u2205', '\u00F8', '^0', 'alt: ')


def _is_rejected(word, parse):
    """Return True when the (lower-cased) pair must be dropped."""
    # No ';' anywhere in the pair: the word carries no morphology.
    if ';' not in word and ';' not in parse:
        return True
    # Words of the form '%0%WORD%'.
    if word.startswith('%'):
        return True
    return any(marker in word or marker in parse for marker in _REJECT_MARKERS)


def _clean_field(text):
    """Delete the unwanted substrings, then replace every space with '~'."""
    for unwanted in _REMOVED_SUBSTRINGS:
        text = text.replace(unwanted, '')
    return text.replace(' ', '~')


def _split_symbols(word):
    """Split *word* into symbols, keeping the multi-code-point ones whole."""
    symbols = []
    position = 0
    while position < len(word):
        for sequence in _MULTI_CODEPOINT_SYMBOLS:
            if word.startswith(sequence, position):
                symbols.append(sequence)
                position += len(sequence)
                break
        else:
            symbols.append(word[position])
            position += 1
    return symbols


def postprocessing(folderName):
    """Clean the word/parse files of one folder and write the cleaned copies.

    Reads './<folderName>/words-surf.txt' and './<folderName>/words-SIG.txt'
    line by line (line i of one file belongs to line i of the other) and
    writes 'words-surf-preprocessed.txt' and 'words-SIG-preprocessed.txt'
    into the same folder.

    Processing steps:
      * both fields are lower-cased;
      * a pair is dropped when it contains no ';', when the word starts with
        '%', or when the pair contains '1.1', '{', '}' or ',';
      * in kept pairs the zero-affix signs, '^0' and 'alt: ' are deleted and
        spaces become '~';
      * duplicate pairs are removed, keeping the first occurrence, so the
        output order is the input order and is identical on every run;
      * each word is written with its symbols separated by single spaces;
      * each parse is upper-cased and its ';' separators become spaces.

    Neither output file ends with a trailing newline.

    Args:
        folderName: name of the folder (relative to the current working
            directory) that holds the two input files.

    Raises:
        OSError: if an input file cannot be read or an output file cannot
            be written.
    """
    folder = './' + folderName + '/'

    # 'with' guarantees the handles are closed even if reading fails.
    with open(folder + 'words-surf.txt', 'r', encoding='utf-8') as surf_in, \
            open(folder + 'words-SIG.txt', 'r', encoding='utf-8') as sig_in:
        # Each word line ends with a space (U+0020) before the newline;
        # parse lines have no such trailing space.
        words = [line.rstrip(' \n') for line in surf_in]
        parses = [line.rstrip('\n') for line in sig_in]

    seen = set()
    unique_pairs = []
    for raw_word, raw_parse in zip(words, parses):
        word = raw_word.lower()
        parse = raw_parse.lower()
        if _is_rejected(word, parse):
            continue
        pair = (_clean_field(word), _clean_field(parse))
        # Order-preserving de-duplication (a bare set() would shuffle lines).
        if pair not in seen:
            seen.add(pair)
            unique_pairs.append(pair)

    surf_lines = [' '.join(_split_symbols(word)) for word, _ in unique_pairs]
    sig_lines = [parse.upper().replace(';', ' ') for _, parse in unique_pairs]

    with open(folder + 'words-surf-preprocessed.txt', 'w',
              encoding='utf-8') as surf_out, \
            open(folder + 'words-SIG-preprocessed.txt', 'w',
                 encoding='utf-8') as sig_out:
        surf_out.write('\n'.join(surf_lines))
        sig_out.write('\n'.join(sig_lines))

In [2]:
# Run postprocessing() on one data folder by passing the folder's name.
# For example, if the folder is named 'Sena', call postprocessing('Sena').
# The call writes two new files, 'words-surf-preprocessed.txt' and
# 'words-SIG-preprocessed.txt', into that same folder.
postprocessing('Lezgi-QusarDialect-nouns')

In [3]:
# Count the number of tokens produced by data augmentation and the number remaining after the augmented data are post-processed

def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at *path*."""
    # Stream the file instead of loading every line into a list, and let
    # 'with' close the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


def counting(folderName):
    """Print how many words the raw and the preprocessed files contain.

    Counts the lines of 'words-surf.txt', 'words-SIG.txt',
    'words-surf-preprocessed.txt' and 'words-SIG-preprocessed.txt' inside
    './<folderName>/' and prints one sentence per file.

    For the two raw files the reported number is the line count minus one.
    NOTE(review): presumably the raw files carry one trailing line that is
    not a word - confirm against the data.

    Args:
        folderName: name of the folder (relative to the current working
            directory) that holds the four files.

    Raises:
        OSError: if any of the four files cannot be opened, e.g. because the
            preprocessed files have not been generated yet.
    """
    folder = './' + folderName + '/'

    # Count everything first so nothing is printed if a file is missing.
    surf_raw = _count_lines(folder + 'words-surf.txt')
    sig_raw = _count_lines(folder + 'words-SIG.txt')
    surf_clean = _count_lines(folder + 'words-surf-preprocessed.txt')
    sig_clean = _count_lines(folder + 'words-SIG-preprocessed.txt')

    print('There are', surf_raw - 1, 'words in', folderName, 'words-surf.txt;')
    print('There are', surf_clean, 'words in', folderName,
          'words-surf-preprocessed.txt.')
    print('There are', sig_raw - 1, 'words in', folderName, 'words-SIG.txt;')
    print('There are', sig_clean, 'words in', folderName,
          'words-SIG-preprocessed.txt.')

In [4]:
# Run counting() on one data folder by passing the folder's name.
# For example, if the folder is named 'Sena', call counting('Sena').
# It reads the raw files and the '-preprocessed' files from that folder and
# prints the number of words reported for each of them.
counting('Lezgi-QusarDialect-nouns')

There are 1942863 words in Lezgi-QusarDialect-nouns words-surf.txt;
There are 851447 words in Lezgi-QusarDialect-nouns words-surf-preprocessed.txt.
There are 1942863 words in Lezgi-QusarDialect-nouns words-SIG.txt;
There are 851447 words in Lezgi-QusarDialect-nouns words-SIG-preprocessed.txt.


In [5]:
# If you do not want to sort the output, you can skip this function

def sorting(folderName):
    """Sort the preprocessed word/parse pairs and write them to new files.

    Reads './<folderName>/words-surf-preprocessed.txt' and
    './<folderName>/words-SIG-preprocessed.txt', pairs line i of one file
    with line i of the other, sorts the pairs, and writes the words to
    'surf-preprocessed-sorted.txt' and the parses to
    'SIG-preprocessed-sorted.txt' in the same folder.  The two output files
    stay aligned line by line and have no trailing newline.

    Args:
        folderName: name of the folder (relative to the current working
            directory) that holds the preprocessed files.

    Raises:
        OSError: if an input file cannot be read or an output file cannot
            be written.
    """
    folder = './' + folderName + '/'

    # 'with' closes the handles even if reading or writing fails.
    with open(folder + 'words-surf-preprocessed.txt', 'r',
              encoding='utf-8') as surf_in, \
            open(folder + 'words-SIG-preprocessed.txt', 'r',
                 encoding='utf-8') as sig_in:
        words = [line.rstrip('\n') for line in surf_in]
        parses = [line.rstrip('\n') for line in sig_in]

    # Sort on the tab-joined text so the ordering is exactly that of the
    # joined "word<TAB>parse" strings, while keeping the fields as tuples so
    # they never have to be split apart again.
    ordered = sorted(zip(words, parses),
                     key=lambda pair: pair[0] + '\t' + pair[1])

    with open(folder + 'surf-preprocessed-sorted.txt', 'w',
              encoding='utf-8') as surf_out, \
            open(folder + 'SIG-preprocessed-sorted.txt', 'w',
                 encoding='utf-8') as sig_out:
        surf_out.write('\n'.join(word for word, _ in ordered))
        sig_out.write('\n'.join(parse for _, parse in ordered))

In [6]:
# Run sorting() on one data folder by passing the folder's name.
# For example, if the folder is named 'Sena', call sorting('Sena').
# The call reads the two '-preprocessed' files from that folder and writes
# two new files, 'surf-preprocessed-sorted.txt' and
# 'SIG-preprocessed-sorted.txt', into the same folder.
sorting('Lezgi-QusarDialect-nouns')