In [21]:
import spacy

In [45]:
# word calculator 
import os

def count_words_in_folder(folder_path):
    """Return the total whitespace-delimited word count of all ``.txt`` files in a folder.

    Args:
        folder_path: Directory to scan. Only its direct entries are considered;
            entries whose name does not end in ``.txt`` are ignored.

    Returns:
        int: Sum over the matching files of ``len(text.split())``, where each
        file is read as UTF-8. Returns 0 when no file matches.
    """
    word_tally = 0
    # Lazily pick out the text files among the directory entries.
    text_entries = (entry for entry in os.listdir(folder_path) if entry.endswith('.txt'))
    for entry in text_entries:
        with open(os.path.join(folder_path, entry), 'r', encoding='utf-8') as handle:
            # str.split() with no argument splits on any run of whitespace.
            word_tally += len(handle.read().split())
    return word_tally

# Directory whose .txt files are counted; edit this path to measure a different folder.
folder_path = '/home/user/Corpus linguistics final project/Chinese learner corpus/WE_CHN_B1_2_N210'
word_count = count_words_in_folder(folder_path)
print(f"The total number of words in the folder is: {word_count}")


The total number of words in the folder is: 50774


In [3]:
# lemma, POS tag, parse and calculate RTTR based on RTTR function
import os
import csv
import spacy
from collections import Counter
import math


# Function to calculate root Type-Token Ratio (RTTR)
def calculate_rttr(types):
    """Compute the Root Type-Token Ratio (RTTR, Guiraud's index) of a frequency table.

    RTTR = T / sqrt(N), where T is the number of distinct types and N is the
    total number of tokens. The previous implementation had numerator and
    denominator swapped (tokens / sqrt(types)), which grows with text length
    instead of measuring diversity.

    Args:
        types: Mapping (e.g. ``collections.Counter``) from an item to its
            occurrence count.

    Returns:
        The RTTR as a float, or 0 when the table holds no tokens.
    """
    total_tokens = sum(types.values())  # N: total number of occurrences
    unique_types = len(types)           # T: number of distinct items
    # Guard both counts so an empty (or all-zero) table cannot divide by zero.
    if unique_types <= 0 or total_tokens <= 0:
        return 0
    return unique_types / math.sqrt(total_tokens)


# Initialize SpaCy (large English pipeline; provides lemmas and dependency labels)
nlp = spacy.load("en_core_web_lg")

# Path to the folder containing text files
folder_path = "/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/ICNALE_SM/ICNALE_SM_CHN_A2_0_N056"

# CSV output file (relative path: written to the kernel's working directory)
output_csv = "SM_phraseological_diversity_A2.csv"

# Header for the CSV file.
# NOTE(review): columns are labelled "TTR" but hold the values returned by
# calculate_rttr(); labels are left unchanged so existing consumers keep working.
csv_header = ['Filename', 'AMOD TTR', 'ADVMOD TTR', 'DOBJ TTR']

# Process each text file in the folder.
# sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
# the CSV row order reproducible across runs and machines.
results = []
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):  # Ensure we're only processing .txt files
        continue
    file_path = os.path.join(folder_path, filename)
    # Explicit encoding: without it open() uses the platform locale encoding,
    # which is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Process the text with SpaCy
    doc = nlp(text)

    # Count (head lemma, dependent lemma) pairs for each relation of interest
    relations = {'amod': Counter(), 'advmod': Counter(), 'dobj': Counter()}
    for token in doc:
        if token.dep_ in relations:
            relations[token.dep_][(token.head.lemma_, token.lemma_)] += 1

    # Calculate RTTR for each relation
    amod_rttr = calculate_rttr(relations['amod'])
    advmod_rttr = calculate_rttr(relations['advmod'])
    dobj_rttr = calculate_rttr(relations['dobj'])

    # Append the results for this file
    results.append([filename, amod_rttr, advmod_rttr, dobj_rttr])

# Write the results to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_header)  # Write the header
    writer.writerows(results)  # One row per processed file

print(f"Phraseological diversity metrics written to {output_csv}")


Phraseological diversity metrics written to SM_phraseological_diversity_A2.csv


In [9]:
# include calculation of the mean and sd
import os
import csv
import spacy
from collections import Counter
import math
import statistics  # Import statistics module for mean and standard deviation calculations

# Function to calculate root Type-Token Ratio (RTTR)
def calculate_rttr(types):
    """Compute the Root Type-Token Ratio (RTTR, Guiraud's index) of a frequency table.

    RTTR = T / sqrt(N), where T is the number of distinct types and N is the
    total number of tokens. The previous implementation had numerator and
    denominator swapped (tokens / sqrt(types)), which grows with text length
    instead of measuring diversity.

    Args:
        types: Mapping (e.g. ``collections.Counter``) from an item to its
            occurrence count.

    Returns:
        The RTTR as a float, or 0 when the table holds no tokens.
    """
    total_tokens = sum(types.values())  # N: total number of occurrences
    unique_types = len(types)           # T: number of distinct items
    # Guard both counts so an empty (or all-zero) table cannot divide by zero.
    if unique_types <= 0 or total_tokens <= 0:
        return 0
    return unique_types / math.sqrt(total_tokens)

# Initialize SpaCy (large English pipeline; provides lemmas and dependency labels)
nlp = spacy.load("en_core_web_lg")

# Path to the folder containing text files
folder_path = "/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/ICNALE_SM/ICNALE_SM_CHN_B1_2_N312"

# CSV output file (relative path: written to the kernel's working directory)
output_csv = "SM_phraseological_diversity_B1_2.csv"

# Header for the CSV file.
# NOTE(review): columns are labelled "TTR" but hold the values returned by
# calculate_rttr(); labels are left unchanged so existing consumers keep working.
csv_header = ['Filename', 'AMOD TTR', 'ADVMOD TTR', 'DOBJ TTR']

# Per-file rows plus one list of scores per relation for the summary statistics
results = []
amod_ttrs = []
advmod_ttrs = []
dobj_ttrs = []

# sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
# the CSV row order reproducible across runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):  # Ensure we're only processing .txt files
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Process the text with SpaCy
    doc = nlp(text)

    # Count (head lemma, dependent lemma) pairs for each relation of interest
    relations = {'amod': Counter(), 'advmod': Counter(), 'dobj': Counter()}
    for token in doc:
        if token.dep_ in relations:
            relations[token.dep_][(token.head.lemma_, token.lemma_)] += 1

    # Calculate RTTR for each relation
    amod_rttr = calculate_rttr(relations['amod'])
    advmod_rttr = calculate_rttr(relations['advmod'])
    dobj_rttr = calculate_rttr(relations['dobj'])

    # Keep the scores for the corpus-level mean / SD
    amod_ttrs.append(amod_rttr)
    advmod_ttrs.append(advmod_rttr)
    dobj_ttrs.append(dobj_rttr)

    # Append the results for this file
    results.append([filename, amod_rttr, advmod_rttr, dobj_rttr])

# Mean and sample standard deviation per relation.
# statistics.stdev() needs at least two data points, hence the len() > 1 guard.
mean_amod = statistics.mean(amod_ttrs) if amod_ttrs else 0
sd_amod = statistics.stdev(amod_ttrs) if len(amod_ttrs) > 1 else 0
mean_advmod = statistics.mean(advmod_ttrs) if advmod_ttrs else 0
sd_advmod = statistics.stdev(advmod_ttrs) if len(advmod_ttrs) > 1 else 0
mean_dobj = statistics.mean(dobj_ttrs) if dobj_ttrs else 0
sd_dobj = statistics.stdev(dobj_ttrs) if len(dobj_ttrs) > 1 else 0

# Write the results to a CSV file
with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(csv_header)  # Write the header
    writer.writerows(results)  # Write the individual file results

    # Summary statistics are appended as two-column (label, value) rows after
    # the per-file rows, so they do not line up with the four-column header.
    writer.writerow(['Mean AMOD TTR', mean_amod])
    writer.writerow(['SD AMOD TTR', sd_amod])
    writer.writerow(['Mean ADVMOD TTR', mean_advmod])
    writer.writerow(['SD ADVMOD TTR', sd_advmod])
    writer.writerow(['Mean DOBJ TTR', mean_dobj])
    writer.writerow(['SD DOBJ TTR', sd_dobj])

print(f"Phraseological diversity metrics written to {output_csv}")



Phraseological diversity metrics written to SM_phraseological_diversity_B1_2.csv


In [6]:
# Show the kernel's working directory; the relative CSV output paths in this notebook resolve against it.
pwd

'/Users/shuyuan/Desktop/CSSMA-master/Corpus linguistics final project'

In [10]:
# generate frequency list & calculate raw and normalized frequencies
import os
import csv
import spacy
from collections import Counter

# Initialize SpaCy (large English pipeline; provides lemmas and dependency labels)
nlp = spacy.load("en_core_web_lg")

# Path to the folder containing text files
folder_path = "/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/ICNALE_SM/ICNALE_SM_CHN_A2_0_N056"

# Output CSV file for the frequency list (relative path: written to the working directory)
output_csv_freq = "dependency_frequencies_A2.csv"

# One counter of (head lemma, dependent lemma) pairs per relation, pooled over all files
dependencies = {'amod': Counter(), 'advmod': Counter(), 'dobj': Counter()}
total_word_count = 0

# sorted(): os.listdir() returns entries in arbitrary order; a fixed order keeps
# the ranking of equally frequent pairs reproducible (ties keep first-seen order).
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):  # Ensure we're only processing .txt files
        continue
    file_path = os.path.join(folder_path, filename)
    # Explicit encoding: without it open() uses the platform locale encoding,
    # which is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Process the text with SpaCy
    doc = nlp(text)
    # NOTE(review): len(doc) counts every spaCy token, punctuation and whitespace
    # tokens included, so the "per 100 words" denominator is really tokens — confirm
    # this is the intended normalization.
    total_word_count += len(doc)

    # Collect tokens for each relation
    for token in doc:
        if token.dep_ in dependencies:
            dep_pair = (token.head.lemma_, token.lemma_)  # Pair of governor and dependent
            dependencies[token.dep_][dep_pair] += 1

# Drop pairs seen only once, rank by raw frequency (descending), and attach the
# frequency normalized per 100 tokens.
sorted_dependencies = {}
for dep, counter in dependencies.items():
    repeated = [(pair, freq) for pair, freq in counter.items() if freq > 1]
    repeated.sort(key=lambda item: item[1], reverse=True)
    sorted_dependencies[dep] = [(pair, freq, (freq / total_word_count) * 100) for pair, freq in repeated]

# The three ranked lists are written side by side, so the longest one sets the row count
max_length = max(len(sorted_dependencies[dep]) for dep in sorted_dependencies)

# Write the sorted and normalized frequencies to a CSV file
with open(output_csv_freq, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Write the headers
    writer.writerow(['AMOD Dependency', 'AMOD Raw Freq', 'AMOD Norm Freq (per 100 words)',
                     'ADVMOD Dependency', 'ADVMOD Raw Freq', 'ADVMOD Norm Freq (per 100 words)',
                     'DOBJ Dependency', 'DOBJ Raw Freq', 'DOBJ Norm Freq (per 100 words)'])

    # Write the content
    for i in range(max_length):
        row = []
        for dep in ['amod', 'advmod', 'dobj']:
            # Check if the index exists for the dependency list
            if i < len(sorted_dependencies[dep]):
                dep_pair_str = ' - '.join(sorted_dependencies[dep][i][0])  # Convert tuple to string
                row.extend([dep_pair_str] + list(sorted_dependencies[dep][i][1:]))
            else:
                # If there are no more dependencies for this type, add empty values
                row.extend(['', '', ''])
        writer.writerow(row)

print(f"Dependency frequency list written to {output_csv_freq}")



Dependency frequency list written to dependency_frequencies_A2.csv


In [14]:
# updated version with the large spacy, USE THIS ONE!!


import csv
import spacy
from collections import Counter

# Initialize SpaCy (large English pipeline; provides POS tags and dependency labels)
nlp = spacy.load("en_core_web_lg")

# Path to the file containing the text
file_path = "/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/ICNALE_SM/SM_CHN_B1_2.txt"

# Output CSV file for the frequency list (relative path: written to the working directory)
output_csv_freq = "dependency_frequencies_B1_2.csv"

# One counter of surface-form word pairs per relation
dependencies = {'amod': Counter(), 'advmod': Counter(), 'dobj': Counter()}

# Read the text file.
# Explicit encoding: without it open() uses the platform locale encoding,
# which is not UTF-8 everywhere.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Process the text with SpaCy
doc = nlp(text)
# NOTE(review): len(doc) counts every spaCy token, punctuation and whitespace
# tokens included, so the "per 100 words" denominator is really tokens — confirm
# this is the intended normalization.
total_word_count = len(doc)

# Collect tokens for each relation, keyed by the surface text of the pair:
#   amod   - adjective + noun head only
#   advmod - adverb + adjective/verb head only
#   dobj   - head + object, any head POS
for token in doc:
    if token.dep_ == 'amod' and token.head.pos_ == 'NOUN':
        dependencies['amod'][f"{token.text} {token.head.text}"] += 1
    elif token.dep_ == 'advmod' and token.head.pos_ in ('ADJ', 'VERB'):
        dependencies['advmod'][f"{token.text} {token.head.text}"] += 1
    elif token.dep_ == 'dobj':
        dependencies['dobj'][f"{token.head.text} + {token.text}"] += 1

# Drop pairs seen only once, rank by raw frequency (descending), and attach the
# frequency normalized per 100 tokens.
sorted_dependencies = {}
for dep, counter in dependencies.items():
    repeated = [(pair, freq) for pair, freq in counter.items() if freq > 1]
    repeated.sort(key=lambda item: item[1], reverse=True)
    sorted_dependencies[dep] = [(pair, freq, (freq / total_word_count) * 100) for pair, freq in repeated]

# Write the sorted and normalized frequencies to a CSV file
with open(output_csv_freq, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Write the headers
    writer.writerow(['AMOD Pair', 'AMOD Raw Freq', 'AMOD Norm Freq (per 100 words)',
                     'ADVMOD Pair', 'ADVMOD Raw Freq', 'ADVMOD Norm Freq (per 100 words)',
                     'DOBJ Pair', 'DOBJ Raw Freq', 'DOBJ Norm Freq (per 100 words)'])

    # The three ranked lists are written side by side, so the longest one sets the row count
    max_length = max(len(sorted_dependencies[dep]) for dep in sorted_dependencies)

    # Write the content
    for i in range(max_length):
        row = []
        for dep in ['amod', 'advmod', 'dobj']:
            if i < len(sorted_dependencies[dep]):
                # Write the pair, raw frequency, and normalized frequency
                pair, raw_freq, norm_freq = sorted_dependencies[dep][i]
                row.extend([pair, raw_freq, norm_freq])
            else:
                # If there are no more dependencies for this type, add empty values
                row.extend(['', '', ''])
        writer.writerow(row)

print(f"Dependency frequency list written to {output_csv_freq}")


Dependency frequency list written to dependency_frequencies_B1_2.csv
