In [1]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.4-cp39-cp39-macosx_10_9_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting spacy-loggers<2.0.0,>=1.0.0
  Using cached spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting thinc<8.3.0,>=8.2.2
  Downloading thinc-8.2.3-cp39-cp39-macosx_10_9_x86_64.whl (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.3/880.3 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.8-cp39-cp39-macosx_10_9_x86_64.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typer<0.10.0,>=0.3.0
  Downloading typer-0.9.4-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting srsly<

In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [1]:
#  step 1: processing the textx: lemma, pos tag, and parse the reference native speaker corpus

import spacy

# Load the spaCy model for English
nlp = spacy.load("en_core_web_lg")

# Define the path to your text file
file_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/SM_ENS.txt'

# Process the text file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Process the text with spaCy
doc = nlp(text)

# Define the name of the output file
output_file_path = file_path.replace('.txt', '_processed.txt')

# Open the output file and write the lemmatized form, POS tag, and dependency parse
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for token in doc:
        output_file.write(f"{token.text}\t{token.lemma_}\t{token.pos_}\t{token.dep_}\n")

# Note: This will create a file named `file_processed.txt` with the processed information.


In [7]:
pwd

'/Users/shuyuan/Desktop/CSSMA-master/Corpus linguistics final project'

In [8]:
# step 2: extract the dependecies, raw freq, and normed freq

import spacy
import csv
from collections import Counter

# Load the spaCy model
nlp = spacy.load("en_core_web_lg")

# Initialize counters for each dependency type
dep_counters = {
    'amod': Counter(),
    'advmod': Counter(),
    'dobj': Counter()
}

# Process the text file
input_file_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/SM_ENS.txt'
output_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/SM_ENS.txt/reference corpus_dependecy list.csv'

with open(input_file_path, 'r', encoding='utf-8') as input_file:
    text = input_file.read()
    doc = nlp(text)

    # Count dependencies
    for token in doc:
        if token.dep_ in dep_counters:
            dep_counters[token.dep_][(token.head.text, token.text)] += 1

# Calculate normalized frequencies
norm_factors = {'amod': 1000, 'advmod': 1000, 'dobj': 1000}  # Example normalization factors
normed_freqs = {
    dep: {pair: (count / sum(counter.values()) * norm_factors[dep])
          for pair, count in counter.items()}
    for dep, counter in dep_counters.items()
}

# Prepare data for CSV output, including raw and normed frequencies
rows = []
max_len = max(len(counter) for counter in dep_counters.values())
for i in range(max_len):
    row = []
    for dep, counter in dep_counters.items():
        if i < len(counter):
            pair, raw_freq = counter.most_common()[i]
            # Adjust formatting based on dependency type, taking the new definition into account
            if dep == 'amod':  # Adjectival modifier: adjective + noun
                formatted_pair = f"{pair[1]} {pair[0]}"
            elif dep == 'advmod':  # Adverbial modifier: adverb + adjective or adverb
                formatted_pair = f"{pair[1]} {pair[0]}"
            elif dep == 'dobj':  # Direct object: verb + object
                formatted_pair = f"{pair[0]}+{pair[1]}"
            # Retrieve the normalized frequency of the word pair
            normed_freq = normed_freqs[dep][pair]
            # Extend the current row with the formatted word pair, raw frequency, and normalized frequency
            row.extend([formatted_pair, raw_freq, f"{normed_freq:.2f}"])
        else:
            # Fill the row with placeholders if there's no data for the current index
            row.extend(["", "", ""])
    # Append the completed row to the list of rows
    rows.append(row)


# The rest of the CSV writing process remains the same



# Write to CSV
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    headers = []
    for dep in ['AMOD', 'ADVMOD', 'DOBJ']:
        headers.extend([f"{dep} Pair", f"{dep} Raw Freq", f"{dep} Normed Freq"])
    csvwriter.writerow(headers)  # Header
    csvwriter.writerows(rows)

print("CSV file with raw and normed frequencies has been created.")


NotADirectoryError: [Errno 20] Not a directory: '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/SM_ENS.txt/reference corpus_dependecy list.csv'

In [3]:
# extract dependencies, freq, and normed freq from reference corpus
import spacy
import csv
from collections import Counter

# Load the spaCy model
nlp = spacy.load("en_core_web_lg")

# Initialize counters for each type of dependency pair
amod_counter = Counter()
advmod_counter = Counter()
dobj_counter = Counter()

# Process the text file

input_file_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/SM_ENS.txt'
output_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/reference_SM.csv'


with open(input_file_path, 'r', encoding='utf-8') as input_file:
    text = input_file.read()
    doc = nlp(text)

    # Iterate through tokens in the document
    for token in doc:
        # For AMOD: Adjective + Noun
        if token.dep_ == 'amod' and token.head.pos_ == 'NOUN':
            amod_counter[(token.text, token.head.text)] += 1
        
        # For ADVMOD: Adverb + Adjective/Adverb/Noun
        elif token.dep_ == 'advmod':
            if token.head.pos_ in ['ADJ', 'ADV', 'NOUN']:
                advmod_counter[(token.text, token.head.text)] += 1
        
        # For DOBJ: Verb + Direct Object
        elif token.dep_ == 'dobj' and token.head.pos_ == 'VERB':
            dobj_counter[(token.head.text, token.text)] += 1

# Write the CSV file with specified headers
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    
    # Define the headers
    headers = [
        "AMOD Pair", "AMOD Raw Freq", "AMOD Normed Freq",
        "ADVMOD Pair", "ADVMOD Raw Freq", "ADVMOD Normed Freq",
        "DOBJ Pair", "DOBJ Raw Freq", "DOBJ Normed Freq"
    ]
    csvwriter.writerow(headers)
    
    # Calculate normalization factor as the sum of all counts for each dependency type
    amod_norm_factor = 1000 / sum(amod_counter.values())
    advmod_norm_factor = 1000 / sum(advmod_counter.values())
    dobj_norm_factor = 1000 / sum(dobj_counter.values())

    # Assuming the same number of entries for each type, iterate and write to CSV
    for ((amod_pair, amod_freq), (advmod_pair, advmod_freq), (dobj_pair, dobj_freq)) in zip(
        amod_counter.most_common(), advmod_counter.most_common(), dobj_counter.most_common()):

        # Normalize frequencies
        amod_norm = amod_freq * amod_norm_factor
        advmod_norm = advmod_freq * advmod_norm_factor
        dobj_norm = dobj_freq * dobj_norm_factor
        
        # Write the rows as specified
        csvwriter.writerow([
            ' '.join(amod_pair), amod_freq, f"{amod_norm:.2f}",
            ' '.join(advmod_pair), advmod_freq, f"{advmod_norm:.2f}",
            ' '.join(dobj_pair), dobj_freq, f"{dobj_norm:.2f}"
        ])

print("CSV file with structured dependencies has been created.")


CSV file with structured dependencies has been created.


In [1]:
# calculating the MI score in the reference corpus-updated

import csv
import math
from collections import defaultdict

# Make sure to use the correct path after you upload the CSV to the server

input_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/reference_SM.csv'
output_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/reference_SM_MI.csv'


# Initialize counters and variables for MI calculation
word_counts = defaultdict(int)
pair_counts = defaultdict(int)
total_count = 0

# Process the CSV and calculate word and pair counts
data_rows = []
with open(input_csv_path, 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    # Store the data in memory to use after the file is closed
    data_rows = [row for row in reader]
    for row in data_rows:
        for dep_type in ['AMOD', 'ADVMOD', 'DOBJ']:
            pair = row[f"{dep_type} Pair"]
            if pair:  # Check if the pair exists
                raw_freq = int(row[f"{dep_type} Raw Freq"])
                word1, word2 = pair.split()
                pair_counts[(dep_type, word1, word2)] += raw_freq
                word_counts[word1] += raw_freq
                word_counts[word2] += raw_freq
                total_count += raw_freq

# Calculate PMI scores for each dependency type
pmi_scores = {}
for (dep_type, word1, word2), pair_freq in pair_counts.items():
    p_word1 = word_counts[word1] / total_count
    p_word2 = word_counts[word2] / total_count
    p_pair = pair_freq / total_count
    pmi = math.log2(p_pair / (p_word1 * p_word2))
    pmi_scores[(dep_type, word1, word2)] = pmi

# Write the new CSV file with MI scores
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = reader.fieldnames + ['AMOD MI Score', 'ADVMOD MI Score', 'DOBJ MI Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    # Iterate over the stored data
    for row in data_rows:
        new_row = row.copy()  # Work with a copy of the row
        for dep_type in ['AMOD', 'ADVMOD', 'DOBJ']:
            pair = row[f"{dep_type} Pair"]
            if pair:
                word1, word2 = pair.split()
                # Get the PMI score from the calculated values
                pmi_score = pmi_scores.get((dep_type, word1, word2))
                new_row[f"{dep_type} MI Score"] = f"{pmi_score:.4f}" if pmi_score is not None else ""
        writer.writerow(new_row)

print(f"CSV file with MI scores has been created: {output_csv_path}")


CSV file with MI scores has been created: /Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/reference_SM_MI.csv


In [28]:
# clean the files by adding the MI scores to each proficiency level

import pandas as pd

# Load the two CSV files
MIscore_proficiency = pd.read_csv('/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/MI score/DOBJ/DOBJ_B1_2_cleaned_1.csv')
reference_scores = pd.read_csv('/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/reference_SM_MI.csv')

# Assuming 'student_id' is the common column in both CSV files
# and 'reference_score' is the column name in the reference scores file that you want to add
merged_df = pd.merge(MIscore_proficiency, reference_scores[['DOBJ Pair', 'DOBJ MI Score']], on='DOBJ Pair', how='left')

# Export the merged DataFrame to a new CSV file
merged_df.to_csv('DOBJ_B1_2_MI.csv', index=False)


In [24]:
# data cleaning: replace all the " + " with "+" to find the matched DOBJ

import csv

# Specify the path to your input and output CSV files
input_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/MI score/DOBJ/DOBJ_B1_2_cleaned.csv'
output_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/MI score/DOBJ/DOBJ_B1_2_cleaned_1.csv'

# Open the input CSV file for reading
with open(input_csv_path, mode='r', encoding='utf-8') as infile:
    # Open the output CSV file for writing
    with open(output_csv_path, mode='w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        
        # Iterate through each row in the input CSV
        for row in reader:
            # Check if the row is not empty
            if row:
                # Replace " + " with "+" in the first column
                row[0] = row[0].replace("+", " ")
            # Write the modified row to the output CSV
            writer.writerow(row)

print("The file has been processed and saved to:", output_csv_path)


The file has been processed and saved to: /Users/shuyuan/Desktop/CSSMA-master/Chinese spoken corpus/sophistication/MI score/DOBJ/DOBJ_B1_2_cleaned_1.csv


In [60]:
# add '+' to clean the dobj data in reference corpus

import csv

# Input and output file paths
input_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Corpus linguistics final project/native speaker reference corpus/reference_corpus_MI_score.csv'
output_csv_path = '/Users/shuyuan/Desktop/CSSMA-master/Corpus linguistics final project/native speaker reference corpus/reference_corpus_MI_score_cleaned.csv'

# Read the CSV, modify the "DOBJ Pair" column, and write to a new CSV
with open(input_csv_path, 'r', encoding='utf-8') as input_csvfile, \
     open(output_csv_path, 'w', newline='', encoding='utf-8') as output_csvfile:
    
    reader = csv.DictReader(input_csvfile)
    writer = csv.DictWriter(output_csvfile, fieldnames=reader.fieldnames)
    
    writer.writeheader()
    for row in reader:
        # Modify the "DOBJ Pair" by adding "+" between the words
        if row['DOBJ Pair']:
            words = row['DOBJ Pair'].split()
            row['DOBJ Pair'] = '+'.join(words)
        
        writer.writerow(row)

print(f"CSV file with modified 'DOBJ Pair' column has been created: {output_csv_path}")


CSV file with modified 'DOBJ Pair' column has been created: /Users/shuyuan/Desktop/CSSMA-master/Corpus linguistics final project/native speaker reference corpus/reference_corpus_MI_score_cleaned.csv
