# ENNI-B1 Morphosyntactic Analysis with Batchalign in Google Colab

This notebook performs morphosyntactic analysis on ENNI-B1 CHAT files using batchalign with GPU acceleration.

In [None]:
# Step 1: Set up GPU and install dependencies
import os
import subprocess
from pathlib import Path
from datetime import datetime
import shutil
import requests
import zipfile
import io

# Check GPU availability
!nvidia-smi

# Install required packages
!pip install batchalign torch numpy==1.24.0

print('âœ“ Environment setup complete')

In [None]:
# Step 2: Download ENNI-B1 files from GitHub
def download_github_repo(repo_url, target_dir, branch='master', timeout=60):
    """Download one branch of a GitHub repository as a zip archive and extract it.

    GitHub branch archives unpack into a single top-level folder named
    ``<repo>-<branch>``, so the repository contents end up in
    ``target_dir/<repo>-<branch>/`` rather than directly in ``target_dir``.

    Args:
        repo_url: Repository URL, with or without a trailing ``.git`` or ``/``.
        target_dir: Directory to extract into; created if it does not exist.
        branch: Branch to download. Defaults to ``'master'``; pass ``'main'``
            for repositories that use that default branch name.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        True if the archive was downloaded and extracted, False otherwise
        (network error, non-200 response, or a payload that is not a zip file).
    """
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    # Strip only a *trailing* ".git"; a blanket replace would also mangle
    # repository names such as "name.github.io".
    base_url = repo_url.strip().rstrip('/')
    if base_url.endswith('.git'):
        base_url = base_url[:-len('.git')]
    zip_url = f'{base_url}/archive/refs/heads/{branch}.zip'

    print(f"Downloading {repo_url}...")
    try:
        response = requests.get(zip_url, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the boolean contract instead of letting network errors escape.
        print(f'✗ Failed to download: {exc}')
        return False

    if response.status_code != 200:
        print(f'✗ Failed to download: HTTP {response.status_code}')
        return False

    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
            zip_ref.extractall(target_dir)
    except zipfile.BadZipFile as exc:
        print(f'✗ Failed to extract archive: {exc}')
        return False

    print(f'✓ Downloaded and extracted to {target_dir}')
    return True
    
# Download your ICL-PILOT repository
github_repo = 'https://github.com/your-username/ICL-PILOT.git'  # Replace with your actual GitHub repo
data_dir = 'icl_pilot_data'

# ENNI-B1 locations, relative to the root of the repository
enni_relative_dirs = ['ENNI_B1_TD', 'ENNI_B1_DLD', 'synthetic_data/ENNI_B1']

# Defined before the download so that later cells never hit a NameError,
# even when the download fails.
enni_dirs = []

if download_github_repo(github_repo, data_dir):
    print('✓ Repository downloaded successfully')

    # A GitHub archive unpacks into one "<repo>-<branch>/" folder, so the
    # repository root is usually one level below data_dir. Use the first
    # candidate that actually contains an ENNI-B1 directory; fall back to
    # data_dir itself so missing directories are still reported below.
    extract_root = Path(data_dir)
    root_candidates = [extract_root] + sorted(
        entry for entry in extract_root.iterdir() if entry.is_dir()
    )
    repo_root = next(
        (
            candidate
            for candidate in root_candidates
            if any((candidate / rel).is_dir() for rel in enni_relative_dirs)
        ),
        extract_root,
    )

    # Check what ENNI-B1 directories we have
    enni_dirs = [str(repo_root / rel) for rel in enni_relative_dirs]

    print('\nENNI-B1 directories found:')
    for enni_dir in enni_dirs:
        if os.path.exists(enni_dir):
            cha_count = len(list(Path(enni_dir).rglob('*.cha')))
            print(f'  - {enni_dir}: {cha_count} .cha files')
        else:
            print(f'  - {enni_dir}: NOT FOUND')
else:
    print('✗ Failed to download repository')

In [None]:
# Step 3: Set up output directories
# One timestamp per run keeps the results of separate runs apart.
output_base = 'analysis_results'
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
utseg_output = '/'.join([output_base, 'utseg_results_' + timestamp])
morphotag_output = '/'.join([output_base, 'morphotag_results_' + timestamp])

# Create both stage directories, then report them in the same order.
stage_outputs = {'Utseg': utseg_output, 'Morphotag': morphotag_output}
for stage_dir in stage_outputs.values():
    os.makedirs(stage_dir, exist_ok=True)

print('Output directories created:')
for stage_label, stage_dir in stage_outputs.items():
    print('  - {}: {}'.format(stage_label, stage_dir))

In [None]:
# Step 4: Run batchalign utseg (utterance segmentation)
def run_batchalign_command(command, input_dir, output_dir, log_file, timeout=900):
    """Run one batchalign subcommand and record its outcome in a log file.

    Args:
        command: Subcommand plus any flags, e.g. ``'utseg'`` or
            ``'morphotag --retokenize'``.
        input_dir: Directory passed to batchalign as its input.
        output_dir: Directory passed to batchalign as its output.
        log_file: Path of the log to write (overwritten on each call); its
            parent directory is created when missing.
        timeout: Seconds to wait before aborting the process
            (default 900, i.e. 15 minutes).

    Returns:
        True if batchalign exited with return code 0; False on a non-zero
        exit, a timeout, or any error while starting the process or writing
        the log.
    """
    # Split only the subcommand string. The directories are passed as single
    # arguments so that paths containing spaces survive intact.
    argv = ['batchalign', *command.split(), str(input_dir), str(output_dir)]
    full_command = f'batchalign {command} {input_dir} {output_dir}'

    def write_log(text):
        """Write `text` to log_file, creating the log directory if needed."""
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write(text)

    print(f'Running: {full_command}')

    try:
        result = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

        write_log(
            f'Command: {full_command}\n\n'
            f'Return code: {result.returncode}\n\n'
            'STDOUT:\n'
            f'{result.stdout}'
            '\nSTDERR:\n'
            f'{result.stderr}'
        )

        if result.returncode == 0:
            print(f'✓ Successfully completed {command}')
            return True

        print(f'✗ Failed {command} (return code: {result.returncode})')
        print(f'  See log: {log_file}')
        return False

    except subprocess.TimeoutExpired:
        print(f'✗ {command} timed out after {timeout / 60:g} minutes')
        write_log(f'Command timed out: {full_command}')
        return False
    except Exception as e:
        # Covers e.g. batchalign not being installed (FileNotFoundError).
        print(f'✗ Exception running {command}: {str(e)}')
        write_log(f'Exception: {str(e)}')
        return False
    
# Run utseg on each directory
utseg_success = 0  # directories where utseg exited cleanly
utseg_total = 0    # directories that exist and were attempted

print('\nStep 4: Running utseg (utterance segmentation)')
print('=' * 60)

for enni_dir in enni_dirs:
    if not os.path.exists(enni_dir):
        # Nothing was run here, so this is a skip rather than a failure.
        print(f'Skipping {enni_dir}: directory not found')
        continue

    utseg_total += 1
    # normpath drops a trailing slash, which would otherwise make basename return ''.
    dir_name = os.path.basename(os.path.normpath(enni_dir))
    dir_output = os.path.join(utseg_output, dir_name)
    os.makedirs(dir_output, exist_ok=True)

    log_file = os.path.join(dir_output, f'{dir_name}_utseg.log')

    if run_batchalign_command('utseg', enni_dir, dir_output, log_file):
        utseg_success += 1

print(f'\nUtseg results: {utseg_success}/{utseg_total} directories processed')

In [None]:
# Step 5: Run batchalign morphotag (morphosyntactic tagging)
# Outcome of the morphotag stage; stays False when the stage is skipped or fails.
morphotag_success = False

if utseg_success > 0:
    print('\nStep 5: Running morphotag (morphosyntactic tagging)')
    print('=' * 60)

    log_file = os.path.join(morphotag_output, 'morphotag.log')

    # Morphotag consumes the utseg output tree produced in Step 4.
    morphotag_success = run_batchalign_command(
        'morphotag --retokenize', utseg_output, morphotag_output, log_file
    )

    if morphotag_success:
        print('\n✓ Morphosyntactic analysis complete!')
    else:
        print('\n✗ Morphotag failed')
else:
    print('\n✗ Skipping morphotag since utseg failed')

In [None]:
# Step 6: Generate summary and download results
def generate_summary():
    """Build the plain-text summary of the analysis run.

    Reads the notebook-level variables ``enni_dirs``, ``utseg_success``,
    ``utseg_total``, ``utseg_output`` and ``morphotag_output``.

    Returns:
        The summary as a single newline-terminated string.
    """
    # Imported here because no earlier cell imports torch or numpy. A missing
    # package is reported in the summary instead of aborting it.
    try:
        import torch
        gpu_available = torch.cuda.is_available()
        torch_version = torch.__version__
    except ImportError:
        gpu_available = 'unknown (torch not installed)'
        torch_version = 'not installed'
    try:
        import numpy
        numpy_version = numpy.__version__
    except ImportError:
        numpy_version = 'not installed'

    lines = [
        'ENNI-B1 Morphosyntactic Analysis Summary',
        f'Generated on: {datetime.now()}',
        '',
        'Environment:',
        f'  - Google Colab with GPU: {gpu_available}',
        f'  - PyTorch version: {torch_version}',
        f'  - NumPy version: {numpy_version}',
        '',
        'Input directories:',
    ]

    for enni_dir in enni_dirs:
        if os.path.exists(enni_dir):
            cha_count = len(list(Path(enni_dir).rglob('*.cha')))
            lines.append(f'  - {enni_dir}: {cha_count} files')

    lines += [
        '',
        'Processing results:',
        f'  - Utseg: {utseg_success}/{utseg_total} directories',
    ]

    if utseg_success > 0:
        # The output directory always exists (it is created up front), so its
        # existence says nothing about success. A notebook-level
        # `morphotag_success` flag is authoritative when one is defined;
        # otherwise look for actual .cha output files.
        morphotag_ok = globals().get('morphotag_success')
        if morphotag_ok is None:
            morphotag_ok = any(Path(morphotag_output).rglob('*.cha'))
        morphotag_status = 'Completed' if morphotag_ok else 'Failed'
        lines.append(f'  - Morphotag: {morphotag_status}')
    else:
        lines.append('  - Morphotag: Skipped (utseg failed)')

    lines += [
        '',
        'Output directories:',
        f'  - Utseg results: {utseg_output}',
        f'  - Morphotag results: {morphotag_output}',
        '',
        'All log files are available in the respective output directories.',
    ]

    return '\n'.join(lines) + '\n'
    
# Generate and display summary
summary_content = generate_summary()
print(summary_content)

# Save summary to file
summary_file = f'{output_base}/analysis_summary_{timestamp}.txt'
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write(summary_content)

print(f'\nSummary saved to: {summary_file}')

# Create a zip file of all results for easy download
archive_prefix = 'enni_b1_analysis_results_'
results_zip = f'{output_base}/{archive_prefix}{timestamp}.zip'

print(f'\nCreating results archive: {results_zip}')

with zipfile.ZipFile(results_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, filenames in os.walk(output_base):
        for filename in filenames:
            # The archive is written inside output_base, so skip result
            # archives: otherwise the zip would contain its own half-written
            # file plus the archives of earlier runs.
            if filename.startswith(archive_prefix) and filename.endswith('.zip'):
                continue
            file_path = os.path.join(root, filename)
            arcname = os.path.relpath(file_path, output_base)
            zipf.write(file_path, arcname)

print(f'✓ Results archive created: {results_zip}')

# Provide download link (the google.colab module only exists inside Colab)
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    print('\n📥 Download your results:')
    files.download(results_zip)
else:
    print(f'\nNot running in Google Colab; the archive is available at: {results_zip}')

print(f'\n🎉 Analysis complete! You can also access all files in: {output_base}')

## Alternative Approach: Manual Analysis with spaCy

If batchalign continues to have issues, here's an alternative using spaCy for morphosyntactic analysis:

In [None]:
# Install spaCy and download English model
!pip install spacy
!python -m spacy download en_core_web_sm

import spacy
from pathlib import Path

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

def analyze_with_spacy(cha_file, output_dir):
    """Analyze the utterances of a CHAT file with spaCy and write a short report.

    Only main-tier lines (``*SPEAKER:<tab>text``, plus their tab-indented
    continuation lines) are passed to spaCy; header lines (``@...``) and
    dependent tiers (``%...``) are skipped. If the file contains no main-tier
    lines, the whole file content is analyzed instead. This is still a
    simplification: CHAT annotation codes inside utterances are not removed.

    Args:
        cha_file: Path of the CHAT (.cha) file to analyze.
        output_dir: Directory for the report; created if it does not exist.

    Returns:
        True if the report ``<stem>_spacy_analysis.txt`` was written, False if
        any step failed (the error is printed).
    """
    import re

    try:
        with open(cha_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Media time bullets are digits wrapped in \x15 control characters.
        bullet_pattern = re.compile('\x15[^\x15]*\x15')

        utterances = []
        in_main_tier = False
        for line in content.splitlines():
            if line.startswith('*'):
                # "*CHI:\ttext" -> keep only the text after the speaker code.
                _speaker, _sep, text = line.partition(':')
                utterances.append(bullet_pattern.sub('', text).strip())
                in_main_tier = True
            elif line.startswith('\t') and in_main_tier:
                # Continuation of the current utterance.
                utterances[-1] += ' ' + bullet_pattern.sub('', line).strip()
            else:
                in_main_tier = False

        text_to_analyze = '\n'.join(u.strip() for u in utterances if u.strip()) or content
        doc = nlp(text_to_analyze)

        # Save analysis results; the report directory may not exist yet.
        report_dir = Path(output_dir)
        report_dir.mkdir(parents=True, exist_ok=True)
        output_file = report_dir / f'{Path(cha_file).stem}_spacy_analysis.txt'

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f'File: {cha_file}\n')
            f.write(f'Tokens: {len(doc)}\n')
            f.write(f'Sentences: {len(list(doc.sents))}\n')
            f.write('\nMorphosyntactic Analysis:\n')

            for token in doc[:50]:  # First 50 tokens as example
                f.write(f'{token.text}\t{token.pos_}\t{token.tag_}\t{token.dep_}\n')

        return True

    except Exception as e:
        print(f'Error analyzing {cha_file}: {e}')
        return False
    
# Example usage
print('Testing spaCy analysis on one file...')

# Look in ./ENNI_B1_TD first, then in any ENNI_B1_TD folder below the working
# directory (the download step extracts the repository into a subfolder).
# next(..., None) avoids a StopIteration when a directory holds no .cha files.
local_enni_dir = Path('./ENNI_B1_TD')
test_file = next(local_enni_dir.rglob('*.cha'), None) if local_enni_dir.is_dir() else None
if test_file is None:
    test_file = next(Path('.').glob('**/ENNI_B1_TD/**/*.cha'), None)

if test_file is None:
    print('No test file found')
elif analyze_with_spacy(test_file, './spacy_analysis'):
    print(f'✓ spaCy analysis complete for {test_file}')
else:
    # analyze_with_spacy already printed the underlying error.
    print(f'✗ spaCy analysis failed for {test_file}')

## Troubleshooting Guide

### Common Issues and Solutions:

1. **GPU not detected**: Make sure you've selected GPU in Colab (Runtime > Change runtime type > GPU)

2. **Batchalign hanging**: Try reducing the number of files per run, or using the `--num_speakers` parameter.

3. **Memory issues**: Colab has limited memory. Process files in smaller batches.

4. **Timeout errors**: Increase the `timeout` value (900 seconds, i.e. 15 minutes) in the `run_batchalign_command` function.

### Manual CHAT File Analysis:

If automated tools fail, you can manually analyze CHAT files using:
- **CLAN software** from TalkBank (http://talkbank.org/clan/)
- **Custom Python scripts** using regular expressions to extract linguistic features
- **Excel/CSV conversion** for manual coding and analysis