# Sequential tagging

## Part-of-speech Tagging

In [None]:
import pandas as pd
from textblob import TextBlob

# Read data; the 'text' column is expected to hold one document per row
df = pd.read_csv("processed_cleaned_data.csv")


def pos_tag_text(text):
    """Return the TextBlob part-of-speech tags for *text*.

    Args:
        text: The document to tag. Non-string values (for example the NaN
            that pandas produces for an empty CSV field) are treated as a
            document without tokens.

    Returns:
        A list of ``(word, tag)`` tuples; empty when *text* is not a string.
    """
    if not isinstance(text, str):
        # TextBlob refuses non-string input, which would abort the whole apply()
        return []
    return TextBlob(text).tags


# Tag every document and store the list of (word, tag) tuples in 'pos_tags'
df['pos_tags'] = df['text'].apply(pos_tag_text)

# One row per (word, tag) tuple. A document without tags explodes to a single
# NaN row, which would otherwise end up as an all-NaN line in the output.
tagged = df.explode('pos_tags').dropna(subset=['pos_tags'])

# Split the tuples into 'word' and 'tag' in a single pass instead of building
# one pd.Series per token
pairs = pd.DataFrame(tagged['pos_tags'].tolist(), columns=['word', 'tag'])
pos_df = tagged.assign(
    # to_numpy() assigns positionally; explode() leaves duplicate index labels
    word=pairs['word'].to_numpy(),
    tag=pairs['tag'].to_numpy(),
)

# Number of occurrences of each (word, tag) pair across the whole corpus
pos_df['number'] = pos_df.groupby(['word', 'tag'])['word'].transform('count')

# Keep one row per distinct (word, tag) pair, in order of first appearance
final_df = pos_df[['word', 'tag', 'number']].drop_duplicates()

# Print the first few rows to confirm
print(final_df.head())

# Save the results to a new CSV file (utf-8-sig writes a BOM)
output_file = "pos_tagged_result.csv"
final_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"POS tagging completed and saved to '{output_file}'")

## Shallow Parsing (Chunking) / Dependency Parsing with spaCy

In [None]:
import spacy
import pandas as pd
from spacy import displacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Read the data
df = pd.read_csv("processed_cleaned_data.csv")
texts = df['text']

# Randomly sample up to 10 texts (random_state fixed for reproducibility).
# Capping n avoids the ValueError that Series.sample raises when the file
# has fewer rows than requested.
sampled_texts = texts.sample(n=min(10, len(texts)), random_state=5)

# Print and visualize the dependency parse of each sampled text
for text in sampled_texts:
    if not isinstance(text, str):
        # Empty CSV fields are read back as NaN, which nlp() cannot process
        continue

    doc = nlp(text)
    print(f"Processing text: {text}\n")

    # Per token: text, dependency label, head text, head POS, and its children
    for token in doc:
        print(token.text, token.dep_, token.head.text, token.head.pos_,
              [child.text for child in token.children])

    # Visualize the dependency parse (displayed inline when run in a notebook)
    displacy.render(doc, style="dep")

    print("\n" + "="*50 + "\n")  # Separator for different text outputs

In [None]:
from pathlib import Path

# Render the dependency parse of each sampled text to an SVG file.
# Relies on `sampled_texts`, `nlp` and `displacy` defined in the previous cell.

# Output directory; created if it does not exist yet
output_dir = Path("images")
output_dir.mkdir(exist_ok=True)

for i, text in enumerate(sampled_texts):
    if not isinstance(text, str):
        # Empty CSV fields are read back as NaN, which nlp() cannot process.
        # The index is still consumed so the other file names stay unchanged.
        continue

    doc = nlp(text)

    # jupyter=False makes render() return the SVG markup instead of displaying it
    svg = displacy.render(doc, style="dep", jupyter=False)

    # File name is "-_<i>.svg"; i is the position in the sample, so names
    # cannot collide
    output_path = output_dir / f"-_{i}.svg"

    # Save the SVG file
    output_path.write_text(svg, encoding="utf-8")

## Named Entity Recognition (NER)

In [None]:
import spacy
from spacy import displacy
import pandas as pd

# Read data
df = pd.read_csv("processed_cleaned_data.csv")
texts = df['text']

# Randomly sample up to 20 texts (random_state fixed for reproducibility).
# Capping n avoids the ValueError that Series.sample raises when the file
# has fewer rows than requested.
sampled_texts = texts.sample(n=min(20, len(texts)), random_state=3)

# Load SpaCy model.
# NOTE: "en_core_web_trf" was the alternative pipeline tried here; swap the
# model name below to use it instead.
nlp = spacy.load("en_core_web_sm")

# Run named entity recognition on each sampled text and visualize the result
for text in sampled_texts:
    if not isinstance(text, str):
        # Empty CSV fields are read back as NaN, which nlp() cannot process
        continue

    doc = nlp(text)
    # Highlight the recognized entities (displayed inline when run in a notebook)
    displacy.render(doc, style="ent")
    print("\n" + "="*50 + "\n")  # Separator for different text outputs