In [None]:
import pandas as pd
import spacy
import re
import unicodedata
from collections import Counter
import ast
import re
from sklearn.model_selection import train_test_split

In [None]:

# Load the spaCy pipeline once at module level; `nlp` is reused by the
# per-document processing loop further down in this cell.
nlp = spacy.load("en_core_web_trf")

# Stop-word collection exposed by the loaded pipeline's language defaults.
# NOTE(review): not referenced in the visible part of this notebook — confirm
# it is used elsewhere before removing.
STOP_WORDS = nlp.Defaults.stop_words


def normalize_text(text: str) -> str:
    """Return a lower-cased, ASCII-only, whitespace-collapsed copy of ``text``.

    Processing order:

    1. NFKC-normalize, then lower-case.
    2. Fold the typographic apostrophe (U+2019) to ``'``.
    3. Expand negative contractions ("can't" -> "can not", "won't" ->
       "will not", "shan't" -> "shall not", any other "...n't" -> "... not").
    4. Replace every character that is not ``a-z``, ``0-9``, whitespace or
       ``'`` with a space.
    5. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: Raw input string.

    Returns:
        The normalized string (possibly empty).
    """
    # NFKC must run before lower(): it can emit upper-case letters itself
    # (e.g. "℃" -> "°C"), which the a-z filter below would otherwise delete.
    text = unicodedata.normalize("NFKC", text).lower()
    # NFKC leaves U+2019 untouched; without this fold "don’t" would lose its
    # apostrophe to the character filter and never be expanded.
    text = text.replace("\u2019", "'")

    # Irregular contractions first, so the generic rule cannot produce
    # "ca not" / "wo not" / "sha not".
    text = re.sub(r"\bcan't\b", "can not", text)
    text = re.sub(r"\bwon't\b", "will not", text)
    text = re.sub(r"\bshan't\b", "shall not", text)
    # No leading \b here: inside "don't" there is no word boundary between
    # "o" and "n", so a pattern anchored with \b would never match.
    text = re.sub(r"n't\b", " not", text)

    text = re.sub(r"[^a-z0-9\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def extract_lexical_features(tokens: list) -> dict:
    """Compute document-level lexical statistics from a list of tokens.

    Args:
        tokens: Sequence of token-like objects exposing ``.text`` and
            ``.pos_`` attributes. May be empty.

    Returns:
        A dict with, in this order:
        ``total_tokens``, ``unique_tokens``, ``type_token_ratio``,
        ``avg_token_length``, ``hapax_legomena_ratio``, ``pct_noun``,
        ``pct_verb``, ``pct_adj``, ``pct_adv`` (all ratios rounded to three
        decimals, ``0`` for an empty input) and ``top_pos_bigrams``, a list of
        up to five ``((pos1, pos2), count)`` pairs, most frequent first.
    """
    total = len(tokens)
    texts = [tok.text for tok in tokens]
    pos_tags = [tok.pos_ for tok in tokens]

    def ratio(numerator):
        # Single guard shared by every ratio, so an empty document yields 0
        # everywhere instead of raising ZeroDivisionError.
        return round(numerator / total, 3) if total > 0 else 0

    freq = Counter(texts)
    hapax = sum(1 for count in freq.values() if count == 1)

    pos_counts = Counter(pos_tags)
    pos_dist = {p: ratio(pos_counts[p]) for p in ["NOUN", "VERB", "ADJ", "ADV"]}

    # Adjacent POS pairs; zip stops at the shorter sequence, so zero or one
    # token gives no bigrams.
    top_bigrams = Counter(zip(pos_tags, pos_tags[1:])).most_common(5)

    return {
        "total_tokens": total,
        "unique_tokens": len(freq),
        "type_token_ratio": ratio(len(freq)),
        "avg_token_length": ratio(sum(len(text) for text in texts)),
        "hapax_legomena_ratio": ratio(hapax),
        **{f"pct_{p.lower()}": share for p, share in pos_dist.items()},
        "top_pos_bigrams": top_bigrams
    }

# Size of the stratified sample and the seed that makes it reproducible.
SAMPLE_SIZE = 700
RANDOM_STATE = 42

df_full = pd.read_csv("Training_Essay_Data.csv")

# Rows without text cannot be normalized (normalize_text calls str methods on
# the value), so exclude them before sampling. With no missing text this
# leaves the sample unchanged.
essays_with_text = df_full.dropna(subset=["text"])
df_sample, _ = train_test_split(
    essays_with_text,
    train_size=SAMPLE_SIZE,
    stratify=essays_with_text["generated"],
    random_state=RANDOM_STATE
)
df = df_sample.reset_index(drop=True)

# --------------------------------------------------
#  Parsing, token-level features, stats and NER
# --------------------------------------------------
records_tokens = []
records_stats = []
records_entities = []

for idx, raw, label in zip(df.index, df["text"], df["generated"]):
    doc_id = int(idx)

    # Normalize and parse
    doc = nlp(normalize_text(raw))

    # Materialize the sentences once: they are needed for the per-token
    # records and for the sentence count.
    sentences = list(doc.sents)

    # --- token-level & syntactic features ---
    # Walking sentence by sentence yields the tokens in document order while
    # avoiding two `tok.sent` lookups per token.
    token_objs = []
    for sent in sentences:
        sent_start = sent.start
        for tok in sent:
            if tok.is_space:
                continue
            token_objs.append(tok)
            records_tokens.append({
                "doc_id":      doc_id,
                "label":       label,
                # Token offset of the sentence's first token: a stable
                # per-document sentence identifier, not an ordinal index.
                "sentence":    sent_start,
                "position":    tok.i - sent_start,
                "token":       tok.text,
                "lemma":       tok.lemma_,
                "pos":         tok.pos_,
                "tag":         tok.tag_,
                "is_stop":     tok.is_stop,
                "dep":         tok.dep_,
                "head":        tok.head.text,
                "head_pos":    tok.head.pos_,
                "is_root":     tok.dep_ == "ROOT",
                "depth":       len(list(tok.ancestors)),
                "num_children":len(list(tok.children))
            })

    # --- document-level lexical statistics ---
    feats = extract_lexical_features(token_objs)
    records_stats.append({
        "doc_id":        doc_id,
        "label":         label,
        "num_sentences": len(sentences),
        **feats
    })

    # --- named entity extraction ---
    for ent in doc.ents:
        records_entities.append({
            "doc_id":       doc_id,
            "label":        label,
            "entity_text":  ent.text,
            "entity_label": ent.label_,
            "start_char":   ent.start_char,
            "end_char":     ent.end_char,
            # Same convention as the token table's "sentence" column.
            "sentence_idx": ent.sent.start
        })


df_tokens   = pd.DataFrame(records_tokens)
df_stats    = pd.DataFrame(records_stats)
df_entities = pd.DataFrame(records_entities)

# Note: the list-valued "top_pos_bigrams" column is written to CSV as its
# string representation.
df_tokens.to_csv("deep_parsed_tokens.csv", index=False)
df_stats.to_csv("lexical_stats.csv",       index=False)
df_entities.to_csv("named_entities.csv",    index=False)

print("3 file generati: deep_parsed_tokens.csv, lexical_stats.csv, named_entities.csv")

In [None]:
def parse_bigrams(bigram_str):
    """Turn a stored bigram cell back into a Python object.

    Args:
        bigram_str: Either the string representation of a list of
            ``((pos1, pos2), count)`` pairs (as produced by a CSV round-trip),
            or an already-parsed list/tuple.

    Returns:
        The parsed literal for a valid string, the input itself when it is
        already a list or tuple, and ``[]`` for anything else (missing values,
        malformed strings).
    """
    # Already parsed (in-memory DataFrame): hand it back untouched.
    # ast.literal_eval rejects non-string objects with ValueError, which would
    # otherwise turn valid data into an empty list.
    if isinstance(bigram_str, (list, tuple)):
        return bigram_str
    if not isinstance(bigram_str, str):
        return []
    try:
        return ast.literal_eval(bigram_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Malformed literal: keep the best-effort fallback, but no longer
        # swallow unrelated exceptions such as KeyboardInterrupt.
        return []

# Parse only cells that are not already lists. After a CSV round-trip the
# column holds string representations; a freshly built df_stats holds real
# lists, and sending those through the parser would replace them with [].
df_stats['top_pos_bigrams'] = df_stats['top_pos_bigrams'].apply(
    lambda cell: cell if isinstance(cell, (list, tuple)) else parse_bigrams(cell)
)

# Flatten to one record per (document, POS bigram).
# Each cell is a list of ((POS1, POS2), count) pairs.
records_bigrams = [
    {
        'doc_id': doc_id,
        'label': label,
        'pos_bigram': f"{pos1}_{pos2}",
        'count': count
    }
    for doc_id, label, bigrams in zip(
        df_stats['doc_id'], df_stats['label'], df_stats['top_pos_bigrams']
    )
    for (pos1, pos2), count in bigrams
]

# Explicit columns keep the CSV header even when there are no records.
df_bigrams = pd.DataFrame(
    records_bigrams, columns=['doc_id', 'label', 'pos_bigram', 'count']
)

df_bigrams.to_csv("pos_bigrams.csv", index=False)

In [None]:
# Rewrite lexical_stats.csv without a stray index column.
# errors="ignore" makes this cell re-runnable: the file is written with
# index=False, so "Unnamed: 0" is normally absent and a plain drop would
# raise KeyError.
df = pd.read_csv("/content/lexical_stats.csv")
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
# Single write without the index; a preceding write with the index would only
# be overwritten.
df.to_csv("/content/lexical_stats.csv", index=False)