In [None]:
"""
This script performs a comprehensive analysis of a semantically chunked stream-of-consciousness text dataset 
to gain insights into its structure and characteristics. The analysis focuses on chunk-level statistics, 
identifying outliers, visualizing key metrics, and extracting programmatic features that can be used 
for personality classification tasks.

The script is divided into several sections:

1. Exploratory Data Analysis (EDA):
   - Calculates basic descriptive statistics, including word count, character count, and chunk distribution across authors.
   - Identifies outliers in word count using the Interquartile Range (IQR) method and filters them out for further analysis.
   - Visualizes the distribution of word counts, chunk counts per author, and sentiment scores.

2. Text Preprocessing:
   - Cleans the text by removing stopwords and punctuation, and applies stemming using the Porter Stemmer.
   - Computes lexical diversity metrics, including Type-Token Ratio (TTR) and Measure of Textual Lexical Diversity (MTLD).

3. Sentiment Analysis:
   - Uses the VADER Sentiment Intensity Analyzer to calculate sentiment scores for each text chunk.

4. Programmatic Feature Extraction:
   - Extracts various features from the text using SpaCy and NLTK. These features include:
     - Word and character counts
     - Lexical diversity (TTR and MTLD)
     - Sentiment polarity
     - Personal pronoun usage
     - Part-of-speech ratios (adverbs, pronouns, verbs, nouns)
     - Tense distribution (past, present, future)

5. Feature Export:
   - Formats the extracted features into a structured JSON file for use in the personality classification experiment.

""" 

##Chunks EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# --- Load Dataset ---
# The chunk file is JSON Lines: one record per line.
df_real = pd.read_json(
    path_or_buf='/path/to/full_chunked_local_minima_pass_2_0.40.json',
    lines=True,
    orient='records',
)

# --- Descriptive Statistics ---
# Whitespace-delimited token count and raw character length of each chunk.
df_real['word_count'] = df_real['TEXT'].str.split().str.len()
df_real['char_count'] = df_real['TEXT'].str.len()

# Number of chunks contributed by each author.
chunks_per_author = df_real.groupby('#AUTHID').size()

# Outlier thresholds using IQR: a 3x multiplier keeps only the extreme fences.
Q1 = df_real['word_count'].quantile(0.25)
Q3 = df_real['word_count'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

# Filter out extreme outliers (both fences are inclusive).
within_fences = (df_real['word_count'] >= lower_bound) & (df_real['word_count'] <= upper_bound)
df_no_extreme = df_real[within_fences]

# --- Summary Statistics ---
# Totals use the full dataset; mean/median/std use the outlier-free subset.
trimmed_word_counts = df_no_extreme['word_count']
print("=== Chunk Statistics Analysis ===")
print(f"Total Chunks: {len(df_real)}")
print(f"Unique Authors: {df_real['#AUTHID'].nunique()}")
print(f"Average Words per Chunk (no extreme outliers): {trimmed_word_counts.mean():.2f}")
print(f"Median Words per Chunk: {trimmed_word_counts.median():.2f}")
print(f"Standard Deviation: {trimmed_word_counts.std():.2f}")

# --- Outlier Information ---
# Rows strictly outside the fences, listed one per line for manual inspection.
beyond_fences = (df_real['word_count'] < lower_bound) | (df_real['word_count'] > upper_bound)
word_count_outliers = df_real[beyond_fences]
print(f"\nOutliers Detected: {len(word_count_outliers)}")
for _, row in word_count_outliers.iterrows():
    print(f"AuthorID: {row['#AUTHID']}, Chunk Number: {row['Chunk Number']}, Words: {row['word_count']}")

# --- Visualizations ---
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['axes.grid'] = False

# Each entry: (values, bin count, title, x-axis label, y-axis label).
#   1. Word Count Distribution (Without Outliers)
#   2. Chunks per Author Distribution
histogram_specs = [
    (df_no_extreme['word_count'], 50,
     "Word Count Distribution (Without Outliers)", "Words per Chunk", "Frequency"),
    (chunks_per_author, 30,
     "Chunks per Author Distribution", "Chunks", "Authors"),
]

for values, n_bins, plot_title, x_label, y_label in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=values, kde=True, bins=n_bins, color="skyblue", ax=ax)
    # Dashed vertical line marks the mean of the plotted values.
    ax.axvline(values.mean(), color='navy', linestyle='--', label='Mean')
    ax.set_title(plot_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()
    plt.show()

# --- Preprocessing ---
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Return *text* with stop words and edge punctuation removed, then stemmed.

    The text is split on whitespace. For every token, leading and trailing
    ASCII punctuation is stripped first, and only then is the token compared
    (case-insensitively) against the English stop-word set. Doing the strip
    first means tokens such as "the," or "(and" are recognised as stop words.
    Tokens that consist solely of punctuation are dropped, so they do not leave
    empty entries (and doubled spaces) in the output.

    Args:
        text: Raw chunk text.

    Returns:
        The surviving tokens, Porter-stemmed and joined by single spaces.
    """
    kept = []
    for raw_word in text.split():
        word = raw_word.strip(string.punctuation)
        # Skip punctuation-only tokens and stop words.
        if not word or word.lower() in stop_words:
            continue
        kept.append(stemmer.stem(word))
    return " ".join(kept)

df_real['cleaned_text'] = df_real['TEXT'].apply(preprocess_text)

# --- Lexical Diversity ---
def calculate_ttr(text):
    """Return the type-token ratio of *text*.

    The text is split on whitespace; the ratio is the number of distinct
    tokens divided by the total number of tokens. Text with no tokens
    yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    distinct = set(tokens)
    return len(distinct) / len(tokens)

def calculate_mtld(text, threshold=0.72):
    """Return a forward-pass MTLD (Measure of Textual Lexical Diversity) score.

    The text is split on whitespace and scanned left to right. A running
    type-token ratio (TTR) is tracked for the current segment; each time it
    drops below *threshold* one full "factor" is counted and the segment is
    reset. Words left over after the last full factor contribute a partial
    factor proportional to how far their TTR has fallen from 1 towards the
    threshold. The score is the total number of words divided by the factor
    count, so higher values mean more diverse text.

    Unlike a plain mean of completed segment lengths, the trailing words are
    not discarded, and a text whose TTR never falls below the threshold no
    longer scores 0.

    Args:
        text: Text to score; tokens are whitespace-delimited.
        threshold: TTR value below which a factor is closed.

    Returns:
        0 for text with no tokens. Otherwise the number of words divided by
        the (possibly fractional) factor count. If no factor evidence was
        accumulated at all (every word unique), the text counts as a single
        factor and its length is returned.
    """
    words = text.split()
    if not words:
        return 0

    factors = 0.0
    segment_length = 0
    segment_types = set()
    for word in words:
        segment_length += 1
        segment_types.add(word)
        if len(segment_types) / segment_length < threshold:
            factors += 1
            segment_length = 0
            segment_types = set()

    # Partial credit for the unfinished trailing segment. Skipped when
    # threshold >= 1, where the denominator would be zero or negative.
    if segment_length and threshold < 1:
        remainder_ttr = len(segment_types) / segment_length
        factors += (1 - remainder_ttr) / (1 - threshold)

    if factors == 0:
        # TTR stayed at 1 throughout: treat the whole text as one factor.
        return float(len(words))
    return len(words) / factors

# Lexical diversity is computed on the stemmed, stop-word-free text.
df_real['TTR'] = df_real['cleaned_text'].apply(calculate_ttr)
df_real['MTLD'] = df_real['cleaned_text'].apply(calculate_mtld)

# --- Sentiment Analysis ---
# The lexicon must be downloaded before the analyzer is constructed.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Score the raw TEXT rather than cleaned_text: VADER is lexicon- and rule-based,
# so stemming breaks lexicon look-ups and stop-word removal strips negations
# and intensifiers ("not", "no", "very", "but") that its rules depend on.
df_real['sentiment_score'] = df_real['TEXT'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Sentiment Score Distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df_real['sentiment_score'], kde=True, color="skyblue", bins=30, ax=ax)
ax.set_title("Distribution of Sentiment Scores")
ax.set_xlabel("Sentiment Score")
ax.set_ylabel("Frequency")
plt.show()

# --- Word Cloud ---
def generate_word_cloud(text, title):
    """Render and display a word cloud built from *text*.

    Args:
        text: Source text passed to ``WordCloud.generate``.
        title: Title shown above the figure.
    """
    cloud_image = WordCloud(
        background_color='white',
        width=800,
        height=400,
    ).generate(text)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    # Hide the axes; they carry no meaning for an image.
    plt.axis('off')
    plt.title(title)
    plt.show()

# Generate Word Cloud
# All cleaned chunks are concatenated into one string for a corpus-wide cloud.
all_text = " ".join(df_real['cleaned_text'].tolist())
generate_word_cloud(text=all_text, title="Word Cloud for All Text")

# --- Export Features ---
# Writes the whole frame, including the columns derived above, as JSON Lines.
export_path = "/path/to/full_programmatic_features_extracted.json"
df_real.to_json(path_or_buf=export_path, lines=True, orient='records')
print(f"Features exported to {export_path}")


In [None]:
# Set style parameters
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['axes.grid'] = False

personality_traits = ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']

# Create a mapping dictionary for trait names
trait_name_map = {
    'cOPN': 'Openness to Experience',
    'cCON': 'Conscientiousness',
    'cEXT': 'Extraversion',
    'cAGR': 'Agreeableness',
    'cNEU': 'Neuroticism',
}

# Create combination column for all traits: the five per-trait labels are
# concatenated in the order listed in personality_traits.
df_real['combination'] = df_real[personality_traits].agg(''.join, axis=1)

# Get combination counts (most frequent combination first)
combination_counts = df_real['combination'].value_counts()

# Calculate proportions for each trait.
# value_counts() orders classes by frequency, which differs per trait and per
# dataset. Sorting the class index gives every trait the same row order, so
# stack order, colours and legend entries always refer to the same class.
# A class absent for some trait gets proportion 0 instead of NaN.
trait_proportions = pd.DataFrame({
    trait_name_map[trait]: df_real[trait].value_counts(normalize=True)
    for trait in personality_traits
}).sort_index().fillna(0.0)

# Legend text is derived from the actual class values instead of being
# assigned by position. Assumes the binary labels are 'n'/'y' (TODO confirm
# against the dataset); any other value is shown unchanged.
class_display_names = {'n': 'No', 'y': 'Yes'}
proportions_for_plot = trait_proportions.rename(
    index=lambda value: class_display_names.get(str(value).lower(), str(value))
)

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Plot 1: Distribution of Personality Trait Combinations
ax1 = axes[0]
combination_counts.plot(kind='bar', ax=ax1, color='mediumaquamarine', alpha=1.0)
ax1.set_title('Distribution of Personality Trait Combinations', fontsize=14)
ax1.set_xlabel('Trait Combinations (cOPN, cCON, cEXT, cAGR, cNEU)')
ax1.set_ylabel('Frequency')
ax1.text(0.98, 0.95, f"n = {len(df_real)}", transform=ax1.transAxes, ha="right", va="top", fontsize=10,
         bbox=dict(facecolor="white", alpha=0.8, boxstyle="round"))
ax1.tick_params(axis='x', rotation=45)

# Add value labels on top of bars
for i, v in enumerate(combination_counts):
    ax1.text(i, v, str(v), ha='center', va='bottom')

# Plot 2: Distribution of Binary Labels for Big Five Personality Traits
# After the transpose each trait is one stacked bar and each class one layer.
ax2 = axes[1]
proportions_for_plot.T.plot(kind='bar', stacked=True, ax=ax2,
                            color=['lightcoral', 'lightgreen'], alpha=1.0)
ax2.set_title('Distribution of Binary Labels for Big Five Personality Traits', fontsize=14)
ax2.set_xlabel('Personality Trait')
ax2.set_ylabel('Proportion')
ax2.tick_params(axis='x', rotation=0)

# Adjust legend position and style; labels come from the renamed class index.
ax2.legend(title='Class', bbox_to_anchor=(1.05, 1), loc='upper left')

# Add n = annotation
ax2.text(0.98, 0.95, f"n = {len(df_real)}", transform=ax2.transAxes, ha="right", va="top", fontsize=10,
         bbox=dict(facecolor="white", alpha=0.8, boxstyle="round"))

# Add proportion labels (fractions in [0, 1], two decimals) inside each layer
for c in ax2.containers:
    ax2.bar_label(c, fmt='%.2f', label_type='center')

# Adjust layout
plt.tight_layout()
plt.show()

# Print numerical summaries
# For every trait: one line per class with its absolute count and its share.
print("\nIndividual Trait Distributions:")
for trait in personality_traits:
    trait_labels = df_real[trait]
    counts = trait_labels.value_counts()
    percentages = trait_labels.value_counts(normalize=True) * 100
    print(f"\n{trait}:")
    for label, count in counts.items():
        print(f"{label}: {count} ({percentages.loc[label]:.2f}%)")

# Programmatic Features

In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
from collections import Counter
import spacy
import json

# --- Load Dataset ---
# Same JSON Lines chunk file as the EDA cell; reloaded so this cell is self-contained.
df_real = pd.read_json(
    path_or_buf='/path/to/full_chunked_local_minima_pass_2_0.40.json',
    lines=True,
    orient='records',
)

# --- Ensure necessary downloads and load models ---
# The VADER lexicon has to be present before the analyzer is constructed.
nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_lg")

# --- Utility Functions ---
def clean_text(text):
    """Lower-case *text* and collapse every whitespace run to a single space.

    Leading and trailing whitespace is removed. Implemented with
    ``str.split``/``str.join`` so that it does not depend on the ``re``
    module, which this file never imports (the regex form raised
    ``NameError`` on first call).

    Args:
        text: Raw chunk text.

    Returns:
        The normalised, lower-cased string.
    """
    # split() with no argument splits on whitespace runs and drops the
    # empty leading/trailing pieces, which is exactly collapse + strip.
    return " ".join(text.lower().split())

def extract_features(text):
    """Compute the programmatic feature set for one text chunk.

    The text is normalised with ``clean_text``, parsed with the module-level
    spaCy pipeline ``nlp`` and scored with the module-level VADER analyzer
    ``sid``. Whitespace and punctuation tokens are excluded from every count.

    Tense counts are derived from fine-grained tags:
      * past    - tokens tagged VBD or VBN
      * future  - a base-form verb (VB) directly preceded by "will"/"shall"
      * present - tokens tagged VB, VBP or VBZ that are not part of a
                  future construction
    A base-form verb following will/shall is counted once, as future; it is
    not additionally counted as present.

    Args:
        text: Raw chunk text.

    Returns:
        dict with keys ``word_count``, ``char_count``, ``type_token_ratio``,
        ``personal_pronoun_count``, ``sentiment_polarity``,
        ``tense_past_ratio``, ``tense_present_ratio``, ``tense_future_ratio``
        and ``pos_{adv,pron,verb,noun}_ratio``. Ratios are fractions; an
        empty text yields 0 / 0.0 for all of them.
    """
    text = clean_text(text)
    doc = nlp(text)

    tokens = [token for token in doc if not token.is_space and not token.is_punct]
    word_count = len(tokens)
    # Characters are counted over the kept tokens only (no spaces/punctuation).
    char_count = sum(len(t.text) for t in tokens)

    unique_tokens = {t.text for t in tokens}
    ttr = len(unique_tokens) / word_count if word_count > 0 else 0.0

    # First-person pronouns only (singular and plural).
    personal_pronouns = {"i", "me", "my", "mine", "we", "us", "our", "ours"}
    pp_count = sum(1 for t in tokens if t.text.lower() in personal_pronouns)

    polarity = sid.polarity_scores(text)["compound"]

    # Coarse POS shares over the kept tokens. The comprehension is empty when
    # there are no tokens, so no division by zero can occur.
    pos_counts = Counter(t.pos_ for t in tokens)
    total_tokens_for_pos = sum(pos_counts.values())
    pos_ratios = {pos.lower(): count / total_tokens_for_pos for pos, count in pos_counts.items()}

    tags = [t.tag_ for t in tokens]
    # Indices of base-form verbs directly preceded by "will"/"shall".
    # Adjacency is judged on the filtered token list.
    future_positions = {
        i + 1
        for i, t in enumerate(tokens[:-1])
        if t.text.lower() in ("will", "shall") and tags[i + 1] == "VB"
    }
    future_verbs = len(future_positions)
    past_verbs = sum(1 for tag in tags if tag in ("VBD", "VBN"))
    present_verbs = sum(
        1
        for i, tag in enumerate(tags)
        if tag in ("VB", "VBP", "VBZ") and i not in future_positions
    )

    # max(1, ...) keeps the ratios at 0.0 when no verb was found.
    total_verbs = max(1, past_verbs + present_verbs + future_verbs)

    features = {
        "word_count": word_count,
        "char_count": char_count,
        "type_token_ratio": ttr,
        "personal_pronoun_count": pp_count,
        "sentiment_polarity": polarity,
        "tense_past_ratio": past_verbs / total_verbs,
        "tense_present_ratio": present_verbs / total_verbs,
        "tense_future_ratio": future_verbs / total_verbs,
    }

    # Only these four coarse POS classes are exported; shares of all other
    # classes are computed above but intentionally not included.
    for pos_tag in ["adv", "pron", "verb", "noun"]:
        features[f"pos_{pos_tag}_ratio"] = pos_ratios.get(pos_tag, 0.0)

    return features

# --- Feature Extraction ---
# Identifier and label columns copied unchanged into every feature row,
# in this order, ahead of the computed features.
passthrough_columns = ["#AUTHID", "Chunk Number", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]

feature_rows = []
for _, row in tqdm(df_real.iterrows(), total=len(df_real), desc="Extracting Features"):
    feats = extract_features(row['TEXT'])
    feats_row = {column: row[column] for column in passthrough_columns}
    feature_rows.append({**feats_row, **feats})

# --- Convert to DataFrame ---
df_features = pd.DataFrame(feature_rows)

# --- Post-processing / Normalization ---
# Personal pronouns as a percentage of words; NaN where a chunk has no words.
has_words = df_features['word_count'] > 0
pronoun_share = (df_features['personal_pronoun_count'] / df_features['word_count']) * 100
df_features['personal_pronoun_percentage'] = np.where(has_words, pronoun_share, np.nan)

# Every *_ratio column gets a *_pct twin scaled to 0-100 (POS first, then tense).
ratio_columns = [
    'pos_adv_ratio', 'pos_pron_ratio', 'pos_verb_ratio', 'pos_noun_ratio',
    'tense_past_ratio', 'tense_present_ratio', 'tense_future_ratio',
]
for col in ratio_columns:
    pct_column = col[:-len('_ratio')] + '_pct'
    df_features[pct_column] = df_features[col] * 100

# --- Formatting ---
def format_features(row):
    """Render one feature row as a single human-readable summary string.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            ``word_count``, ``char_count``, ``type_token_ratio``,
            ``personal_pronoun_percentage``, ``sentiment_polarity`` and the
            ``pos_*_pct`` / ``tense_*_pct`` values.

    Returns:
        Comma-separated description of the features, with the POS and tense
        percentages grouped in square brackets.
    """
    def percent_group(prefix, labels):
        # "Adv: 1.0%, Pron: 2.0%, ..." for the given column prefix.
        return ", ".join(
            "{}: {:.1f}%".format(label, row["{}_{}_pct".format(prefix, label.lower())])
            for label in labels
        )

    pos_summary = percent_group("pos", ("Adv", "Pron", "Verb", "Noun"))
    tense_summary = percent_group("tense", ("Past", "Present", "Future"))

    segments = [
        "Word Count: {}".format(row['word_count']),
        "Char Count: {}".format(row['char_count']),
        "Lexical Diversity (TTR): {:.2f}".format(row['type_token_ratio']),
        "Personal Pronouns (as percentage of words): {:.1f}%".format(row['personal_pronoun_percentage']),
        "Sentiment Polarity Score (VADER): {:.3f}".format(row['sentiment_polarity']),
        "POS Distribution (as percentage of all tokens): [" + pos_summary + "]",
        "Tense Distribution (derived from verb tags): [" + tense_summary + "]",
    ]
    return ", ".join(segments)

# One formatted summary string per chunk.
df_features['features_text'] = df_features.apply(format_features, axis=1)

# --- Export to JSON ---
# Only the chunk identifiers and the formatted summary are exported, as a
# single pretty-printed JSON array.
export_path = "/path/to/full_programmatic_features_extracted.json"
export_columns = ['#AUTHID', 'Chunk Number', 'features_text']
records = df_features[export_columns].to_dict(orient='records')
with open(export_path, 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"Features exported successfully to {export_path}")
