In [13]:
import pandas as pd
import re
import nltk
import openai
import json
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import time

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize
lemmatizer = WordNetLemmatizer()

# === Step 0: Load the Base Dictionary from CSV ===
def load_dictionary_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    theme_dict = {}
    for theme in df['theme'].unique():
        theme_dict[theme] = df[df['theme'] == theme]['item'].dropna().tolist()
    return theme_dict

# === Step 1: Cleaning & Lemmatizing (Single Words Only) ===
def clean_and_lemmatize_single_words(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Filter out very short words (optional)
    lemmatized = [word for word in lemmatized if len(word) > 1]
    
    return lemmatized

def clean_and_lemmatize(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

# === Step 2: Synonym Grouping ===
def find_synonyms(word):
    if ' ' in word:
        return []
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            syn_name = lemma.name().replace('_', ' ')
            if syn_name != word:
                synonyms.add(syn_name)
    return list(synonyms)

def group_synonyms(word_list):
    synonym_groups = {}
    processed_words = set()
    
    for word in word_list:
        if word in processed_words:
            continue
        if ' ' not in word:
            syns = find_synonyms(word)
            group = [word]
            for syn in syns:
                if syn in word_list and syn != word:
                    group.append(syn)
                    processed_words.add(syn)
            synonym_groups[word] = group
        else:
            synonym_groups[word] = [word]
        processed_words.add(word)
    
    return synonym_groups

# === Step 3: Embedding Functions ===
def get_embedding(text, model_name="text-embedding-ada-002"):
    try:
        response = openai.Embedding.create(input=[text], model=model_name)
        return response['data'][0]['embedding']
    except Exception as e:
        if "Rate limit" in str(e):
            print(f"Rate limit hit, waiting 1 second before retrying...")
            time.sleep(1)  # Wait 1 second before retrying
            try:
                response = openai.Embedding.create(input=[text], model=model_name)
                return response['data'][0]['embedding']
            except Exception as retry_e:
                print(f"Error embedding '{text}' after retry: {retry_e}")
                return None
        else:
            print(f"Error embedding '{text}': {e}")
            return None

# === Step 4: Find Unique Words/Phrases ===
def extract_unique_items(all_items, existing_dict):
    existing_items = set()
    for theme_items in existing_dict.values():
        existing_items.update(theme_items)
    return list(set(all_items) - existing_items)

# === Step 5: Assign New Words to Themes ===
def assign_to_theme(new_items, theme_dict, threshold=0.8, model_name="text-embedding-ada-002"):
    theme_embeddings = {}

    for theme, items in theme_dict.items():
        item_embeddings = []
        for item in items:
            emb = get_embedding(item, model_name)
            if emb is not None:
                item_embeddings.append(emb)
        if item_embeddings:
            theme_embeddings[theme] = np.mean(item_embeddings, axis=0)
        else:
            print(f"No valid embeddings for theme {theme}")

    assignments = {}
    for item in tqdm(new_items, desc="Assigning new words/phrases"):
        item_emb = get_embedding(item, model_name)
        if item_emb is None:
            continue
        
        sims = {}
        for theme, emb in theme_embeddings.items():
            sims[theme] = cosine_similarity(
                np.array(item_emb).reshape(1, -1),
                np.array(emb).reshape(1, -1)
            )[0][0]
        
        best_theme = max(sims, key=sims.get)
        if sims[best_theme] >= threshold:
            assignments[item] = best_theme

    return assignments

# === Step 6: Update Dictionary ===
def update_dictionary(original_dict, new_assignments):
    updated_dict = original_dict.copy()
    for item, theme in new_assignments.items():
        updated_dict[theme].append(item)
    return updated_dict

# === Step 7: Word Counting Per Theme with Original Word Count ===
def count_items_per_theme(texts, original_descriptions, theme_dict):
    results = []
    for idx, (text_items, orig_desc) in enumerate(zip(texts, original_descriptions)):
        # Get original word count
        orig_tokens = word_tokenize(orig_desc)
        orig_word_count = len(orig_tokens)
        
        # Count themed word occurrences
        theme_counts = {theme: 0 for theme in theme_dict}
        for item in text_items:
            for theme, theme_items in theme_dict.items():
                if item in theme_items:
                    theme_counts[theme] += 1
                    break
        
        row = {
            "host_id": idx, 
            "total_items": len(text_items),
            "original_word_count": orig_word_count
        }
        
        for theme in theme_dict:
            row[f"count_{theme}"] = theme_counts[theme]
            # Calculate proportions based on original word count
            row[f"prop_{theme}"] = theme_counts[theme] / orig_word_count if orig_word_count > 0 else 0
        
        results.append(row)
    
    return pd.DataFrame(results)

# === Step 8: Main ===
def main(dictionary_csv_path, input_csv, input_column, output_clean_csv, output_dict_csv, output_count_csv,
         openai_api_key, use_synonyms=True, embedding_model="text-embedding-ada-002"):
    
    openai.api_key = openai_api_key

    base_dict = load_dictionary_from_csv(dictionary_csv_path)

    df = pd.read_csv(input_csv)
    descriptions = df[input_column].dropna().tolist()

    # Use single word approach
    processed_texts = [clean_and_lemmatize_single_words(desc) for desc in descriptions]

    pd.DataFrame({
        "host_id": range(len(processed_texts)),
        "processed_text": [' '.join(items) for items in processed_texts]
    }).to_csv(output_clean_csv, index=False)

    all_unique_items = list(set([item for text in processed_texts for item in text]))

    if use_synonyms:
        synonym_groups = group_synonyms(all_unique_items)
        synonym_map = {}
        for rep_word, synonyms in synonym_groups.items():
            for syn in synonyms:
                synonym_map[syn] = rep_word
        for i, text in enumerate(processed_texts):
            processed_texts[i] = [synonym_map.get(item, item) for item in text]
        all_unique_items = list(set([item for text in processed_texts for item in text]))

    new_items = extract_unique_items(all_unique_items, base_dict)
    new_assignments = assign_to_theme(new_items, base_dict, model_name=embedding_model)
    final_dict = update_dictionary(base_dict, new_assignments)

    dict_rows = []
    for theme, items in final_dict.items():
        for item in items:
            dict_rows.append({"theme": theme, "item": item})
    pd.DataFrame(dict_rows).to_csv(output_dict_csv, index=False)

    # Use original descriptions for proportions
    count_df = count_items_per_theme(processed_texts, descriptions, final_dict)
    count_df.to_csv(output_count_csv, index=False)

    return {
        "base_dict": base_dict,
        "final_dict": final_dict,
        "new_assignments": new_assignments,
        "counts": count_df
    }

# === Run
if __name__ == "__main__":
    result = main(
         dictionary_csv_path="dictionary.csv",
         input_csv="host_descriptions.csv",
         input_column="host_about",
         output_clean_csv="cleaned_host_descriptions.csv",
         output_dict_csv="final_theme_dictionary.csv",
         output_count_csv="theme_counts_per_host.csv",
         #openai_api_key="",  # replace with your key
         use_synonyms=True,
         embedding_model="text-embedding-ada-002"
    )

    print("✅ Processing completed!")
    print(f"✅ Base dictionary themes: {list(result['base_dict'].keys())}")
    print(f"✅ New assignments: {len(result['new_assignments'])}")
    print(f"✅ Final counts shape: {result['counts'].shape}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/helmadevina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/helmadevina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/helmadevina/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


TypeError: main() missing 1 required positional argument: 'openai_api_key'

In [10]:
import pandas as pd
df = pd.read_csv('dictionary.csv')
print(df.columns)  # See what columns actually exist

Index(['theme', 'item'], dtype='object')


In [6]:
docx_dictionary_path="dictionary-raw.docx"
base_dict = parse_word_file(docx_dictionary_path)
print(json.dumps(base_dict, indent=2))


{
  "showing_personal_charisma": [],
  "presenting_friendly_attitudes": [],
  "promoting_socialisation_and_sharing": [],
  "offering_service_and_assistance": []
}
