In [None]:
%pip install pandas

In [22]:
import json
import csv
import os
import random
import string
import pandas as pd
import numpy as np
from datetime import datetime

## Raw files to .csv:

In [28]:
# Define the directory containing the input files
input_dir = 'transcript'

def process_txt_file(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)
        
        header = ['time', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness','Surprise','Neutral']
        csv_writer.writerow(header)
        
        for line in infile:
            data = json.loads(line.strip())
            row = [int(data['time'])] + [int(score)//1e4 for score in data['scores']]
            csv_writer.writerow(row)

# Function to recursively process nested content
def process_content(csv_writer, content):
    if isinstance(content, list):
        for item in content:
            process_content(csv_writer, item)
    elif isinstance(content, dict):
        role = content.get('role', '')
        content_text = ''
        time = int(content.get('time', '0'))
        user_id = content.get('user_id', '')
        if 'content' in content:
            content_data = content['content']
            if isinstance(content_data, list):
                for sub_item in content_data:
                    if isinstance(sub_item, dict) and sub_item.get('type') == 'text':
                        content_text = sub_item.get('text', '')
            elif isinstance(content_data, str):
                content_text = content_data
        csv_writer.writerow([role, content_text, time, user_id])

# Function to process .json files and convert them to CSV
def process_json_file(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)
        
        header = ['role', 'content', 'time', 'user_id']
        csv_writer.writerow(header)
        
        data_list = json.load(infile)
        
        for item in data_list:
            process_content(csv_writer, item)

# Walk through all directories and files
for root, dirs, files in os.walk(input_dir):
    for filename in files:
        if filename.endswith('.txt'):
            input_file = os.path.join(root, filename)
            output_file = os.path.join(root, 'processed_' + filename.replace('.txt', '.csv'))
            process_txt_file(input_file, output_file)
        elif filename.endswith('.json') and not filename.startswith(('pre', 'post', 'chat')):
            input_file = os.path.join(root, filename)
            output_file = os.path.join(root, 'processed_' + filename.replace('.json', '.csv'))
            process_json_file(input_file, output_file)


## Calculating pre-VAD avg emotion scores

In [37]:
import csv
import os
import random
import string
import json
from datetime import datetime, timedelta

EMOTION_NAMES = ["Anger", "Disgust", "Fear", "Happiness", "Sadness", "Surprise", "Neutral"]

def read_csv_file(file_path):
    data = []
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data.append(row)
    return data

def calculate_average_scores(raw_scores, target_time, time_window):
    sum_scores = {emotion: 0 for emotion in EMOTION_NAMES}
    count = 0
    for score_data in raw_scores:
        score_time = int(score_data['time'])
        if abs(score_time - target_time) <= time_window:
            for emotion in EMOTION_NAMES:
                sum_scores[emotion] += float(score_data[emotion])
            count += 1
    
    if count > 0:
        avg_scores = {emotion: round(sum_score / count, 2) for emotion, sum_score in sum_scores.items()}
    else:
        avg_scores = {emotion: 0 for emotion in EMOTION_NAMES}
    return avg_scores

def get_relative_time_ms(start_time_str, time_str):
    start_time = datetime.strptime(start_time_str, "%Y%m%d_%H%M%S")
    current_time = datetime.strptime(time_str, "%Y%m%d_%H%M%S")
    time_diff = current_time - start_time
    return int(time_diff.total_seconds() * 1000)

def process_folder(root):
    print(f"Processing folder: {root}")
    user_data_csv = None
    raw_scores_csv = None
    pre_chat_json = None
    post_chat_json = None

    for filename in os.listdir(root):
        if filename.startswith('processed_Emili_') and filename.endswith('.csv') and not filename.endswith('_condensed.csv'):
            if filename.startswith('processed_Emili_raw_'):
                raw_scores_csv = os.path.join(root, filename)
            else:
                user_data_csv = os.path.join(root, filename)
        elif filename.startswith('pre_chat_emotions_'):
            pre_chat_json = os.path.join(root, filename)
        elif filename.startswith('post_chat_emotions_'):
            post_chat_json = os.path.join(root, filename)

    print(f"Found files: user_data_csv: {user_data_csv}, raw_scores_csv: {raw_scores_csv}, pre_chat_json: {pre_chat_json}, post_chat_json: {post_chat_json}")

    if user_data_csv and raw_scores_csv and pre_chat_json and post_chat_json:
        try:
            user_data = read_csv_file(user_data_csv)
            raw_scores_data = read_csv_file(raw_scores_csv)
            
            print(f"Number of rows in user_data: {len(user_data)}")
            print(f"Number of rows in raw_scores_data: {len(raw_scores_data)}")
            
            if len(raw_scores_data) == 0:
                print("Error: raw_scores_data is empty")
                return

            with open(pre_chat_json, 'r') as f:
                pre_chat_data = json.load(f)
            with open(post_chat_json, 'r') as f:
                post_chat_data = json.load(f)

            # Get the start time from the filename
            start_time_str = os.path.basename(user_data_csv).split('_')[2] + '_' + os.path.basename(user_data_csv).split('_')[3].replace('.csv', '')

            # Calculate relative times
            pre_chat_relative_time = get_relative_time_ms(start_time_str, pre_chat_data['submission_time'])
            post_chat_relative_time = get_relative_time_ms(start_time_str, post_chat_data['submission_time'])

            print(f"Pre-chat relative time: {pre_chat_relative_time}, Post-chat relative time: {post_chat_relative_time}")

            pre_chat_scores = calculate_average_scores(raw_scores_data, pre_chat_relative_time, 10000)  # 10 second window
            post_chat_scores = calculate_average_scores(raw_scores_data, post_chat_relative_time, 10000)  # 10 second window

            output_header = ['Conv_id', 'time', 'role', 'user_id', 'content'] + [f'Pre_{emotion}' for emotion in EMOTION_NAMES] + [f'Post_{emotion}' for emotion in EMOTION_NAMES]
            output_data = []

            conv_id = ''.join(random.choices(string.ascii_letters + string.digits, k=7))

            for user_row in user_data:
                output_row = [conv_id, user_row['time'], user_row['role'], user_row['user_id'], user_row['content']]
                output_row += [pre_chat_scores[emotion] for emotion in EMOTION_NAMES]
                output_row += [post_chat_scores[emotion] for emotion in EMOTION_NAMES]
                output_data.append(output_row)

            output_csv = os.path.join(root, f'scored_{start_time_str}.csv')
            with open(output_csv, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(output_header)
                writer.writerows(output_data)

            print(f"Processed data saved to {output_csv}")
            print("Pre-chat average scores:")
            for emotion, score in pre_chat_scores.items():
                print(f"{emotion}: {score}")
            print("\nPost-chat average scores:")
            for emotion, score in post_chat_scores.items():
                print(f"{emotion}: {score}")

        except Exception as e:
            print(f"Error processing folder {root}: {str(e)}")
            print(f"Raw scores file content (first 5 rows):")
            try:
                with open(raw_scores_csv, 'r') as f:
                    for i, line in enumerate(f):
                        if i < 5:
                            print(line.strip())
                        else:
                            break
            except Exception as file_error:
                print(f"Error reading raw scores file: {str(file_error)}")

    else:
        print("Required files not found in this folder.")
        print(f"Missing files: user_data_csv: {'Not found' if not user_data_csv else 'Found'}, "
              f"raw_scores_csv: {'Not found' if not raw_scores_csv else 'Found'}, "
              f"pre_chat_json: {'Not found' if not pre_chat_json else 'Found'}, "
              f"post_chat_json: {'Not found' if not post_chat_json else 'Found'}")

# Main execution
input_dir = 'transcript'
for root, dirs, files in os.walk(input_dir):
    process_folder(root)

Processing folder: transcript
Found files: user_data_csv: None, raw_scores_csv: None, pre_chat_json: None, post_chat_json: None
Required files not found in this folder.
Missing files: user_data_csv: Not found, raw_scores_csv: Not found, pre_chat_json: Not found, post_chat_json: Not found
Processing folder: transcript/20241010_111658
Found files: user_data_csv: transcript/20241010_111658/processed_Emili_20241010_111658.csv, raw_scores_csv: transcript/20241010_111658/processed_Emili_raw_20241010_111658.csv, pre_chat_json: transcript/20241010_111658/pre_chat_emotions_20241010_111658.json, post_chat_json: transcript/20241010_111658/post_chat_emotions_20241010_111658.json
Number of rows in user_data: 15
Number of rows in raw_scores_data: 897
Pre-chat relative time: 0, Post-chat relative time: 73000
Processed data saved to transcript/20241010_111658/scored_20241010_111658.csv
Pre-chat average scores:
Anger: 9.08
Disgust: 0.0
Fear: 10.5
Happiness: 16.62
Sadness: 7.99
Surprise: 3.74
Neutral: 4

## Calculating Temporal Difference and the flag

In [None]:
def process_scored_files_in_directory(directory):
    # Iterate through files in the directory
    for filename in os.listdir(directory):
        if filename.startswith('scored_') and filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {file_path}")
            
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Compute tot_emo_score
            df['tot_emo_score'] = df['Happy'] * 5 + df['Neutral'] * 1 - df['Sad'] * 2 + df['Surprise'] * 1 - df['Anger'] * 2 - df['Fear'] * 2 - df['Disgust'] * 5
            
            # Initialize flag column with NaNs
            df['flag'] = np.nan
            
            # Explicitly cast flag column to boolean
            df['flag'] = df['flag'].astype('object')
            
            # Iterate through each row with role 'assistant'
            for idx, row in df.iterrows():
                if row['role'] == 'assistant':
                    # Search backwards to find the previous row with role 'user'
                    prev_user_idx = idx - 1
                    while prev_user_idx >= 0 and df.iloc[prev_user_idx]['role'] != 'user':
                        prev_user_idx -= 1
                    
                    # Check if a valid previous 'user' row was found
                    if prev_user_idx >= 0 and df.iloc[prev_user_idx]['role'] == 'user':
                        if row['tot_emo_score'] - df.iloc[prev_user_idx]['tot_emo_score'] >= 0:
                            df.at[idx, 'flag'] = True
                        else:
                            df.at[idx, 'flag'] = False
            
            # Extract file timestamp from filename
            timestamp = filename.split('_')[1].split('.')[0]  # Adjust this based on your filename pattern
            
            # Save the modified DataFrame to a new CSV file
            output_filename = f'flagged_{timestamp}.csv'
            output_path = os.path.join(directory, output_filename)
            df.to_csv(output_path, index=False)
            
            print(f"Processed data saved to {output_path}")

# Define the main directory to process
main_directory = 'transcript'

# Iterate through each directory in the main directory
for root, dirs, files in os.walk(main_directory):
    for directory in dirs:
        directory_path = os.path.join(root, directory)
        process_scored_files_in_directory(directory_path)

print("Processing complete for all directories.")


# CSV to JSONL

In [None]:
def process_csv_file(csv_file_path, messages):
    with open(csv_file_path, 'r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        system_message = ""

        for row in csv_reader:
            id = ''

            if row["role"] == "user":
                id = f"user_id: {row['user_id']}. "
            # Process the content without the column name
            content = id + row["content"].replace('\n', ' ').replace('\r', ' ').strip()

            if row["role"] == "system":
                system_message += content + " "
                continue

            if system_message:
                messages.append({"role": "system", "content": system_message.strip()})
                system_message = ""

            message = {
                "role": row["role"],
                "content": content
            }
            if row["role"] == "assistant":
                message["weight"] = 1 if row["flag"].lower() == "true" else 0

            messages.append(message)

        if system_message:
            messages.append({"role": "system", "content": system_message.strip()})


def csv_to_jsonl(input_dir, jsonl_file_path):
    messages = []

    for root, dirs, files in os.walk(input_dir):
        for filename in files:
            if filename.startswith('flagged_') and filename.endswith('.csv') and not filename.endswith('_condensed.csv'):
                csv_file_path = os.path.join(root, filename)
                process_csv_file(csv_file_path, messages)

    with open(jsonl_file_path, 'w') as jsonl_file:
        jsonl_file.write(json.dumps({"messages": messages}) + '\n')


if __name__ == "__main__":
    input_dir = 'test_script'  # Replace with the path to your directory
    jsonl_file_path = 'Dataset_1.jsonl'  # Replace with the desired output JSONL file path
    csv_to_jsonl(input_dir, jsonl_file_path)


## Calculating avg emotion scores (Experiment 1 version)

In [None]:
# Function to read CSV file into a list of dictionaries
def read_csv_file(file_path):
    data = []
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data.append(row)
    return data

# Function to write data to a new CSV file
def write_csv_file(file_path, header, data):
    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for row in data:
            writer.writerow(row)

# Function to generate random alphanumeric ID of given length
def generate_random_id(length):
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Function to calculate average scores for a given time range
def calculate_average_scores(raw_scores, target_time, role, time_window):
    num_emotions = 7
    sum_scores = [0] * num_emotions
    count = 0

    for score_data in raw_scores:
        score_time = int(score_data['time'])

        if role == 'user':
            if target_time - time_window <= score_time <= target_time + time_window:
                for i in range(num_emotions):
                    sum_scores[i] += int(score_data[f'emotion_{i+1}'])
                count += 1
        elif role == 'assistant':
            if target_time <= score_time <= target_time + time_window:
                for i in range(num_emotions):
                    sum_scores[i] += int(score_data[f'emotion_{i+1}'])
                count += 1
    
    if count > 0:
        avg_scores = [round(sum_score / count, 2) for sum_score in sum_scores]
    else:
        avg_scores = [0] * num_emotions
    
    return avg_scores

# Function to process each folder and its files
def process_folder(root):
    print(f"Processing folder: {root}")
    user_data_csv = None
    raw_scores_csv = None
    output_csv = None

    used_ids = set()  # Set to store used conv_ids

    for filename in os.listdir(root):
        if filename.startswith('processed_Emili_') and filename.endswith('.csv') and not filename.endswith('_condensed.csv'):
            if filename.startswith('processed_Emili_raw_'):
                raw_scores_csv = os.path.join(root, filename)
            else:
                user_data_csv = os.path.join(root, filename)

    if user_data_csv and raw_scores_csv:
        timestamp = os.path.basename(user_data_csv).split('_')[2]
        output_csv = os.path.join(root, f'scored_{timestamp}.csv')
        print(f"Found user data CSV: {user_data_csv}")
        print(f"Found raw scores CSV: {raw_scores_csv}")
        user_data = read_csv_file(user_data_csv)
        raw_scores_data = read_csv_file(raw_scores_csv)

        output_header = ['Conv_id','time', 'role','user_id','content','Anger', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
        output_data = []

        # Generate unique conv_id
        conv_id = generate_random_id(7)
        while conv_id in used_ids:
            conv_id = generate_random_id(7)
        used_ids.add(conv_id)

        for user_row in user_data:
            if user_row['role'] == 'user':
                time_window = 5
                target_time = int(user_row['time'])
                avg_scores = calculate_average_scores(raw_scores_data, target_time, user_row['role'], time_window)
                output_row = [conv_id,user_row['time'],user_row['role'], user_row['user_id'], user_row['content']] + avg_scores
                output_data.append(output_row)
            elif user_row['role'] == 'assistant':
                time_window = 10
                target_time = int(user_row['time'])
                avg_scores = calculate_average_scores(raw_scores_data, target_time, user_row['role'], time_window)
                output_row = [conv_id,user_row['time'],user_row['role'], user_row['user_id'], user_row['content']] + avg_scores
                output_data.append(output_row)
            elif user_row['role'] == 'system':
                output_row = [conv_id,user_row['time'],user_row['role'], user_row['user_id'], user_row['content']] + [0] * 7
                output_data.append(output_row)
        write_csv_file(output_csv, output_header, output_data)
        print(f"Processed data saved to {output_csv}")
    else:
        print("Required CSV files not found in this folder.")

# Walk through all directories and process files
input_dir = 'transcript'
for root, dirs, files in os.walk(input_dir):
    process_folder(root)


# Generating the lookup table

In [None]:
import pandas as pd

def load_nrc_vad_lexicon(file_path):
    vad_lexicon = {}
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        for line in file:
            word, valence, arousal, dominance = line.strip().split('\t')
            vad_lexicon[word.lower()] = {
                'V': float(valence),
                'A': float(arousal),
                'D': float(dominance)
            }
    return vad_lexicon

# Load the lexicon
lexicon_path = 'NRC-VAD-Lexicon.txt'  # Make sure this path is correct
nrc_vad_lexicon = load_nrc_vad_lexicon(lexicon_path)

# Example usage
emotions = [
    "Surprised", "Excited", "Angry", "Proud", "Sad", "Annoyed", "Grateful", "Lonely",
    "Afraid", "Terrified", "Guilty", "Impressed", "Disgusted", "Hopeful", "Confident",
    "Furious", "Anxious", "Anticipating", "Joyful", "Nostalgic", "Disappointed",
    "Prepared", "Jealous", "Content", "Devastated", "Embarrassed", "Caring",
    "Sentimental", "Trusting", "Ashamed", "Apprehensive", "Faithful"
]

lookup_table = []
for emotion in emotions:
    emotion_lower = emotion.lower()
    if emotion_lower in nrc_vad_lexicon:
        lookup_table.append({
            'Emotion': emotion,
            'Valence': nrc_vad_lexicon[emotion_lower]['V'],
            'Arousal': nrc_vad_lexicon[emotion_lower]['A'],
            'Dominance': nrc_vad_lexicon[emotion_lower]['D']
        })
    else:
        print(f"Warning: {emotion} not found in the lexicon.")

lookup_df = pd.DataFrame(lookup_table)
print(lookup_df)
lookup_df.to_csv('lookup_table.csv', index=False)


# Creating the regression dataset 

In [13]:
from utils.VAD_approx import load_vad_lexicon
from utils.VAD_approx import load_lexicon_embeddings
from utils.VAD_approx import VAD_with_embeddings

In [18]:
vad_lexicon = load_vad_lexicon("utils/NRC-VAD-Lexicon.txt") 
file_pathlexicon_embeddings = "utils/lexicon_embeddings_sample.json"
# Check if the file exists
if os.path.exists(file_pathlexicon_embeddings):
    print(f"File found at {file_pathlexicon_embeddings}")
else:
    print(f"File not found at {file_pathlexicon_embeddings}")

lexicon_embeddings = load_lexicon_embeddings(file_pathlexicon_embeddings) 
vad_vector = VAD_with_embeddings("Happy", lexicon_embeddings, vad_lexicon, 5)

File found at utils/lexicon_embeddings_sample.json
Looking up word: Happy
Similarities calculated: [0.79061272 0.75077381 0.78861974 0.74839647 0.77166452]
Top-N closest words: ['historic', 'hidden', 'cute', 'fits', 'sunny']


In [21]:
print(type(vad_vector))
print(vad_vector[0])

<class 'numpy.ndarray'>
0.651401029134241


In [6]:
def process_chat_emotions(chat_file:json, prechat_flag: bool ):
    '''
    Function to process the chat emotions and calculate the average VAD scores.

    Args:
        pre_chat_file: Path to the chat survey emotions JSON file.
        prechat_flag: Boolean flag indicating whether pre-chat desired scores are available.
    
    Returns:   
        chat_current_scores: Dictionary containing the average VAD scores for the current chat emotions.
        pre_chat_desired_scores: Dictionary containing the average VAD scores for the pre-chat desired emotions.
        
    '''
    chat_data = json.load(open(chat_file, 'r'))
    chat_emotions = chat_data["current"]
    chat_current_scores = {'Valence':0, 'Arousal':0, 'Dominance':0}
    num_emotions = len(chat_emotions)
    for emotion in chat_emotions:
        VAD_vector = VAD_with_embeddings(emotion, lexicon_embeddings, vad_lexicon, 5)
        chat_current_scores['Valence'] += VAD_vector[0]
        chat_current_scores['Arousal'] += VAD_vector[1]
        chat_current_scores['Dominance'] += VAD_vector[2]  

    chat_current_scores['Valence'] /= num_emotions
    chat_current_scores['Arousal'] /= num_emotions
    chat_current_scores['Dominance'] /= num_emotions
    if prechat_flag:
        pre_chat_desired = chat_data["desired"]
        pre_chat_desired_scores = {'Valence':0, 'Arousal':0, 'Dominance':0}
        for emotion in pre_chat_desired:
            VAD_vector = VAD_with_embeddings(emotion, lexicon_embeddings, vad_lexicon, 5)
            pre_chat_desired_scores['Valence'] += VAD_vector[0]
            pre_chat_desired_scores['Arousal'] += VAD_vector[1]
            pre_chat_desired_scores['Dominance'] += VAD_vector[2]

        pre_chat_desired_scores['Valence'] /= num_emotions
        pre_chat_desired_scores['Arousal'] /= num_emotions
        pre_chat_desired_scores['Dominance'] /= num_emotions

        return chat_current_scores, pre_chat_desired_scores
    
    return chat_current_scores


In [11]:
import os
import pandas as pd
import json
from datetime import datetime

lookup_df = pd.read_csv('lookup_table.csv')

def generate_regression_data(input_dir):
    '''
    Function to generate the regression data for the given input directory.
    '''
    regression_data = pd.DataFrame(columns=['conv_id', 'user_id', 'Pre_Valence', 'Pre_Arousal', 'Pre_Dominance',
                                            'Post_Valence', 'Post_Arousal', 'Post_Dominance', 'Anger', 'Disgust', 
                                            'Fear', 'Happiness', 'Sadness', 'Surprise', 'Neutral', 'Helpfulness', 
                                            'Repetitiveness', 'Intent'])
    
    for root, dirs, files in os.walk(input_dir):
        pre_chat_file = next((os.path.join(root, f) for f in files if f.startswith('pre_chat_') and f.endswith('.json')), None)
        post_chat_file = next((os.path.join(root, f) for f in files if f.startswith('post_chat_') and f.endswith('.json')), None)
        scored_file = next((os.path.join(root, f) for f in files if f.startswith('scored_') and f.endswith('.csv')), None)
        
        eval_file = next((os.path.join(root, f) for f in files if f.startswith('chat_evaluation_') and f.endswith('.json')), None)
        
        if pre_chat_file and post_chat_file and scored_file and eval_file:
            try:
                scored_data = pd.read_csv(scored_file)

                # Ensure columns are numeric
                emotions = ['Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise', 'Neutral']
                for emotion in emotions:
                    scored_data[emotion] = pd.to_numeric(scored_data[emotion], errors='coerce')

                # Process pre-chat and post-chat emotions (custom function not shown here)
                pre_chat_current, pre_chat_desired = process_chat_emotions(pre_chat_file, True)
                post_chat_current = process_chat_emotions(post_chat_file, False)
                
                with open(eval_file, 'r') as f:
                    eval_data = json.load(f)
                
                user_id = scored_data['user_id'].iloc[2] if not pd.isna(scored_data['user_id'].iloc[2]) else scored_data['user_id'].iloc[3]
                
                # Calculate means only for non-zero values, set to 0 if all values are 0
                def calculate_non_zero_mean(column):
                    non_zero_values = scored_data.loc[scored_data[column] != 0, column]
                    return non_zero_values.mean() if not non_zero_values.empty else 0

                new_row = pd.DataFrame({
                    'conv_id': [scored_data['Conv_id'].iloc[0]],
                    'user_id': [user_id],
                    'Pre_Valence': [pre_chat_current['Valence']],
                    'Pre_Arousal': [pre_chat_current['Arousal']],
                    'Pre_Dominance': [pre_chat_current['Dominance']],
                    'Post_Valence': [post_chat_current['Valence']],
                    'Post_Arousal': [post_chat_current['Arousal']],
                    'Post_Dominance': [post_chat_current['Dominance']],
                    'Anger': [calculate_non_zero_mean('Anger')],
                    'Disgust': [calculate_non_zero_mean('Disgust')],
                    'Fear': [calculate_non_zero_mean('Fear')],
                    'Happiness': [calculate_non_zero_mean('Happiness')],
                    'Sadness': [calculate_non_zero_mean('Sadness')],
                    'Surprise': [calculate_non_zero_mean('Surprise')],
                    'Neutral': [calculate_non_zero_mean('Neutral')],
                    'Repetitiveness': [eval_data['Repetitiveness']],
                    'Helpfulness': [eval_data['Helpfulness']],
                    'Intent': [eval_data['Intent']]
                })
                
                regression_data = pd.concat([regression_data, new_row], ignore_index=True)
            except Exception as e:
                print(f"Error processing files in {root}: {str(e)}")
    
    return regression_data

# Usage
regression_data = generate_regression_data('transcript')
regression_data.to_csv('regression_data.csv', index=False)


  regression_data = pd.concat([regression_data, new_row], ignore_index=True)


In [None]:
reg_csv = pd.read_csv('regression_data.csv')
reg_csv.info()