In [1]:
import os
import pandas as pd

# Root folder that contains the emotion_data/ and transcript_data/ sub-folders.
base_dir = r"C:\Users\HP\Desktop\I'mBesideYou"

# One folder per candidate, named '1' through '10'.
candidate_folders = list(map(str, range(1, 11)))

# Per-candidate CSV locations, each list in candidate order.
emotion_paths, gaze_paths, metadata_paths = (
    [os.path.join(base_dir, 'emotion_data', folder, csv_name) for folder in candidate_folders]
    for csv_name in ('emotion.csv', 'gaze.csv', 'metadata.csv')
)
transcript_paths = [os.path.join(base_dir, 'transcript_data', f'{folder}.csv') for folder in candidate_folders]


Table structures of the provided files (shown for candidate 1)

In [5]:
# Load candidate 1's four tables and show each one's schema.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line after every report.
# NOTE(review): metadata and transcript are read from the Back_up\Original copy
# rather than from I'mBesideYou - presumably because a later cell overwrites
# the working copies; confirm the backup is the intended source.
emotion_df = pd.read_csv(r"C:\Users\HP\Desktop\I'mBesideYou\emotion_data\1\emotion.csv")
emotion_df.info()
gaze_df = pd.read_csv(r"C:\Users\HP\Desktop\I'mBesideYou\emotion_data\1\gaze.csv")
gaze_df.info()
metadata_df = pd.read_csv(r"C:\Users\HP\Desktop\Back_up\Original\emotion_data\1\metadata.csv")
metadata_df.info()
transcript_df = pd.read_csv(r"C:\Users\HP\Desktop\Back_up\Original\transcript_data\1.csv")
transcript_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_id          87 non-null     object 
 1   image_seq         87 non-null     int64  
 2   angry             87 non-null     float64
 3   disgust           87 non-null     float64
 4   fear              87 non-null     float64
 5   happy             87 non-null     float64
 6   sad               87 non-null     float64
 7   surprise          87 non-null     float64
 8   neutral           87 non-null     float64
 9   dominant_emotion  87 non-null     object 
dtypes: float64(7), int64(1), object(2)
memory usage: 6.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movie_id    88 non-null     object 
 1   image_seq   88 non-null     int64  


Remove Columns and Display First Rows

In [2]:
# Columns that are not needed downstream and are stripped from each table.
metadata_columns_to_remove = ['movie_id', 'participant_id', 'upload_time', 'distance']
transcript_columns_to_remove = ['seek', 'tokens', 'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob']

# Stripped tables, one entry per candidate, in candidate_folders order.
metadata_dfs = []
transcript_dfs = []

# Loop through candidate folders
for folder in candidate_folders:
    metadata_path = os.path.join(base_dir, 'emotion_data', folder, 'metadata.csv')
    transcript_path = os.path.join(base_dir, 'transcript_data', f'{folder}.csv')

    # errors='ignore' keeps this cell re-runnable: a later cell writes the
    # stripped tables back to these same paths, after which the columns are
    # already gone and a plain drop() would raise KeyError.
    metadata_df = pd.read_csv(metadata_path).drop(
        columns=metadata_columns_to_remove, errors='ignore'
    )
    transcript_df = pd.read_csv(transcript_path).drop(
        columns=transcript_columns_to_remove, errors='ignore'
    )

    # Append modified DataFrames to the lists
    metadata_dfs.append(metadata_df)
    transcript_dfs.append(transcript_df)

    # Display the first two rows
    print(f"Metadata.csv - Candidate {folder}:\n", metadata_df.head(2))
    print(f"Transcript Data - Candidate {folder}:\n", transcript_df.head(2))


Metadata.csv - Candidate 1:
    image_seq  elapsed_time
0          6           7.0
1          7           8.0
Transcript Data - Candidate 1:
    id  start   end                                               text  \
0   0   0.00  5.56   Hello, I am Jeffrey Shepherd and I am current...   
1   1   5.56  9.60   IIM Coikode. I have completed my B.Tech in Bi...   

   positive  negative   neutral  confident  hesitant   concise  enthusiastic  \
0  0.580265  0.152281  0.267454   0.846701  0.845698  0.635805      0.647783   
1  0.550327  0.189263  0.260410   0.679283  0.733701  0.544145      0.417390   

   speech_speed  
0      2.517986  
1      3.217822  
Metadata.csv - Candidate 2:
    image_seq  elapsed_time
0          0           1.0
1          1           2.0
Transcript Data - Candidate 2:
    id  start    end                                               text  \
0   0   0.00   4.32   Hello, I am Beside You. I am Cameron Barajas ...   
1   1   4.32  10.00   today. I recently completed my 

Handling Missing Values

In [3]:
# Zero-filled emotion and gaze tables, one entry per candidate.
emotion_dfs = []
gaze_dfs = []

for folder in candidate_folders:
    candidate_dir = os.path.join(base_dir, 'emotion_data', folder)

    # Load each CSV and replace every missing cell with 0 in one step.
    emotion_filled = pd.read_csv(os.path.join(candidate_dir, 'emotion.csv')).fillna(0)
    gaze_filled = pd.read_csv(os.path.join(candidate_dir, 'gaze.csv')).fillna(0)

    emotion_dfs.append(emotion_filled)
    gaze_dfs.append(gaze_filled)

# Confirmation shown once every candidate has been processed.
print("Missing values handled in emotion.csv and gaze.csv.")


Missing values handled in emotion.csv and gaze.csv.


Removing Duplicate Rows

In [4]:
# De-duplicated copies of the zero-filled tables, one entry per candidate.
# drop_duplicates() only removes rows that are identical in every column.
emotion_dfs_no_duplicates = [
    emotion_dfs[position].drop_duplicates() for position in range(len(candidate_folders))
]
gaze_dfs_no_duplicates = [
    gaze_dfs[position].drop_duplicates() for position in range(len(candidate_folders))
]

# Confirmation shown once every candidate has been processed.
print("Duplicate rows removed from emotion.csv and gaze.csv.")


Duplicate rows removed from emotion.csv and gaze.csv.


Description and Information of all the tables

In [5]:
# Function to display describe() and info() for a DataFrame
def describe_info(dataframe, name):
    """Print summary statistics and the column/dtype report for one table.

    Args:
        dataframe: The pandas DataFrame to summarise.
        name: Label used in the two printed headings.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"Summary Statistics for {name}:\n", dataframe.describe())
    # DataFrame.info() prints its own report and returns None. Passing it to
    # print() would emit the report before the heading and then print "None",
    # so the heading is printed first and info() is called on its own.
    print(f"Info for {name}:")
    dataframe.info()

# Heading label paired with the per-candidate list that holds that table.
tables_by_label = (
    ("Emotion.csv", emotion_dfs_no_duplicates),
    ("Gaze.csv", gaze_dfs_no_duplicates),
    ("Metadata.csv", metadata_dfs),
    ("Transcript Data", transcript_dfs),
)

# Print describe() and info() for each of the four tables of every candidate.
for position, folder in enumerate(candidate_folders):
    print(f"\nCandidate {folder} Data:")
    for label, frames in tables_by_label:
        describe_info(frames[position], f"{label} (Candidate {folder})")



Candidate 1 Data:
Summary Statistics for Emotion.csv (Candidate 1):
        image_seq      angry       disgust       fear      happy        sad  \
count  87.000000  87.000000  8.700000e+01  87.000000  87.000000  87.000000   
mean   45.390805  14.451059  6.168965e-01  18.382797   5.865318  13.575324   
std    27.587643  18.544205  2.679399e+00  25.073562  11.237819  19.787221   
min     0.000000   0.164384  2.400910e-10   0.079219   0.000005   0.000073   
25%    22.500000   1.867450  8.769180e-05   1.862945   0.143215   1.845405   
50%    44.000000   6.412790  3.443590e-03   6.366870   1.476330   5.578010   
75%    68.500000  18.765500  6.486260e-02  21.010100   5.569355  14.056700   
max    94.000000  71.172500  2.150890e+01  94.981800  66.222300  91.563600   

        surprise    neutral  
count  87.000000  87.000000  
mean    8.744969  38.363648  
std    19.621163  33.468718  
min     0.000008   0.000117  
25%     0.210637   8.035385  
50%     0.970922  28.221400  
75%     6.524355 

In [6]:
# Write every cleaned table back over the CSV it was loaded from.
# WARNING: this overwrites the source files in place.
for position, folder in enumerate(candidate_folders):
    candidate_dir = os.path.join(base_dir, 'emotion_data', folder)

    # (list of per-candidate tables, file that table is written to)
    destinations = (
        (emotion_dfs_no_duplicates, os.path.join(candidate_dir, 'emotion.csv')),
        (gaze_dfs_no_duplicates, os.path.join(candidate_dir, 'gaze.csv')),
        (metadata_dfs, os.path.join(candidate_dir, 'metadata.csv')),
        (transcript_dfs, os.path.join(base_dir, 'transcript_data', f'{folder}.csv')),
    )
    for frames, destination in destinations:
        frames[position].to_csv(destination, index=False)

print("Modified tables replaced the original tables.")


Modified tables replaced the original tables.


Merge emotion.csv, gaze.csv and metadata.csv to form merged_emotion_i.csv

In [7]:
# Create a new folder for merged emotion data if it doesn't exist.
# The path is derived from base_dir (instead of a second hard-coded absolute
# path) so it always matches os.path.join(base_dir, 'merged_emotion', ...),
# which is where the merged files are read back from later in the notebook.
merged_emotion_dir = os.path.join(base_dir, 'merged_emotion')
os.makedirs(merged_emotion_dir, exist_ok=True)

# Merged tables, one entry per candidate, in candidate_folders order.
merged_emotion_dfs = []

# Loop through candidate folders
for position, folder in enumerate(candidate_folders):
    gaze_path = os.path.join(base_dir, 'emotion_data', folder, 'gaze.csv')

    # gaze.csv carries its own "movie_id"; drop it so the merge below does not
    # produce duplicated movie_id columns.
    gaze_df = pd.read_csv(gaze_path).drop(columns=['movie_id'])

    # The emotion and metadata tables come from the in-memory lists; they are
    # indexed by list position rather than by parsing the folder name.
    emotion_df = emotion_dfs_no_duplicates[position]
    metadata_df = metadata_dfs[position]

    # pd.merge defaults to an inner join, so only image_seq values present in
    # all three tables survive.
    merged_df = pd.merge(emotion_df, gaze_df, on='image_seq')
    merged_df = pd.merge(merged_df, metadata_df, on='image_seq')

    # Save the merged DataFrame as merged_emotion_i.csv
    merged_filename = os.path.join(merged_emotion_dir, f'merged_emotion_{folder}.csv')
    merged_df.to_csv(merged_filename, index=False)

    # Append the merged DataFrame to the list
    merged_emotion_dfs.append(merged_df)

    # Display the first two rows of the merged DataFrame
    print(f"Merged Emotion Data - Candidate {folder}:\n", merged_df.head(2))


Merged Emotion Data - Candidate 1:
                                movie_id  image_seq     angry   disgust  \
0  93663f94-bf0a-4ce8-a29a-a5236cc7fe6a          6   6.41279  0.000239   
1  93663f94-bf0a-4ce8-a29a-a5236cc7fe6a          7  29.81320  1.365940   

       fear     happy       sad  surprise  neutral dominant_emotion  gaze  \
0   4.53791  0.134349   3.56569  0.555717  84.7933          neutral     0   
1  31.50510  5.555130  11.35700  2.189640  18.2140             fear     1   

   blink  eye_offset  elapsed_time  
0      0     26.8643           7.0  
1      0      1.9027           8.0  
Merged Emotion Data - Candidate 2:
                                movie_id  image_seq      angry   disgust  \
0  baa26895-85b2-465b-a972-649b41d9870e          1   0.179621  0.000185   
1  baa26895-85b2-465b-a972-649b41d9870e          2  10.126300  0.087004   

       fear    happy       sad   surprise   neutral dominant_emotion  gaze  \
0  0.055258  93.5664   6.18999   0.001184  0.007402       

Consolidating time intervals and eliminating gaps in the transcript_data tables 

In [8]:
# Gap-free transcript tables, one entry per candidate, in candidate order.
transcript_dfs_updated = []

for folder in candidate_folders:
    transcript_df = pd.read_csv(os.path.join(base_dir, 'transcript_data', f'{folder}.csv'))

    # Remembers whether any segment boundary of this transcript was moved.
    modified = False

    # Walk over each pair of neighbouring segments (previous, current).
    for current in range(1, len(transcript_df)):
        previous = current - 1
        prev_end = transcript_df.loc[previous, 'end']
        current_start = transcript_df.loc[current, 'start']

        # Segments that already touch need no change.
        if prev_end == current_start:
            continue

        # Otherwise move both sides of the boundary to the midpoint between them.
        midpoint = (prev_end + current_start) / 2.0
        transcript_df.loc[previous, 'end'] = midpoint
        transcript_df.loc[current, 'start'] = midpoint
        modified = True

    transcript_dfs_updated.append(transcript_df)

    # Report what happened for this candidate.
    if not modified:
        print(f"No modifications needed for Transcript Data - Candidate {folder}.")
    else:
        print(f"Modified Transcript Data - Candidate {folder}:\n", transcript_df.head(2))

# Write all tables only after every candidate has been processed.
for position, folder in enumerate(candidate_folders):
    transcript_dfs_updated[position].to_csv(
        os.path.join(base_dir, 'transcript_data', f'{folder}_updated.csv'), index=False
    )

print("Updated transcript data saved.")


No modifications needed for Transcript Data - Candidate 1.
Modified Transcript Data - Candidate 2:
    id  start    end                                               text  \
0   0   0.00   4.32   Hello, I am Beside You. I am Cameron Barajas ...   
1   1   4.32  10.00   today. I recently completed my BBA in 2022. I...   

   positive  negative   neutral  confident  hesitant   concise  enthusiastic  \
0  0.909206  0.015431  0.075362   0.976302  0.020649  0.849303      0.998064   
1  0.660675  0.052640  0.286685   0.968629  0.741091  0.649625      0.701379   

   speech_speed  
0      4.166667  
1      2.992958  
No modifications needed for Transcript Data - Candidate 3.
No modifications needed for Transcript Data - Candidate 4.
No modifications needed for Transcript Data - Candidate 5.
Modified Transcript Data - Candidate 6:
    id  start   end                                               text  \
0   0   0.00  5.28   Hi, my name is Nathan Lewis. I'm a first year...   
1   1   5.28  9.40

In [9]:
# Remove each candidate's original transcript file from transcript_data.
# WARNING: destructive - cells that read these files cannot be re-run afterwards.
transcript_dir = os.path.join(base_dir, 'transcript_data')
for i in range(1, 11):
    original_transcript_path = os.path.join(transcript_dir, f'{i}.csv')

    # Nothing to delete: report it and move on to the next candidate.
    if not os.path.exists(original_transcript_path):
        print(f"Original transcript file for Candidate {i} not found.")
        continue

    os.remove(original_transcript_path)
    print(f"Deleted original transcript file for Candidate {i}.")


Deleted original transcript file for Candidate 1.
Deleted original transcript file for Candidate 2.
Deleted original transcript file for Candidate 3.
Deleted original transcript file for Candidate 4.
Deleted original transcript file for Candidate 5.
Deleted original transcript file for Candidate 6.
Deleted original transcript file for Candidate 7.
Deleted original transcript file for Candidate 8.
Deleted original transcript file for Candidate 9.
Deleted original transcript file for Candidate 10.


In [10]:
# Function to round off the last row's "end" column to the upper integer value
def round_last_end_to_integer(dataframe):
    """Raise the final row's "end" value to the next whole number, in place.

    The previous implementation used int(value + 0.5), which rounds to the
    nearest integer and therefore lowered any value whose fractional part was
    below 0.5, contradicting the documented "upper integer" behaviour.

    Args:
        dataframe: DataFrame with an "end" column. It is modified in place.
            An empty frame is left untouched.

    Returns:
        None.
    """
    import math

    if dataframe.empty:
        return

    # Positional access so the last row is found whatever the index labels are.
    end_position = dataframe.columns.get_loc('end')
    dataframe.iloc[-1, end_position] = math.ceil(dataframe.iloc[-1, end_position])

# Adjust the final segment's "end" in every "<n>_updated.csv" and save it back.
for folder in candidate_folders:
    updated_transcript_path = os.path.join(base_dir, 'transcript_data', f'{folder}_updated.csv')

    # Skip candidates whose updated transcript has not been written.
    if not os.path.exists(updated_transcript_path):
        print(f"Updated transcript file for Candidate {folder} not found.")
        continue

    updated_transcript_df = pd.read_csv(updated_transcript_path)

    # The helper edits the frame in place; the file is then overwritten.
    round_last_end_to_integer(updated_transcript_df)
    updated_transcript_df.to_csv(updated_transcript_path, index=False)
    print(f"Modified transcript file for Candidate {folder} saved and last 'end' value rounded off.")

print("Rounding off and replacing completed.")


Modified transcript file for Candidate 1 saved and last 'end' value rounded off.
Modified transcript file for Candidate 2 saved and last 'end' value rounded off.
Modified transcript file for Candidate 3 saved and last 'end' value rounded off.
Modified transcript file for Candidate 4 saved and last 'end' value rounded off.
Modified transcript file for Candidate 5 saved and last 'end' value rounded off.
Modified transcript file for Candidate 6 saved and last 'end' value rounded off.
Modified transcript file for Candidate 7 saved and last 'end' value rounded off.
Modified transcript file for Candidate 8 saved and last 'end' value rounded off.
Modified transcript file for Candidate 9 saved and last 'end' value rounded off.
Modified transcript file for Candidate 10 saved and last 'end' value rounded off.
Rounding off and replacing completed.


Merge the merged_emotion tables with the transcript_data tables

In [2]:
# Create a new folder for overall data if it doesn't exist
overall_data_dir = os.path.join(base_dir, 'overall_data')
os.makedirs(overall_data_dir, exist_ok=True)

# Column layout of every overall_data_<n>.csv written by this cell.
column_order = ['movie_id', 'start', 'end', 'angry', 'disgust', 'fear', 'happy', 'sad',
                'surprise', 'neutral', 'gaze', 'blink', 'eye_offset']

# Loop through candidate folders
for folder in candidate_folders:
    merged_emotion_path = os.path.join(base_dir, 'merged_emotion', f'merged_emotion_{folder}.csv')
    updated_transcript_path = os.path.join(base_dir, 'transcript_data', f'{folder}_updated.csv')

    if not (os.path.exists(merged_emotion_path) and os.path.exists(updated_transcript_path)):
        print(f"Files not found for Candidate {folder}. Skipping.")
        continue

    merged_emotion_df = pd.read_csv(merged_emotion_path)
    updated_transcript_df = pd.read_csv(updated_transcript_path)

    elapsed_time = merged_emotion_df['elapsed_time']
    last_segment = len(updated_transcript_df) - 1

    # One averaged row per transcript segment that contains at least one frame.
    merged_rows = []
    segment_bounds = zip(updated_transcript_df['start'], updated_transcript_df['end'])
    for position, (start_time, end_time) in enumerate(segment_bounds):
        # The "<n>_updated.csv" transcripts are written so that each segment
        # ends exactly where the next one starts. Using a closed interval on
        # both sides would therefore count a frame that sits on a boundary in
        # two segments. Segments are treated as half-open [start, end); only
        # the final segment also includes its end.
        if position == last_segment:
            in_segment = (elapsed_time >= start_time) & (elapsed_time <= end_time)
        else:
            in_segment = (elapsed_time >= start_time) & (elapsed_time < end_time)

        filtered_rows = merged_emotion_df[in_segment]

        # Segments without any frame are skipped here.
        if not filtered_rows.empty:
            avg_row = filtered_rows.mean(numeric_only=True)
            avg_row['start'] = start_time
            avg_row['end'] = end_time
            merged_rows.append(avg_row)

    # Without any averaged row the frame below would have no columns and the
    # drop()/reorder steps would raise KeyError, so report and move on.
    if not merged_rows:
        print(f"No emotion frames matched any transcript segment for Candidate {folder}. Skipping.")
        continue

    # Create the merged DataFrame for the candidate
    candidate_merged_df = pd.DataFrame(merged_rows)

    # Set 'movie_id' to the value from the first row of merged_emotion_i.csv
    candidate_merged_df['movie_id'] = merged_emotion_df['movie_id'].iloc[0]

    # Averages of 'image_seq' and 'elapsed_time' are not kept.
    candidate_merged_df = candidate_merged_df.drop(columns=['image_seq', 'elapsed_time'])
    candidate_merged_df = candidate_merged_df[column_order]

    # Save the merged DataFrame as overall_data_i.csv
    overall_data_path = os.path.join(overall_data_dir, f'overall_data_{folder}.csv')
    candidate_merged_df.to_csv(overall_data_path, index=False)

    print(f"Merged data saved for Candidate {folder} in overall_data_{folder}.csv.")

print("Merging and saving completed.")


Merged data saved for Candidate 1 in overall_data_1.csv.
Merged data saved for Candidate 2 in overall_data_2.csv.
Merged data saved for Candidate 3 in overall_data_3.csv.
Merged data saved for Candidate 4 in overall_data_4.csv.
Merged data saved for Candidate 5 in overall_data_5.csv.
Merged data saved for Candidate 6 in overall_data_6.csv.
Merged data saved for Candidate 7 in overall_data_7.csv.
Merged data saved for Candidate 8 in overall_data_8.csv.
Merged data saved for Candidate 9 in overall_data_9.csv.
Merged data saved for Candidate 10 in overall_data_10.csv.
Merging and saving completed.


"dominant_emotion" and "subsequent_emotion" columns are added

In [3]:
# List of emotion columns
emotion_columns = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']

# Loop through candidate folders
for folder in candidate_folders:
    overall_data_path = os.path.join(overall_data_dir, f'overall_data_{folder}.csv')

    if not os.path.exists(overall_data_path):
        print(f"File not found for Candidate {folder}. Skipping.")
        continue

    overall_data_df = pd.read_csv(overall_data_path)

    # Highest- and second-highest-scoring emotion for each row.
    dominant_emotions = []
    subsequent_emotions = []

    for _, scores in overall_data_df[emotion_columns].iterrows():
        # A row with a missing score cannot be ranked; leave both labels empty.
        if scores.isna().any():
            dominant_emotions.append(None)
            subsequent_emotions.append(None)
            continue

        # Rank the columns by score, highest first. sorted() is stable, so
        # tied columns keep their emotion_columns order. Taking the first two
        # entries guarantees two different columns. The earlier approach
        # (zero the dominant score, then take the max again) could select the
        # dominant column a second time when the runner-up score was 0.
        ranked = sorted(emotion_columns, key=lambda column: scores[column], reverse=True)
        dominant_emotions.append(ranked[0])
        subsequent_emotions.append(ranked[1])

    # Add "dominant_emotion" and "subsequent_emotion" columns
    overall_data_df['dominant_emotion'] = dominant_emotions
    overall_data_df['subsequent_emotion'] = subsequent_emotions

    # Save the modified DataFrame
    overall_data_df.to_csv(overall_data_path, index=False)

    print(f"Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate {folder}.")

print("Adding columns completed.")


Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 1.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 2.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 3.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 4.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 5.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 6.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 7.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 8.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 9.
Added 'dominant_emotion' and 'subsequent_emotion' columns for Candidate 10.
Adding columns completed.


In [4]:
import pandas as pd
import os

# Define the base directory path
base_dir = r"C:\Users\HP\Desktop\I'mBesideYou"

# List of candidate folders (assuming they are named '1', '2', ..., '10')
candidate_folders = [str(i) for i in range(1, 11)]

# Create a new folder for overall data if it doesn't exist
overall_data_dir = os.path.join(base_dir, 'overall_data')
os.makedirs(overall_data_dir, exist_ok=True)

# Loop through candidate folders
for folder in candidate_folders:
    overall_data_path = os.path.join(overall_data_dir, f'overall_data_{folder}.csv')
    updated_transcript_path = os.path.join(base_dir, 'transcript_data', f'{folder}_updated.csv')

    if not (os.path.exists(overall_data_path) and os.path.exists(updated_transcript_path)):
        print(f"Files not found for Candidate {folder}. Skipping.")
        continue

    overall_data_df = pd.read_csv(overall_data_path)
    updated_transcript_df = pd.read_csv(updated_transcript_path)

    # Outer merge on "start": rows present in both tables are joined, and a
    # transcript segment with no row in overall_data still gets a row whose
    # overall_data columns are empty. This replaces the hand-built all-None
    # placeholder rows that were concatenated before a left merge.
    # Columns that exist in both tables get the default _x (overall_data) and
    # _y (transcript) suffixes.
    merged_df = pd.merge(overall_data_df, updated_transcript_df, on='start', how='outer')

    # Sort rows in ascending order of "start" values
    merged_df = merged_df.sort_values(by='start', ascending=True)

    # Remove the "id" column
    merged_df = merged_df.drop(columns=['id'])

    # Keep the transcript's "end" for every row. It is taken from the merged
    # frame's own end_y column: assigning updated_transcript_df['end'] directly
    # aligns on index labels, which no longer correspond to the same segments
    # after the merge and sort, so values landed on the wrong rows.
    merged_df['end'] = merged_df['end_y']

    # Save the merged DataFrame to the overall_data_i.csv file
    merged_df.to_csv(overall_data_path, index=False)

    print(f"Merged data saved and updated for Candidate {folder} in overall_data_{folder}.csv.")

print("Merging and updating completed.")


Merged data saved and updated for Candidate 1 in overall_data_1.csv.
Merged data saved and updated for Candidate 2 in overall_data_2.csv.
Merged data saved and updated for Candidate 3 in overall_data_3.csv.
Merged data saved and updated for Candidate 4 in overall_data_4.csv.
Merged data saved and updated for Candidate 5 in overall_data_5.csv.
Merged data saved and updated for Candidate 6 in overall_data_6.csv.
Merged data saved and updated for Candidate 7 in overall_data_7.csv.
Merged data saved and updated for Candidate 8 in overall_data_8.csv.
Merged data saved and updated for Candidate 9 in overall_data_9.csv.
Merged data saved and updated for Candidate 10 in overall_data_10.csv.
Merging and updating completed.


In [5]:
import pandas as pd
import os

# Define the base directory path
base_dir = r"C:\Users\HP\Desktop\I'mBesideYou"

# List of candidate folders (assuming they are named '1', '2', ..., '10')
candidate_folders = [str(i) for i in range(1, 11)]

# This cell re-declares its configuration, so the overall_data folder is
# defined here too instead of relying on a variable left over from an
# earlier cell.
overall_data_dir = os.path.join(base_dir, 'overall_data')

# Loop through candidate folders
for folder in candidate_folders:
    overall_data_path = os.path.join(overall_data_dir, f'overall_data_{folder}.csv')

    if not os.path.exists(overall_data_path):
        print(f"File not found for Candidate {folder}. Skipping.")
        continue

    overall_data_df = pd.read_csv(overall_data_path)

    # Replace values in "end_x" column with "end_y" column values, then drop
    # "end_y".
    overall_data_df['end_x'] = overall_data_df['end_y']
    overall_data_df = overall_data_df.drop(columns=['end_y'])

    # Fill missing "movie_id" cells with the first id found in the file.
    # If the file contains no id at all the column is left unchanged
    # (indexing the empty result would raise IndexError).
    known_movie_ids = overall_data_df['movie_id'].dropna()
    if not known_movie_ids.empty:
        overall_data_df['movie_id'] = overall_data_df['movie_id'].fillna(known_movie_ids.iloc[0])

    # Save the updated DataFrame back to the overall_data_i.csv file
    overall_data_df.to_csv(overall_data_path, index=False)

    print(f"Updated overall_data_{folder}.csv for Candidate {folder}.")

print("Updating completed.")


Updated overall_data_1.csv for Candidate 1.
Updated overall_data_2.csv for Candidate 2.
Updated overall_data_3.csv for Candidate 3.
Updated overall_data_4.csv for Candidate 4.
Updated overall_data_5.csv for Candidate 5.
Updated overall_data_6.csv for Candidate 6.
Updated overall_data_7.csv for Candidate 7.
Updated overall_data_8.csv for Candidate 8.
Updated overall_data_9.csv for Candidate 9.
Updated overall_data_10.csv for Candidate 10.
Updating completed.


In [6]:
import pandas as pd
import os

# Define the base directory path
base_dir = r"C:\Users\HP\Desktop\I'mBesideYou"

# List of candidate folders (assuming they are named '1', '2', ..., '10')
candidate_folders = [str(i) for i in range(1, 11)]

# This cell re-declares its configuration, so the overall_data folder is
# defined here too instead of relying on a variable left over from an
# earlier cell.
overall_data_dir = os.path.join(base_dir, 'overall_data')

# Final names for the columns that still carry merge suffixes.
final_column_names = {
    'end_x': 'end',
    'neutral_x': 'neutral_emotion',
    'neutral_y': 'neutral_sentiment',
}

# Loop through candidate folders
for folder in candidate_folders:
    overall_data_path = os.path.join(overall_data_dir, f'overall_data_{folder}.csv')

    if not os.path.exists(overall_data_path):
        print(f"File not found for Candidate {folder}. Skipping.")
        continue

    overall_data_df = pd.read_csv(overall_data_path)

    # Drop the existing "end" column first so that "end_x" can take its name,
    # then apply all renames in a single call.
    overall_data_df = overall_data_df.drop(columns=['end']).rename(columns=final_column_names)

    # Save the updated DataFrame back to the overall_data_i.csv file
    overall_data_df.to_csv(overall_data_path, index=False)

    print(f"Updated overall_data_{folder}.csv for Candidate {folder}.")

print("Updating completed.")


Updated overall_data_1.csv for Candidate 1.
Updated overall_data_2.csv for Candidate 2.
Updated overall_data_3.csv for Candidate 3.
Updated overall_data_4.csv for Candidate 4.
Updated overall_data_5.csv for Candidate 5.
Updated overall_data_6.csv for Candidate 6.
Updated overall_data_7.csv for Candidate 7.
Updated overall_data_8.csv for Candidate 8.
Updated overall_data_9.csv for Candidate 9.
Updated overall_data_10.csv for Candidate 10.
Updating completed.


Information and Description of overall_data tables

In [7]:
# Define the file paths
merged_emotion_1_path = r"C:\Users\HP\Desktop\I'mBesideYou\merged_emotion\merged_emotion_1.csv"
updated_transcript_1_path = r"C:\Users\HP\Desktop\I'mBesideYou\transcript_data\1_updated.csv"
overall_data_1_path = r"C:\Users\HP\Desktop\I'mBesideYou\overall_data\overall_data_1.csv"

# Load the datasets
merged_emotion_1_df = pd.read_csv(merged_emotion_1_path)
updated_transcript_1_df = pd.read_csv(updated_transcript_1_path)
overall_data_1_df = pd.read_csv(overall_data_1_path)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly below; print(df.info()) would add a stray "None" after each report.

# Display table information for merged_emotion_1.csv
print("Table Information for merged_emotion_1.csv:")
merged_emotion_1_df.info()
print("\nSummary Statistics for merged_emotion_1.csv:")
print(merged_emotion_1_df.describe())

# Display table information for 1_updated.csv
print("\nTable Information for 1_updated.csv:")
updated_transcript_1_df.info()
print("\nSummary Statistics for 1_updated.csv:")
print(updated_transcript_1_df.describe())

# Display table information for overall_data_1.csv
print("\nTable Information for overall_data_1.csv:")
overall_data_1_df.info()
print("\nSummary Statistics for overall_data_1.csv:")
print(overall_data_1_df.describe())


Table Information for merged_emotion_1.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   movie_id          27 non-null     object 
 1   image_seq         27 non-null     int64  
 2   angry             27 non-null     float64
 3   disgust           27 non-null     float64
 4   fear              27 non-null     float64
 5   happy             27 non-null     float64
 6   sad               27 non-null     float64
 7   surprise          27 non-null     float64
 8   neutral           27 non-null     float64
 9   dominant_emotion  27 non-null     object 
 10  gaze              27 non-null     int64  
 11  blink             27 non-null     int64  
 12  eye_offset        27 non-null     float64
 13  elapsed_time      27 non-null     float64
dtypes: float64(9), int64(3), object(2)
memory usage: 3.1+ KB
None

Summary Statistics for merged_emo