**Merging Notebooks** \
This Notebook has various applications editing and merging various dataframes into one so that statistical and data analysis can be performed.

In [1]:
# Importing dependencies
import os
import pandas as pd
from collections import Counter


In [None]:
#DROPPING DUPLCIATES 

# Step 1: Remove duplicates from each CSV file

folder_path = './LLM_Predictions/Gemma2/'  # Adjust to your folder path containing the CSV files
all_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]  # List all CSV files in the folder

for file in all_files:
    file_path = os.path.join(folder_path, file)
    
    # Read the CSV into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # Remove duplicates based on the "Content" column
    df_cleaned = df.drop_duplicates(subset=["Content"])
    
    # Save the cleaned DataFrame back to a new CSV file
    cleaned_file_path = os.path.join(folder_path, f"cleaned_{file}")
    df_cleaned.to_csv(cleaned_file_path, index=False)

    print(f"Removed duplicates from {file}. Saved cleaned version as cleaned_{file}.")

In [2]:
#Gathering Majority Votes of the 5 runs by the LLMS over the prompts

# Step 1: Specify the folder containing your CSV files
folder_path = "C:/Users/rickv/OneDrive/Bureaublad/Information Sciences Master/Social Web/LLM_Predictions/Gemma2" # Replace with the path to your folder

# Step 2: Initialize a list to store dataframes
dataframes = []

# Step 3: Loop through each CSV file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):  # Process only CSV files
        file_path = os.path.join(folder_path, file_name)
        print(f"Processing file: {file_name}")
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Check if 'Level_1_category' column exists
        if 'Level_1_Category' in df.columns:
            # Convert `Level_1_category` column to lowercase
            df['Level_1_Category'] = df['Level_1_Category'].str.lower()
            dataframes.append(df)
        else:
            print(f"Warning: 'Level_1_category' not found in {file_name}")

# Step 4: Extract the Level_1_category columns and combine them side-by-side
level_1_columns = [df['Level_1_Category'] for df in dataframes]
combined_df = pd.concat(level_1_columns, axis=1, keys=[f"File_{i+1}" for i in range(len(level_1_columns))])

# Step 5: Calculate the majority vote for each row
majority_vote = combined_df.apply(lambda row: Counter(row).most_common(1)[0][0], axis=1)

# Step 6: Add the Majority_Vote column to the combined DataFrame
combined_df['Majority_Vote'] = majority_vote
print(combined_df.head(10))
# Step 7: Save the final DataFrame to a new CSV file
output_file = "majority_vote_with_original_columns.csv"
combined_df.to_csv(output_file, index=False)


print(f"Output file saved as '{output_file}'")


Processing file: cleaned_Classified_Gemma2_iteration_1.csv
Processing file: cleaned_Classified_Gemma2_iteration_2.csv
Processing file: cleaned_Classified_Gemma2_iteration_3.csv
Processing file: cleaned_Classified_Gemma2_iteration_4.csv
Processing file: cleaned_Classified_Gemma2_iteration_5.csv
      File_1     File_2         File_3      File_4     File_5 Majority_Vote
0   links \n   links \n       links \n    links \n   links \n      links \n
1   links \n   links \n     links \n\n  links \n\n   links \n      links \n
2  midden \n  midden \n      midden \n   midden \n  midden \n     midden \n
3  midden \n  midden \n      midden \n    links \n  midden \n     midden \n
4  midden \n  midden \n  midden \n\n\n   midden \n  midden \n     midden \n
5  midden \n  midden \n      midden \n   midden \n  midden \n     midden \n
6  rechts \n  rechts \n      rechts \n   rechts \n  rechts \n     rechts \n
7   links \n   links \n       links \n    links \n   links \n      links \n
8   links \n   links 

In [26]:


# Step 1: Specify the folder containing your CSV files
folder_path = "C:/Users/rickv/OneDrive/Bureaublad/Information Sciences Master/Social Web/LLM_Predictions/Gemma2"  # Replace with the path to your folder

# Step 2: Initialize an empty DataFrame to hold all `Level_1_Category` columns
combined_df = pd.DataFrame()

# Step 3: Loop through each CSV file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):  # Process only CSV files
        file_path = os.path.join(folder_path, file_name)
        print(f"Processing file: {file_name}")
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Check if 'Level_1_Category' column exists
        if 'Level_1_Category' in df.columns:
            # Convert to lowercase and strip whitespace/newlines
            combined_df[file_name] = df['Level_1_Category'].str.lower().str.strip()
        else:
            print(f"Warning: 'Level_1_Category' not found in {file_name}")

# Step 4: Calculate the majority vote for each row
majority_vote = combined_df.apply(lambda row: Counter(row).most_common(1)[0][0], axis=1)

# Step 5: Add the Majority_Vote column to the combined DataFrame
combined_df['Majority_Vote'] = majority_vote

# Step 6: Save the final DataFrame to a new CSV file
output_file = "majority_vote_with_cleaned_data.csv"
combined_df.to_csv(output_file, index=False)

print(f"Output file saved as '{output_file}'")


Processing file: Classified_Gemma2_iteration_1.csv
Processing file: Classified_Gemma2_iteration_2.csv
Processing file: Classified_Gemma2_iteration_3.csv
Processing file: Classified_Gemma2_iteration_4.csv
Processing file: Classified_Gemma2_iteration_5.csv
Output file saved as 'majority_vote_with_cleaned_data.csv'


In [29]:
# CHECKING FILES
# Load the CSV file
csv_file_path = "Gemma2_predictions.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file_path)
df.tail(10)

Unnamed: 0,Classified_Gemma2_iteration_1.csv,Classified_Gemma2_iteration_2.csv,Classified_Gemma2_iteration_3.csv,Classified_Gemma2_iteration_4.csv,Classified_Gemma2_iteration_5.csv,Majority_Vote
3555,links,links,links,links,links,links
3556,midden,midden \n\n\nlet op: het bericht zelf is niet...,midden,n/b \n\n\nlet op: dit bericht citeert een onde...,midden,midden
3557,links,links,links,links,midden,links
3558,links,links,links,links,links,links
3559,midden,midden,midden,midden,midden,midden
3560,links,midden,links,midden,midden,midden
3561,n/b,rechts,rechts,midden,rechts,rechts
3562,links,links,links,links,links,links
3563,links,links,links,links,links,links
3564,links,links,links,links,links,links


In [43]:

# Step 1: Specify the folder containing your CSV files
folder_path = "C:/Users/rickv/OneDrive/Bureaublad/Information Sciences Master/Social Web/LLM_Predictions/Gemma2"  # Replace with the path to your folder

# Step 2: Initialize an empty DataFrame to hold all `Level_1_Category` columns
combined_df = pd.DataFrame()

# Step 3: Define valid entries
valid_entries = {"links", "rechts", "midden", "n/b"}

# Step 4: Loop through each CSV file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):  # Process only CSV files
        file_path = os.path.join(folder_path, file_name)
        print(f"Processing file: {file_name}")
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Check if 'Level_1_Category' column exists
        if 'Level_1_Category' in df.columns:
            # Convert to lowercase, strip whitespace/newlines, and validate entries
            cleaned_column = df['Level_1_Category'].str.lower().str.strip()
            cleaned_column = cleaned_column.apply(lambda x: x if x in valid_entries else "n/b")
            combined_df[file_name] = cleaned_column
        else:
            print(f"Warning: 'Level_1_Category' not found in {file_name}")

# Step 5: Calculate the majority vote for each row
majority_vote = combined_df.apply(lambda row: Counter(row).most_common(1)[0][0], axis=1)

# Step 6: Add the Majority_Vote column to the combined DataFrame
combined_df['Majority_Vote'] = majority_vote

# Step 7: Save the final DataFrame to a new CSV file
output_file = "Gemma2_predictions.csv"
combined_df.to_csv(output_file, index=False)

print(f"Output file saved as '{output_file}'")


Processing file: cleaned_Classified_Gemma2_iteration_1.csv
Processing file: cleaned_Classified_Gemma2_iteration_2.csv
Processing file: cleaned_Classified_Gemma2_iteration_3.csv
Processing file: cleaned_Classified_Gemma2_iteration_4.csv
Processing file: cleaned_Classified_Gemma2_iteration_5.csv
Output file saved as 'Gemma2_predictions.csv'


In [48]:

df2 = pd.read_csv("all_toots_with_standpoint_V2.csv")

df1 = pd.read_csv("Gemma2_predictions.csv")

df0 = pd.read_csv("Llama_predictions.csv")

df2['Gemma2 prediction'] = df1['Majority_Vote']
df2['Llama prediction'] = df0['Majority_Vote']

print(df2.head(10))
output_file = "All_predictions.csv"
df2.to_csv(output_file, index=False)

               Username                                            Content  \
0            helladeboo  Acuut stoppen van #medicatie omdat dit niet me...   
1          RonjaBiernat  Audio-documentaire: "Geen kleine man""70 tot 9...   
2            ErikJonker  Er zitten veel aannames en verwachtingen in on...   
3            ErikJonker  Het kabinet was zo blij met de overeenstemming...   
4            ErikJonker  De invloed van Maurice de Hond is gezien de ve...   
5             leeralles  Zo werkt gezondheidszorg: de dokter, het zieke...   
6            AlexandraB  “Wat de #gezondheidszorg duur maakt zijn onnod...   
7            helladeboo  De discussie over vaccineren gaat vooral over ...   
8            AlexandraB  Patiënten lopen risico doordat #gezondheidszor...   
9  forumstandaardisatie  SKOS: het termennetwerk voor betere vindbare i...   

             Date/Time DateTime political_standpoint Gemma2 prediction  \
0  2024-11-30 22:11:52      NaN                  NaN             li

In [52]:
# Converting some entries in the answer columns to uniform answers.
# Step 1: Convert the column to lowercase
df2['political_standpoint'] = df2['political_standpoint'].str.lower()

# Step 2: Replace textual 'NaN' (e.g., 'nan', 'nAn') with 'n/b'
df2['political_standpoint'] = df2['political_standpoint'].replace({'nan': 'n/b', 'n/a': 'n/b'})

# Step 3: Replace actual NaN values (None or np.nan) with 'n/b'
df2['political_standpoint'] = df2['political_standpoint'].fillna('n/b')

# Step 4: Replace 'left' -> 'links', 'right' -> 'rechts', 'center' -> 'midden'
df2['political_standpoint'] = df2['political_standpoint'].replace({
    'left': 'links',
    'right': 'rechts',
    'center': 'midden'
})


# Step 4: Display the transformed DataFrame
print(df2.head(10))

               Username                                            Content  \
0            helladeboo  Acuut stoppen van #medicatie omdat dit niet me...   
1          RonjaBiernat  Audio-documentaire: "Geen kleine man""70 tot 9...   
2            ErikJonker  Er zitten veel aannames en verwachtingen in on...   
3            ErikJonker  Het kabinet was zo blij met de overeenstemming...   
4            ErikJonker  De invloed van Maurice de Hond is gezien de ve...   
5             leeralles  Zo werkt gezondheidszorg: de dokter, het zieke...   
6            AlexandraB  “Wat de #gezondheidszorg duur maakt zijn onnod...   
7            helladeboo  De discussie over vaccineren gaat vooral over ...   
8            AlexandraB  Patiënten lopen risico doordat #gezondheidszor...   
9  forumstandaardisatie  SKOS: het termennetwerk voor betere vindbare i...   

             Date/Time DateTime political_standpoint Gemma2 prediction  \
0  2024-11-30 22:11:52      NaN                  n/b             li

In [53]:
output_file = "All_predictions_V2.csv"
df2.to_csv(output_file, index=False)

In [54]:
# Load the CSV file
csv_file_path = "All_predictions_V2.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file_path)

# Save the DataFrame as an Excel file
excel_file_path = "All_predictions_V2.xlsx"  # Replace with the desired Excel file path
df.to_excel(excel_file_path, index=False)  # Set index=False to exclude row numbers in Excel

print(f"CSV file converted to Excel and saved as {excel_file_path}")

CSV file converted to Excel and saved as All_predictions_V2.xlsx


In [57]:
# Majority Vote of the LLMS
# Load the CSV file
csv_file_path = "All_predictions_V2.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_file_path) 

# Step 1: Apply majority vote to each row by considering the three columns
def majority_vote(row):
    # Use Counter to count occurrences and return the most common value
    return Counter(row).most_common(1)[0][0]

# Step 2: Apply the function to the three columns
df['Majority Vote'] = df[['political_standpoint', 'Gemma2 prediction', 'Llama prediction']].apply(majority_vote, axis=1)
print(df.head(10))

output_file = "All_predictions_MajorityVote.csv"
df.to_csv(output_file, index=False)

               Username                                            Content  \
0            helladeboo  Acuut stoppen van #medicatie omdat dit niet me...   
1          RonjaBiernat  Audio-documentaire: "Geen kleine man""70 tot 9...   
2            ErikJonker  Er zitten veel aannames en verwachtingen in on...   
3            ErikJonker  Het kabinet was zo blij met de overeenstemming...   
4            ErikJonker  De invloed van Maurice de Hond is gezien de ve...   
5             leeralles  Zo werkt gezondheidszorg: de dokter, het zieke...   
6            AlexandraB  “Wat de #gezondheidszorg duur maakt zijn onnod...   
7            helladeboo  De discussie over vaccineren gaat vooral over ...   
8            AlexandraB  Patiënten lopen risico doordat #gezondheidszor...   
9  forumstandaardisatie  SKOS: het termennetwerk voor betere vindbare i...   

              DateTime political_standpoint Gemma2 prediction  \
0  2024-11-30 22:11:52                  n/b             links   
1  2024-10-