In [1]:
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re
from nltk.util import bigrams

# Fetch the NLTK stop-word corpus that stopwords.words() reads from
# (NLTK reports the package as up to date when it is already on disk)
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/md.rafiulbiswas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load Arabic stop words from list.txt (one entry per line).
# 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops a leading
# byte-order mark, which would otherwise stay attached to the first entry and
# prevent it from ever matching a token.
with open('/content/drive/MyDrive/Facebook Crowdtangle Data/All Zied Download Folder/Renamed by Rafi/Zied Search Term/list.txt', 'r', encoding='utf-8-sig') as stop_word_file:
    stripped_lines = (line.strip() for line in stop_word_file.read().splitlines())
    # Tokens are later produced with str.split(), so they never carry surrounding
    # whitespace; strip the entries to match and skip blank lines.
    arabic_stop_words = {entry for entry in stripped_lines if entry}

# Get English stop words from nltk
english_stop_words = set(stopwords.words('english'))

# Combine Arabic and English stop words
stop_words = arabic_stop_words.union(english_stop_words)

In [None]:
# Clean text: remove URLs, punctuation, digits, and stop words
def clean_text(text, stop_word_set=None):
    """Normalise one message into a space-separated string of lower-case word tokens.

    Args:
        text: Raw message. Any value that is not a ``str`` (for example a NaN
            coming from pandas) yields an empty string.
        stop_word_set: Optional collection of tokens to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The surviving tokens joined by single spaces ("" when nothing survives).
    """
    if not isinstance(text, str):  # Non-string values carry no text to clean
        return ""
    if stop_word_set is None:
        stop_word_set = stop_words

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Replace everything that is not an English or Arabic letter with a SPACE.
    # Deleting such characters outright would fuse neighbouring words
    # ("hello,world" -> "helloworld") and turn "don't" into "dont", which no
    # stop-word list contains; with a space it becomes "don" + "t".
    # The Arabic ranges skip U+0600-U+061F (signs and punctuation such as the
    # Arabic comma and question mark), U+0660-U+066D (digits, percent and
    # separators), U+06D4 (full stop) and U+06F0-U+06F9 (extended digits).
    text = re.sub(
        r'[^a-zA-Z\u0620-\u065F\u066E-\u06D3\u06D5-\u06EF\u06FA-\u06FF\s]',
        ' ',
        text,
    )
    text = text.lower()

    # Remove stop words
    kept_words = [word for word in text.split() if word not in stop_word_set]
    return " ".join(kept_words)

# Function to count bigrams and return the most frequent ones (top 500 by default)
def extract_bigrams(df, top_n=500):
    """Return the most frequent word bigrams found in the 'Message' column.

    Args:
        df: DataFrame with a 'Message' column. A 'Cleaned_Message' column holding
            the output of ``clean_text`` is added to it as a side effect.
        top_n: Number of bigrams to return. The default of 500 keeps the
            previous behaviour; pass a larger value for a longer list.

    Returns:
        A list of ``((word1, word2), count)`` tuples, most frequent first.
    """
    # Apply cleaning function to the 'Message' column
    df['Cleaned_Message'] = df['Message'].apply(clean_text)

    # Count bigrams message by message. Concatenating all messages into one
    # token stream first would also count the pair formed by the last word of
    # one post and the first word of the next, which never occurred in any text.
    bigram_counts = Counter()
    for cleaned_message in df['Cleaned_Message']:
        bigram_counts.update(bigrams(cleaned_message.split()))

    return bigram_counts.most_common(top_n)

In [None]:
# ---- Run configuration ----
# Name of the column holding each post's date (adjust to match your CSV files)
date_column = "Post Created Date"
# Name of the column holding the weighted interaction total
interaction_column = "Total Interactions (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )"

# Top-level folder that contains the CSV files (adjust to your own location)
folder_path = "/content/drive/MyDrive/Facebook Crowdtangle Data/All Zied Download Folder/Renamed by Rafi/Categorized by Maryam/Economy/Currencies"
# Name of the Excel workbook the summary is written to
output_excel_file = "data_summary.xlsx"

# Collects the per-file summary records appended by the processing cells
summary_data = list()

In [None]:
# Process the CSV files located directly in the root directory
for file_name in sorted(os.listdir(folder_path)):  # sorted: os.listdir order is arbitrary
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(folder_path, file_name)
    print(f"Processing file: {file_name}")

    # Bind df before the try block: when read_csv raises, the `del df` at the end
    # of the iteration would otherwise hit an unbound name and abort the whole
    # loop with a NameError instead of moving on to the next file.
    df = None
    try:
        # Read only the columns needed for the summary
        df = pd.read_csv(file_path, usecols=[date_column, interaction_column, 'Message'])

        # Convert the date column to datetime; unparseable values become NaT
        df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
        df = df.dropna(subset=[date_column])  # Drop rows with invalid dates

        # Make the interaction counts numeric: remove commas, coerce anything
        # unparseable to NaN, then count missing values as 0.
        # interaction_column is reused here so the column that is read and the
        # column that is converted can never drift apart.
        df['Total Interactions'] = df[interaction_column].replace(',', '', regex=True)
        df['Total Interactions'] = pd.to_numeric(df['Total Interactions'], errors='coerce')
        df['Total Interactions'] = df['Total Interactions'].fillna(0)
        total_sum = df['Total Interactions'].sum()  # Total interactions in this file

        # Files without any valid row contribute nothing to the summary
        if not df.empty:
            # Date range covered by the file
            min_date = df[date_column].min()
            max_date = df[date_column].max()

            # Post with the highest interaction count
            highest_interaction_row = df.loc[df['Total Interactions'].idxmax()]
            highest_message = highest_interaction_row['Message']
            highest_interactions = highest_interaction_row['Total Interactions']

            # Most frequent bigrams, rendered as one "word1 word2: count" line each
            top_bigrams = extract_bigrams(df)
            bigram_text = "\n".join(f"{' '.join(bigram)}: {count}" for bigram, count in top_bigrams)

            # Add summary data
            summary_data.append({
                "File Name": file_name,
                "Min Date": min_date,
                "Max Date": max_date,
                "Total Interactions": total_sum,
                "Highest Interaction Message": highest_message,
                "Highest Interactions": highest_interactions,
                # NOTE(review): the key says 1000, but extract_bigrams returns 500
                # entries when called without arguments. Key kept unchanged so the
                # output schema stays stable.
                "Top 1000 Bigrams": bigram_text
            })
    except Exception as e:
        print(f"Error processing file {file_name}: {e}")

    # Release memory before the next file is loaded
    del df

In [None]:
# Traverse the folder and its subfolders.
# os.walk yields folder_path itself before descending, so the root-level CSVs are
# covered by this loop as well. Start from an empty list so they are not recorded
# a second time when the root-only cell (or an earlier run of this cell) has
# already appended them.
summary_data = []
for root, dirs, files in os.walk(folder_path):
    dirs.sort()  # in-place sort makes the traversal order deterministic
    for file_name in sorted(files):
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(root, file_name)
        print(f"Processing file: {file_name}")

        # Bind df before the try block: when read_csv raises, the `del df` at the
        # end of the iteration would otherwise hit an unbound name and abort the
        # whole traversal with a NameError instead of moving on to the next file.
        df = None
        try:
            # Read only the columns needed for the summary
            df = pd.read_csv(file_path, usecols=[date_column, interaction_column, 'Message'])

            # Convert the date column to datetime; unparseable values become NaT
            df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
            df = df.dropna(subset=[date_column])  # Drop rows with invalid dates

            # Make the interaction counts numeric: remove commas, coerce anything
            # unparseable to NaN, then count missing values as 0.
            # interaction_column is reused here so the column that is read and
            # the column that is converted can never drift apart.
            df['Total Interactions'] = df[interaction_column].replace(',', '', regex=True)
            df['Total Interactions'] = pd.to_numeric(df['Total Interactions'], errors='coerce')
            df['Total Interactions'] = df['Total Interactions'].fillna(0)
            total_sum = df['Total Interactions'].sum()  # Total interactions in this file

            # Files without any valid row contribute nothing to the summary
            if not df.empty:
                # Date range covered by the file
                min_date = df[date_column].min()
                max_date = df[date_column].max()

                # Post with the highest interaction count
                highest_interaction_row = df.loc[df['Total Interactions'].idxmax()]
                highest_message = highest_interaction_row['Message']
                highest_interactions = highest_interaction_row['Total Interactions']

                # Most frequent bigrams, rendered as one "word1 word2: count" line each
                top_bigrams = extract_bigrams(df)
                bigram_text = "\n".join(f"{' '.join(bigram)}: {count}" for bigram, count in top_bigrams)

                # Add summary data
                summary_data.append({
                    "File Name": file_name,
                    "Min Date": min_date,
                    "Max Date": max_date,
                    "Total Interactions": total_sum,
                    "Highest Interaction Message": highest_message,
                    "Highest Interactions": highest_interactions,
                    # NOTE(review): the key says 1000, but extract_bigrams returns
                    # 500 entries when called without arguments. Key kept
                    # unchanged so the output schema stays stable.
                    "Top 1000 Bigrams": bigram_text
                })
        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

        # Release memory before the next file is loaded
        del df

# Write the summary to an Excel file
summary_df = pd.DataFrame(summary_data)
summary_df.to_excel(output_excel_file, index=False)
print(f"Summary saved to {output_excel_file}")