# Purpose of notebook: inspect suspect months

In [2]:
## install
%pip install pandas
%pip install chardet


Note: you may need to restart the kernel to use updated packages.
Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
## import
import os
import pandas as pd
import chardet
import re
from bs4 import BeautifulSoup
from datetime import datetime

## 21-08

### Sources

In [15]:
# specify the file path
file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-21-08.txt"  # Replace 'your_file_name.txt' with the actual file name

# check if the path is a valid file and ends with '.txt'
if os.path.isfile(file_path) and file_path.endswith('.txt'):
    try:
        # read the file into a dataframe, skipping bad lines
        sources_21_08 = pd.read_csv(
            file_path, 
            delimiter='\t', 
            encoding='ISO-8859-1',
            on_bad_lines='skip'  # Skip lines with too many or too few fields
        )
        
        # define and assign column names
        column_names = ["textID", "words", "date", "country", "source", "url", "headline"]
        sources_21_08.columns = column_names
        
        # display the dataframe
        sources_21_08
    except pd.errors.ParserError as e:
        print(f"ParserError in file: {file_path}")
        print(e)
        # log the problematic file
        with open("/work/LauraSørineVoldgaard#8191/problematic_files.log", "a") as log_file:
            log_file.write(f"ParserError in file: {file_path}\n")
    except Exception as e:
        print(f"General error in file: {file_path}. Error: {e}")
else:
    print(f"The specified file does not exist or is not a '.txt' file: {file_path}")


sources_21_08

# Filter the DataFrame for rows where the "country" column is "US"
sources_21_08 = sources_21_08[sources_21_08["country"] == "US"]

# Display the filtered DataFrame
sources_21_08

Unnamed: 0,textID,words,date,country,source,url,headline
0,33086945,735,21-08-01,US,forbes.com,https://www.forbes.com/sites/michaelalpiner/20...,Lackawanna Coal Mine Tour Offers Travelers A R...
1,33086946,1185,21-08-01,US,forbes.com,https://www.forbes.com/sites/splunk/2021/08/01...,Modernizing The Mission: How Data Innovation B...
2,33086959,227,21-08-01,US,npr.org,https://www.npr.org/programs/morning-edition/2...,"Morning Edition for July 29, 2021 : NPR"
3,33086961,506,21-08-01,US,nbcnews.com,https://www.nbcnews.com/politics/congress/sena...,Senate introduces the details of the bipartisa...
4,33086966,1034,21-08-01,US,theringer.com,https://www.theringer.com/nba/2021/7/28/225989...,Could Trading for Buddy Hield Put LeBron and t...
...,...,...,...,...,...,...,...
150007,87855805,233,21-08-31,US,WFMZ-TV,https://www.wfmz.com/sports/hartford-holds-rea...,"Hartford holds Reading scoreless, wins 1-0"
150008,87855806,1789,21-08-31,US,W magazine,https://www.wmagazine.com/culture/christopher-...,Christopher Meloni and Lewis Hamilton Share a ...
150009,87855807,921,21-08-31,US,WMUR9,https://www.wmur.com/article/nh-health-provide...,NH health providers prepare for booster shots ...
150010,87855808,923,21-08-31,US,Yahoo,https://www.yahoo.com/lifestyle/10-delicious-f...,10 Delicious Fall Food Tours Across the U.S.


In [16]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/sus_sources/sources_21_08_susnomore.csv"

# Save the filtered DataFrame as a CSV file
sources_21_08.to_csv(output_file_path, index=False, encoding='utf-8')

### Text

In [17]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-21-08/21-08-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-21-08/21-08-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-21-08/21-08-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-21-08/21-08-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-21-08/21-08-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_21_08 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
print(text_21_08)


          textID                                               body
0       33086700  A duplicate content penalty can devastate your...
1       33086701  Minnesota businesses raised nearly $500 millio...
2       33086911  Just two months after the Vessel, a honeycomb-...
3       33087001  TOKYO ( AP ) -- Marcell Jacobs won the men's O...
4       33087004  On Saturday, a fleet of Jamaican sprinters swe...
...          ...                                                ...
154406  87855788  New analysis of office activity in July shows ...
154407  87855794  Offensive coordinator Zach Grossiduring, right...
154408  87855795  What role do loyalty schemes and personalizati...
154409  87855798  The Twilight actress, 31, will star as Diana i...
154410  87855799  Sponsored content. Us Weekly receives compensa...

[154411 rows x 2 columns]


In [18]:
# Create the "sus_months" dataframe
sus_months = pd.DataFrame({
    "month": ["21-08"],  # Column for the month
    "sources": [len(sources_21_08)],  # Number of rows in sources_21_08
    "texts": [len(text_21_08)]  # Number of rows in text_21_08
})

sus_months

Unnamed: 0,month,sources,texts
0,21-08,150012,154411


## 22-10

### Sources

In [23]:
import pandas as pd
import html
import csv  # Required for quoting constants like QUOTE_NONE

file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-22-10.txt"

# Initialize lists to hold valid and invalid rows
valid_rows = []
invalid_rows = []

# Set maximum row limit
max_rows_to_process = 313875

try:
    # Read the file without chunks, handle quotes and force data types
    data = pd.read_csv(
        file_path,
        delimiter='\t',
        encoding='ISO-8859-1',
        on_bad_lines='warn',
        nrows=max_rows_to_process,
        quoting=csv.QUOTE_NONE,  # Treat all text literally, do not treat quotes as special
        escapechar='\\',  # Escape special characters if needed
        dtype=str,  # Force all columns to be read as strings
        low_memory=False  # Avoid mixed-type warnings
    )
    
    # Fill NaN values with empty strings to avoid issues with non-iterable types
    data.fillna('', inplace=True)

    # Assign column names (adjust based on actual structure if needed)
    data.columns = ["textID", "words", "date", "country", "source", "url", "headline"]

    # Decode HTML entities in text fields
    data["headline"] = data["headline"].apply(html.unescape)
    data["url"] = data["url"].apply(html.unescape)

    # Filter rows into valid and invalid lists
    for index, row in data.iterrows():
        if all(row.notnull()) and len(row) == 7:  # Ensure all fields are non-null and correctly structured
            valid_rows.append(row)
        else:
            invalid_rows.append(row)

except Exception as e:
    print(f"Error processing the file: {e}")

# Create a DataFrame from the valid rows
sources_22_10 = pd.DataFrame(valid_rows, columns=["textID", "words", "date", "country", "source", "url", "headline"])

# Display invalid rows for debugging, if needed
print(f"Invalid rows: {len(invalid_rows)}")
for row in invalid_rows[:5]:  # Display a sample of invalid rows
    print(row)

# Display the final DataFrame
sources_22_10

# Filter the DataFrame for rows where the "country" column is "US"
sources_22_10 = sources_22_10[sources_22_10["country"] == "US"]

# Display the filtered DataFrame
sources_22_10


Invalid rows: 0


Unnamed: 0,textID,words,date,country,source,url,headline
0,89996010,1177,22-10-01,US,The Baltimore Sun,https://www.baltimoresun.com/education/bs-md-j...,Minority students make up a small fraction of ...
1,89996011,923,22-10-01,US,ABC,https://abcnews.go.com/Sports/wireStory/corum-...,"Corum, McCarthy lead No. 4 Michigan past Iowa,..."
2,89996012,981,22-10-01,US,Associated Press,https://apnews.com/2098f5cb7ebb4fcd42bfccd6f5a...,Raleigh's walk-off homer ends Mariners' long p...
3,89996013,534,22-10-01,US,Associated Press,https://apnews.com/49ca6d17cf1c8e0a8eae8f33de5...,"Max Baer, Pennsylvania Supreme Court's chief j..."
4,89996014,602,22-10-01,US,Associated Press,https://apnews.com/9186f70b3fb426c70d4f8948296...,US women's basketball dominates on internation...
...,...,...,...,...,...,...,...
87591,90136835,430,22-10-31,US,Business Insider,https://markets.businessinsider.com/news/stock...,Full Truck Alliance Co. Ltd. Announces Gross T...
87592,90136708,274,22-10-31,US,ABC,https://abcnews.go.com/Business/wireStory/week...,"This Week: Fed meeting, Starbucks earns, jobs ..."
87593,90136710,260,22-10-31,US,ABC,https://abcnews.go.com/US/dead-chattanooga-sho...,"2 dead in Chattanooga shooting, police say"
87594,90136713,421,22-10-31,US,Bleacher Report,https://bleacherreport.com/articles/10054125-p...,Patrick Peterson: Critical Emails from Cardina...


### Issue might be because of an error with this line " 90045730	260	22-10-11	??	golfweek.usatoday	https://golfweek.usatoday.com/lists/affordable-golf-bags-cart-stand-carry-travel/	Best affordable golf bags: Carry your clubs for less than $200 ". But the rest of the sources have ?? as country, so I doubt they would be included anyway. So we'll just use this for for further analysis.

In [24]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/sus_sources/sources_22_10_susnomore.csv"

# Save the filtered DataFrame as a CSV file
sources_22_10.to_csv(output_file_path, index=False, encoding='utf-8')

## Text

In [25]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-10/22-10-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-10/22-10-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-10/22-10-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-10/22-10-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-10/22-10-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_22_10 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
print(text_22_10)


         textID                                               body
0      89996000  Orlando Mayorquin, USA TODAY\nSeptember 30, 20...
1      89996001  There was no way NBA executives could've predi...
2      89996002  Eugene Quaynor, a Ghanan graduate student who ...
3      89996003  The Baltimore Ravens signed defensive tackle M...
4      89996005  The Memphis Grizzlies play against the Milwauk...
...         ...                                                ...
89219  94532880  CHARLESTON, W.Va. ( WV News ) -- The COVID dea...
89220  94532883  CLARKSBURG, W.Va. ( WV News ) -- Nearly 1,800 ...
89221  94532884  SOUTH CHARLESTON, W.Va. ( WV News ) -- Hunting...
89222  94532886  CLARKSBURG, W.Va. ( WV News ) -- Thanksgiving ...
89223  94532888  With West Virginians and all of the nation dea...

[89224 rows x 2 columns]


In [26]:
# Create the "sus_months" dataframe for text_22_10 and sources_22_10
sus_months = pd.DataFrame({
    "month": ["21-08", "22-10"],  # Column for the months
    "sources": [len(sources_21_08), len(sources_22_10)],  # Number of rows in sources dataframes
    "texts": [len(text_21_08), len(text_22_10)]  # Number of rows in text dataframes
})

# Display the sus_months dataframe
print(sus_months)


   month  sources   texts
0  21-08   150012  154411
1  22-10    87596   89224


## 22-05

### Sources

In [27]:
import pandas as pd

file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-22-05.txt"

# Initialize lists to store data
rows = []

try:
    with open(file_path, "r", encoding="ISO-8859-1") as file:
        for line_number, line in enumerate(file, start=1):
            # Split the line using tab as the delimiter
            fields = line.strip().split("\t")
            
            # Check if the line has the correct number of fields
            if len(fields) == 7:
                rows.append(fields)
            else:
                print(f"Malformed line {line_number}: {line[:100]}...")  # Log the malformed line (first 100 chars)

except Exception as e:
    print(f"Error reading the file line-by-line: {e}")

# Convert to a DataFrame
column_names = ["textID", "words", "date", "country", "source", "url", "headline"]
sources_22_05 = pd.DataFrame(rows, columns=column_names)

# Display the DataFrame
sources_22_05

# Filter the DataFrame for rows where the "country" column is "US"
sources_22_05 = sources_22_05[sources_22_05["country"] == "US"]

# Display the filtered DataFrame
sources_22_05


Malformed line 120781: Tettenhall | 15 hours ago
...
Malformed line 126650: 
...
Malformed line 126651: West Bromwich | 7 hours ago
...
Malformed line 134932: South Shropshire | 10 hours ago
...
Malformed line 134946: Toby Neal | 15 hours ago
...
Malformed line 143059: 
...
Malformed line 143060:  Newtown | 6 hours ago
...
Malformed line 146799: Wolverhampton | 7 hours ago
...
Malformed line 155090: Wolverhampton | 5 hours ago
...
Malformed line 157275: 
...
Malformed line 157276: 
...
Malformed line 157277: Wolverhampton | 12 hours ago
...
Malformed line 157294: Telford | 8 hours ago
...
Malformed line 157297: they close gap at the topCricket | 11 hours ago
...
Malformed line 157303: 
...
Malformed line 161662: Football | 15 hours ago
...
Malformed line 163387: Toby Neal | 15 hours ago
...
Malformed line 166914: Staffordshire | 8 hours ago
...
Malformed line 168887: Walsall FC | 12 hours ago
...
Malformed line 168893: Business | 14 hours ago
...
Malformed line 232675: ... 
...


Unnamed: 0,textID,words,date,country,source,url,headline
0,89217290,634,22-05-01,US,HoopsHype,https://hoopshype.com/2022/04/16/x-rays-negati...,"X-rays negative for Scottie Barnes, MRI awaits..."
1,89217291,929,22-05-01,US,Business Insider,https://investorplace.com/2022/05/3-undervalue...,3 Undervalued Stocks to Buy in May 2022
2,89217295,4336,22-05-01,US,YAHOO!News,https://news.yahoo.com/titans-day-3-picks-said...,What Titans' Day 3 picks said after being drafted
3,89217296,1971,22-05-01,US,Seeking Alpha,https://seekingalpha.com/article/4505613-disne...,"Analysis: Disney Could Turn Around, But Invest..."
4,89217297,1456,22-05-01,US,Yahoo! Sports,https://sports.yahoo.com/cavalier-johnson-beco...,Cavalier Johnson becomes first African America...
...,...,...,...,...,...,...,...
85295,91183907,1066,22-05-31,US,nola.com,https://nola.com/news/crime_police/article_76e...,Read the full story
85296,91183908,807,22-05-31,US,nola.com,https://nola.com/entertainment_life/article_3c...,Read the full story
85297,91183909,289,22-05-31,US,nola.com,https://nola.com/news/crime_police/article_1b7...,Read the full story
85298,91183910,193,22-05-31,US,nola.com,https://nola.com/news/northshore/article_c836b...,Read the full story


In [28]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/sus_sources/sources_22_05_susnomore.csv"

# Save the filtered DataFrame as a CSV file
sources_22_05.to_csv(output_file_path, index=False, encoding='utf-8')

### Text

In [29]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-05/22-05-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-05/22-05-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-05/22-05-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-05/22-05-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-05/22-05-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_22_05 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
text_22_05

Unnamed: 0,textID,body
0,89217300,The Quincy Lady Orioles took runner-up honors ...
1,89217301,The lawsuit against Family Dollar was filed on...
2,89217302,ALLIANCE -- The city of Alliance and OhioMeans...
3,89217304,""" How did the Sure Power herbicide you referen..."
4,89217305,"DJs From Mars, musicians from Italy, pose at C..."
...,...,...
88875,91183895,"The Clarksburg Water Board, from left, member ..."
88876,91183896,"GRAFTON, W.Va. ( WV News ) -- The city of Graf..."
88877,91183897,"CLARKSBURG, W.Va. ( WV News ) -- When the Harr..."
88878,91183898,"CLARKSBURG, W.VA. -- Ryan and Sarah Rutt, owne..."


In [30]:
# Create the "sus_months" dataframe for text_22_10 and sources_22_10
sus_months = pd.DataFrame({
    "month": ["21-08", "22-10", "22-05"],  # Column for the months
    "sources": [len(sources_21_08), len(sources_22_10), len(sources_22_05)],  # Number of rows in sources dataframes
    "texts": [len(text_21_08), len(text_22_10), len(text_22_05)]  # Number of rows in text dataframes
})

# Display the sus_months dataframe
print(sus_months)


   month  sources   texts
0  21-08   150012  154411
1  22-10    87596   89224
2  22-05    85300   88880


## 24-05

### Sources

In [31]:
# specify the file path
file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-24-05.txt"  # Replace 'your_file_name.txt' with the actual file name

# check if the path is a valid file and ends with '.txt'
if os.path.isfile(file_path) and file_path.endswith('.txt'):
    try:
        # read the file into a dataframe, skipping bad lines
        sources_24_05 = pd.read_csv(
            file_path, 
            delimiter='\t', 
            encoding='ISO-8859-1',
            on_bad_lines='skip'  # Skip lines with too many or too few fields
        )
        
        # define and assign column names
        column_names = ["textID", "words", "date", "country", "source", "url", "headline"]
        sources_24_05.columns = column_names
        
        # display the dataframe
        sources_24_05
    except pd.errors.ParserError as e:
        print(f"ParserError in file: {file_path}")
        print(e)
        # log the problematic file
        with open("/work/LauraSørineVoldgaard#8191/problematic_files.log", "a") as log_file:
            log_file.write(f"ParserError in file: {file_path}\n")
    except Exception as e:
        print(f"General error in file: {file_path}. Error: {e}")
else:
    print(f"The specified file does not exist or is not a '.txt' file: {file_path}")


sources_24_05

# Filter the DataFrame for rows where the "country" column is "US"
sources_24_05 = sources_24_05[sources_24_05["country"] == "US"]

# Display the filtered DataFrame
sources_24_05

Unnamed: 0,textID,words,date,country,source,url,headline
0,200443171,500,24-05-01,US,gizmodo.com,https://gizmodo.com/dave-busters-bet-wagers-be...,Dave & Buster's Adding Bets to Its App as Amer...
1,200443172,815,24-05-01,US,gizmodo.com,https://gizmodo.com/how-to-track-your-medicati...,How to Track Your Medications with iOS and And...
2,200443173,366,24-05-01,US,markets.businessinsider.com,https://markets.businessinsider.com/news/curre...,"Bitcoin will drop 13% to $50,000 after falling..."
3,200443174,912,24-05-01,US,markets.businessinsider.com,https://markets.businessinsider.com/news/stock...,Leading the Way in Broaching Machinery: Taizho...
4,200443175,261,24-05-01,US,markets.businessinsider.com,https://markets.businessinsider.com/news/stock...,"TransMedics Stock Skyrockets, Analysts Call It..."
...,...,...,...,...,...,...,...
58732,108663272,418,24-05-31,US,nbcchicago.com,https://www.nbcchicago.com/news/local/illinois...,Illinois teen drivers need an exclusive DMV ap...
58733,108663275,272,24-05-31,US,nbcchicago.com,https://www.nbcchicago.com/news/business/money...,Mega backdoor Roth conversions can be a `no br...
58734,108663277,545,24-05-31,US,nbcchicago.com,https://www.nbcchicago.com/news/national-inter...,Democratic Sen. Joe Manchin of West Virginia r...
58735,108663278,248,24-05-31,US,nbcchicago.com,https://www.nbcchicago.com/news/local/recall-a...,"Tesla recalling more than 125,000 vehicles to ..."


### Text

In [32]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-05/24-05-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-05/24-05-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-05/24-05-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-05/24-05-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-05/24-05-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_24_05 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
text_24_05

Unnamed: 0,textID,body
0,107886812,Price chart patterns suggest that the uptrend ...
1,107886818,Starbucks shares plummeted Tuesday evening by ...
2,107886819,Amazon reported a strong first quarter after t...
3,107887000,""" I might make a joke about being a cougar and..."
4,107887001,Artificial Intelligence has designed what the ...
...,...,...
59590,200504588,A New York jury on Thursday evening found Dona...
59591,200504589,"4 minute read\nTIME\nMay 30, 1960 12:00 AM GMT..."
59592,200504591,Angel Studios is planning to appeal an arbitra...
59593,200504597,Macomb County Sheriff Anthony Wickersham said ...


In [33]:
# Create the "sus_months" dataframe for text_22_10 and sources_22_10
sus_months = pd.DataFrame({
    "month": ["21-08", "22-10", "22-05", "24-05"],  # Column for the months
    "sources": [len(sources_21_08), len(sources_22_10), len(sources_22_05), len(sources_24_05)],  # Number of rows in sources dataframes
    "texts": [len(text_21_08), len(text_22_10), len(text_22_05), len(text_24_05)]  # Number of rows in text dataframes
})

# Display the sus_months dataframe
print(sus_months)


   month  sources   texts
0  21-08   150012  154411
1  22-10    87596   89224
2  22-05    85300   88880
3  24-05    58737   59595


## 22-09

### Sources

In [34]:
# specify the file path
file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-22-09.txt"  # Replace 'your_file_name.txt' with the actual file name

# check if the path is a valid file and ends with '.txt'
if os.path.isfile(file_path) and file_path.endswith('.txt'):
    try:
        # read the file into a dataframe, skipping bad lines
        sources_22_09 = pd.read_csv(
            file_path, 
            delimiter='\t', 
            encoding='ISO-8859-1',
            on_bad_lines='skip'  # Skip lines with too many or too few fields
        )
        
        # define and assign column names
        column_names = ["textID", "words", "date", "country", "source", "url", "headline"]
        sources_22_09.columns = column_names
        
        # display the dataframe
        sources_22_09
    except pd.errors.ParserError as e:
        print(f"ParserError in file: {file_path}")
        print(e)
        # log the problematic file
        with open("/work/LauraSørineVoldgaard#8191/problematic_files.log", "a") as log_file:
            log_file.write(f"ParserError in file: {file_path}\n")
    except Exception as e:
        print(f"General error in file: {file_path}. Error: {e}")
else:
    print(f"The specified file does not exist or is not a '.txt' file: {file_path}")



# Filter the DataFrame for rows where the "country" column is "US"
sources_22_09 = sources_22_09[sources_22_09["country"] == "US"]

# Display the filtered DataFrame
sources_22_09

  sources_22_09 = pd.read_csv(


Unnamed: 0,textID,words,date,country,source,url,headline
0,93233590,5839.0,22-09-01,US,nytimes.com,https://www.nytimes.com/interactive/2022/09/01...,Focus GroupThese 12 Teachers Have a Word or Tw...
1,93233624,1705.0,22-09-01,US,washingtonpost.com,https://www.washingtonpost.com/health/2022/09/...,BREAKING NEWS
2,93233626,1981.0,22-09-01,US,washingtonpost.com,https://www.washingtonpost.com/science/2022/09...,"In summer of viruses, new disease outbreaks be..."
3,93233627,1327.0,22-09-01,US,washingtonpost.com,https://www.washingtonpost.com/investigations/...,Ginni Thomas pressed Wisconsin lawmakers to ov...
4,93233628,1601.0,22-09-01,US,washingtonpost.com,https://www.washingtonpost.com/politics/2022/0...,POST POLITICS NOW
...,...,...,...,...,...,...,...
108639,93877604,1277.0,22-09-30,US,wvnews.com,https://wvnews.com/bluegoldnews/huggins-west-v...,Huggins: West Virginia basketball schedule ful...
108640,93877606,419.0,22-09-30,US,wvnews.com,https://wvnews.com/news/wvnews/wvu-fraternity-...,"WVU fraternity cleared of hazing allegation, s..."
108641,93877608,729.0,22-09-30,US,wvnews.com,https://wvnews.com/bluegoldnews/wvu-will-learn...,WVU will learn what it's made of against Robin...
108642,93877612,160.0,22-09-30,US,pbs.org,https://pbs.org/video/brooks-and-tumulty-on-pu...,Brooks and Tumulty on Putin&#x27;s war and Rep...


In [35]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/sus_sources/sources_22_09_susnomore.csv"

# Save the filtered DataFrame as a CSV file
sources_22_09.to_csv(output_file_path, index=False, encoding='utf-8')

### Text

In [36]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-09/22-09-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-09/22-09-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-09/22-09-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-09/22-09-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-09/22-09-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_22_09 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
text_22_09

Unnamed: 0,textID,body
0,89863200,Italian bonds led losses in the region on Thur...
1,89863202,After the' Transparent' actress landed a role ...
2,89863203,Two of the most anticipated Japanese films sho...
3,89863204,"Notably, Amazon boasts of five successful busi..."
4,89863205,"HOUSTON, Sept. 1, 2022 /PRNewswire/ -- McDermo..."
...,...,...
114871,93877580,His employment began in 1981 as a groundskeepe...
114872,93877585,MORGANTOWN ( WV News ) -- An unexpected visito...
114873,93877586,"CHARLESTON, W.Va. ( WV News ) -- Ten more COVI..."
114874,93877587,"MORGANTOWN, W.Va. ( WV News ) -- CONSOL Energy..."


In [37]:
# Create the "sus_months" dataframe for text_22_10 and sources_22_10
sus_months = pd.DataFrame({
    "month": ["21-08", "22-10", "22-05", "24-05", "22-09"],  # Column for the months
    "sources": [len(sources_21_08), len(sources_22_10), len(sources_22_05), len(sources_24_05), len(sources_22_09)],  # Number of rows in sources dataframes
    "texts": [len(text_21_08), len(text_22_10), len(text_22_05), len(text_24_05), len(text_22_09)]  # Number of rows in text dataframes
})

# Display the sus_months dataframe
print(sus_months)


   month  sources   texts
0  21-08   150012  154411
1  22-10    87596   89224
2  22-05    85300   88880
3  24-05    58737   59595
4  22-09   108644  114876


## 22-06

### Sources

In [38]:
# specify the file path
file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-22-06.txt"  # Replace 'your_file_name.txt' with the actual file name

# check if the path is a valid file and ends with '.txt'
if os.path.isfile(file_path) and file_path.endswith('.txt'):
    try:
        # read the file into a dataframe, skipping bad lines
        sources_22_06 = pd.read_csv(
            file_path, 
            delimiter='\t', 
            encoding='ISO-8859-1',
            on_bad_lines='skip'  # Skip lines with too many or too few fields
        )
        
        # define and assign column names
        column_names = ["textID", "words", "date", "country", "source", "url", "headline"]
        sources_22_06.columns = column_names
        
        # display the dataframe
        sources_22_06
    except pd.errors.ParserError as e:
        print(f"ParserError in file: {file_path}")
        print(e)
        # log the problematic file
        with open("/work/LauraSørineVoldgaard#8191/problematic_files.log", "a") as log_file:
            log_file.write(f"ParserError in file: {file_path}\n")
    except Exception as e:
        print(f"General error in file: {file_path}. Error: {e}")
else:
    print(f"The specified file does not exist or is not a '.txt' file: {file_path}")



# Filter the DataFrame for rows where the "country" column is "US"
sources_22_06 = sources_22_06[sources_22_06["country"] == "US"]

# Display the filtered DataFrame
sources_22_06

  sources_22_06 = pd.read_csv(


Unnamed: 0,textID,words,date,country,source,url,headline
0,89403424,727.0,22-06-01,US,Yahoo,https://www.yahoo.com/lifestyle/lego-star-wars...,'LEGO Star Wars: The Skywalker Saga' is loaded...
1,89403429,269.0,22-06-01,US,Axios,https://www.axios.com/2022/06/01/fbi-iran-hack...,FBI head: Agency blocked Iranian cyberattack o...
2,89403430,169.0,22-06-01,US,Axios,https://www.axios.com/local/twin-cities/2022/0...,Jesse Ventura turns to Substack
3,89403434,992.0,22-06-01,US,Boston.com,https://www.boston.com/community/music-club/me...,"Meet solo artist Adi Sun, one the top singer-s..."
4,89403435,350.0,22-06-01,US,Business Insider,https://www.businessinsider.com/pamplona-priva...,A private equity company wants to cut ties wit...
...,...,...,...,...,...,...,...
80233,89566655,3298.0,22-06-30,US,Yahoo,https://www.yahoo.com/lifestyle/11-men-share-e...,11 men share their experiences of abortion aft...
80234,89566656,614.0,22-06-30,US,Yahoo,https://www.yahoo.com/lifestyle/yes-ice-cream-...,"Yes, You Can Have Ice Cream Delivered Straight..."
80235,89566657,287.0,22-06-30,US,Yahoo,https://www.yahoo.com/news/football-rumours-bo...,Borussia Dortmund set Jude Bellingham's price ...
80236,89566570,384.0,22-06-30,US,The Hill,https://thehill.com/changing-america/enrichmen...,New Jersey is losing residents to states like ...


In [39]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/sus_sources/sources_22_06_susnomore.csv"

# Save the filtered DataFrame as a CSV file
sources_22_06.to_csv(output_file_path, index=False, encoding='utf-8')

### Text

In [40]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-06/22-06-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-06/22-06-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-06/22-06-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-06/22-06-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-06/22-06-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_22_06 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
text_22_06

Unnamed: 0,textID,body
0,91204800,Secretary of State Antony Blinken pushed back ...
1,91204802,Medical records indicated the 4-month-old died...
2,91204803,"Two decades later, French cinema still has eno..."
3,91204804,You might have noticed that many of the same R...
4,91204805,An antiabortion advocate prays outside the Sup...
...,...,...
85024,91852197,"Raleigh, N.C. -- State House lawmakers have gi..."
85025,91852198,Top General Assembly leaders said there was no...
85026,91852199,The budget puts aside $1 billion into a new St...
85027,91852280,


In [41]:
# Create the "sus_months" dataframe for text_22_10 and sources_22_10
sus_months = pd.DataFrame({
    "month": ["21-08", "22-10", "22-05", "24-05", "22-09", "22-06"],  # Column for the months
    "sources": [len(sources_21_08), len(sources_22_10), len(sources_22_05), len(sources_24_05), len(sources_22_09), len(sources_22_06)],  # Number of rows in sources dataframes
    "texts": [len(text_21_08), len(text_22_10), len(text_22_05), len(text_24_05), len(text_22_09), len(text_22_06)]  # Number of rows in text dataframes
})

# Display the sus_months dataframe
print(sus_months)


   month  sources   texts
0  21-08   150012  154411
1  22-10    87596   89224
2  22-05    85300   88880
3  24-05    58737   59595
4  22-09   108644  114876
5  22-06    80238   85029


## 22-03

### Sources

In [42]:
# specify the file path
file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-22-03.txt"  # Replace 'your_file_name.txt' with the actual file name

# check if the path is a valid file and ends with '.txt'
if os.path.isfile(file_path) and file_path.endswith('.txt'):
    try:
        # read the file into a dataframe, skipping bad lines
        sources_22_03 = pd.read_csv(
            file_path, 
            delimiter='\t', 
            encoding='ISO-8859-1',
            on_bad_lines='skip'  # Skip lines with too many or too few fields
        )
        
        # define and assign column names
        column_names = ["textID", "words", "date", "country", "source", "url", "headline"]
        sources_22_03.columns = column_names
        
        # display the dataframe
        sources_22_03
    except pd.errors.ParserError as e:
        print(f"ParserError in file: {file_path}")
        print(e)
        # log the problematic file
        with open("/work/LauraSørineVoldgaard#8191/problematic_files.log", "a") as log_file:
            log_file.write(f"ParserError in file: {file_path}\n")
    except Exception as e:
        print(f"General error in file: {file_path}. Error: {e}")
else:
    print(f"The specified file does not exist or is not a '.txt' file: {file_path}")



# Filter the DataFrame for rows where the "country" column is "US"
sources_22_03 = sources_22_03[sources_22_03["country"] == "US"]

# Display the filtered DataFrame
sources_22_03

  sources_22_03 = pd.read_csv(


Unnamed: 0,textID,words,date,country,source,url,headline
0,53716555,524.0,22-03-01,US,myfox8.com,https://myfox8.com/news/north-carolina/north-c...,North Carolina Department of Revenue is ready ...
1,53716562,446.0,22-03-01,US,statesman.com,https://www.statesman.com/story/news/politics/...,Greg Casar declares victory in Texas primary f...
2,53716571,355.0,22-03-01,US,engadget.com,https://www.engadget.com/apple-russia-online-s...,Apple halts sales of all products in Russia | ...
3,53716846,664.0,22-03-01,US,dailyprogress.com,https://dailyprogress.com/lifestyles/cities-wi...,Cities with the most people taking road trips ...
4,53716849,902.0,22-03-01,US,dailyprogress.com,https://dailyprogress.com/news/national/govt-a...,"New Mexico Gov. signs education bills, $10k te..."
...,...,...,...,...,...,...,...
128956,89014813,722.0,22-03-31,US,Minneapolis Star Tribune on MSN.com,https://www.msn.com/en-us/sports/nhl/seven-gam...,Seven-game win streak over for Wild after OT l...
128957,89014817,678.0,22-03-31,US,Orlando Sentinel,https://www.orlandosentinel.com/sports/florida...,Gators coach Billy Napier will be aggressive i...
128958,89014820,1018.0,22-03-31,US,Sports Illustrated,https://www.si.com/curling/news/men-with-broom...,Men With Brooms Turns 20
128959,89014821,200.0,22-03-31,US,Sports Illustrated,https://www.si.com/tv/mlb-spring-training/chic...,How to Watch Chicago White Sox at Cincinnati R...


In [43]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/sus_sources/sources_22_03_susnomore.csv"

# Save the filtered DataFrame as a CSV file
sources_22_03.to_csv(output_file_path, index=False, encoding='utf-8')

### Text

In [44]:
# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-03/22-03-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-03/22-03-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-03/22-03-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-03/22-03-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-22-03/22-03-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_22_03 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
text_22_03

Unnamed: 0,textID,body
0,33825603,Shares decline after Russia invades Ukraine.\n...
1,33825610,Satellite images taken on Monday show a Russia...
2,33825612,Overall employment might be the easiest way to...
3,33825613,"DRESDEN, Germany -- First vaccine opponents at..."
4,33825614,The Turkish and Russian leaders have found the...
...,...,...
136903,89014893,"Twitter has over 1 billion accounts, but only ..."
136904,89014894,The West has proclaimed Ukraine's victory in t...
136905,89014895,""" After much thought and consideration, the Ra..."
136906,89014896,During an interview with GB News anchor Dan Wo...


In [45]:
# Create the "sus_months" dataframe for text_22_10 and sources_22_10
sus_months = pd.DataFrame({
    "month": ["21-08", "22-10", "22-05", "24-05", "22-09", "22-06", "22-03"],  # Column for the months
    "sources": [len(sources_21_08), len(sources_22_10), len(sources_22_05), len(sources_24_05), len(sources_22_09), len(sources_22_06), len(sources_22_03)],  # Number of rows in sources dataframes
    "texts": [len(text_21_08), len(text_22_10), len(text_22_05), len(text_24_05), len(text_22_09), len(text_22_06), len(text_22_03)]  # Number of rows in text dataframes
})

# Display the sus_months dataframe
print(sus_months)


   month  sources   texts
0  21-08   150012  154411
1  22-10    87596   89224
2  22-05    85300   88880
3  24-05    58737   59595
4  22-09   108644  114876
5  22-06    80238   85029
6  22-03   128961  136908


# 23-03

## Sources

In [3]:
import pandas as pd
import html
import csv  # Required for quoting constants like QUOTE_NONE

file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-23-03.txt"

# Initialize lists to hold valid and invalid rows
valid_rows = []
invalid_rows = []

# Set maximum row limit
max_rows_to_process = 313875

try:
    # Read the file without chunks, handle quotes and force data types
    data = pd.read_csv(
        file_path,
        delimiter='\t',
        encoding='ISO-8859-1',
        on_bad_lines='warn',
        nrows=max_rows_to_process,
        quoting=csv.QUOTE_NONE,  # Treat all text literally, do not treat quotes as special
        escapechar='\\',  # Escape special characters if needed
        dtype=str,  # Force all columns to be read as strings
        low_memory=False  # Avoid mixed-type warnings
    )
    
    # Fill NaN values with empty strings to avoid issues with non-iterable types
    data.fillna('', inplace=True)

    # Assign column names (adjust based on actual structure if needed)
    data.columns = ["textID", "words", "date", "country", "source", "url", "headline"]

    # Decode HTML entities in text fields
    data["headline"] = data["headline"].apply(html.unescape)
    data["url"] = data["url"].apply(html.unescape)

    # Filter rows into valid and invalid lists
    for index, row in data.iterrows():
        if all(row.notnull()) and len(row) == 7:  # Ensure all fields are non-null and correctly structured
            valid_rows.append(row)
        else:
            invalid_rows.append(row)

except Exception as e:
    print(f"Error processing the file: {e}")

# Create a DataFrame from the valid rows
sources_23_03 = pd.DataFrame(valid_rows, columns=["textID", "words", "date", "country", "source", "url", "headline"])

# Display invalid rows for debugging, if needed
print(f"Invalid rows: {len(invalid_rows)}")
for row in invalid_rows[:5]:  # Display a sample of invalid rows
    print(row)

# Display the final DataFrame
sources_23_03

# Filter the DataFrame for rows where the "country" column is "US"
sources_23_03 = sources_23_03[sources_23_03["country"] == "US"]

# Display the filtered DataFrame
sources_23_03


Invalid rows: 0


Unnamed: 0,textID,words,date,country,source,url,headline
0,90659700,827,23-03-01,US,YAHOO!News,https://news.yahoo.com/belarus-leader-fully-su...,Belarus leader 'fully supports' China's Ukrain...
1,90659707,184,23-03-01,US,YAHOO!News,https://news.yahoo.com/jalen-carter-charged-re...,Jalen Carter charged with reckless driving in ...
2,90659710,2344,23-03-01,US,YAHOO!News,https://news.yahoo.com/male-contraceptive-pill...,`The male contraceptive pill is on its way - b...
3,90659714,415,23-03-01,US,New York Post,https://nypost.com/2023/02/09/ex-fbi-agent-nic...,Ex-FBI agent Nicole Parker: Bureau `politicall...
4,90659753,441,23-03-01,US,New York Post,https://nypost.com/2023/02/28/brendan-fraser-r...,Brendan Fraser reveals he almost died shooting...
...,...,...,...,...,...,...,...
72940,97896045,331,23-03-31,US,wvnews.com,https://wvnews.com/prestoncountynews/news/tunn...,Tunnelton man charged with making explosive de...
72941,97896046,694,23-03-31,US,wvnews.com,https://wvnews.com/sports/highschool/grafton-g...,Grafton girls track take 7th in season opening...
72942,97896047,805,23-03-31,US,wvnews.com,https://wvnews.com/sports/highschool/rcb-softb...,RCB softball scores 31 runs in seven innings t...
72943,97896048,837,23-03-31,US,wvnews.com,https://wvnews.com/sports/highschool/rcb-withs...,RCB withstands Elkins' late comeback


## Texts

In [4]:
import os
import re

# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-23-03/23-03-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-23-03/23-03-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-23-03/23-03-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-23-03/23-03-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-23-03/23-03-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_23_03 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
print(text_23_03)


          textID                                               body
0       90659700  Belarus President Alexander Lukashenko told hi...
1       90659707  On Wednesday at 10:30 AM at the NFL Scouting C...
2       90659710  Eloise Hendy\nMarch 1, 2023, 8:07 AM*9 min rea...
3       90659714  Former FBI Special Agent Nicole Parker testifi...
4       90659803  An image of Douglas Rushkoff, a man who is kno...
...          ...                                                ...
149103  97895897  John Allore was riding his bicycle around 7 a....
149104  97895898  The images, which first appeared on a Facebook...
149105  97895899  " The best chance for storms will be from 10 a...
149106  97895981  Amazon has impressive deals including the ARM ...
149107  97895988  The Town of Waynesville had a strong plan befo...

[149108 rows x 2 columns]


In [5]:
# Convert 'textID' to numeric, and raise an error if there are invalid values
sources_23_03['textID'] = pd.to_numeric(sources_23_03['textID'], errors='raise')

# Convert 'textID' to integers
sources_23_03['textID'] = sources_23_03['textID'].astype(int)

# Verify the data type
print(sources_23_03.dtypes)  # 'textID' should now be int64

textID       int64
words       object
date        object
country     object
source      object
url         object
headline    object
dtype: object


In [7]:
print(len(sources_23_03))
print(len(text_23_03))
#print(len(merged_23_03))

72945
149108


In [8]:
print(sources_23_03['textID'].duplicated().sum())  # Number of duplicate rows in sources_23_03
print(text_23_03['textID'].duplicated().sum())    # Number of duplicate rows in text_23_03


0
76165


In [9]:
# number of duplicate rows (before removing the NaN in body column rows)
print(text_23_03['textID'].duplicated().sum())    # Number of duplicate rows

# number of duplicate rows (after removing the NaN in body column rows)
duplicates = text_23_03[text_23_03['body'].duplicated(keep=False)]
duplicates


76165


Unnamed: 0,textID,body
0,90659700,Belarus President Alexander Lukashenko told hi...
1,90659707,On Wednesday at 10:30 AM at the NFL Scouting C...
2,90659710,"Eloise Hendy\nMarch 1, 2023, 8:07 AM*9 min rea..."
3,90659714,Former FBI Special Agent Nicole Parker testifi...
4,90659803,"An image of Douglas Rushkoff, a man who is kno..."
...,...,...
149102,97895894,A WRAL Weather Alert Day has been issued for S...
149103,97895897,John Allore was riding his bicycle around 7 a....
149104,97895898,"The images, which first appeared on a Facebook..."
149105,97895899,""" The best chance for storms will be from 10 a..."


In [10]:
# Count duplicates before removal
print(f"Number of duplicate rows (all columns): {text_23_03.duplicated().sum()}")
print(f"Number of duplicate rows (textID only): {text_23_03['textID'].duplicated().sum()}")

# Remove duplicates based on the entire row (all columns)
text_23_03 = text_23_03.drop_duplicates()

# Remove duplicates based only on 'textID' column
text_23_03 = text_23_03.drop_duplicates(subset=['textID'])

# Remove duplicates based on 'body' column and keep the first occurrence
text_23_03 = text_23_03.drop_duplicates(subset=['body'], keep='first')

# Display number of duplicates after removal
print(f"Number of duplicate rows (all columns): {text_23_03.duplicated().sum()}")



Number of duplicate rows (all columns): 74549
Number of duplicate rows (textID only): 76165
Number of duplicate rows (all columns): 0


In [11]:
# display text_23_03 without duplicate rows
text_23_03

Unnamed: 0,textID,body
0,90659700,Belarus President Alexander Lukashenko told hi...
1,90659707,On Wednesday at 10:30 AM at the NFL Scouting C...
2,90659710,"Eloise Hendy\nMarch 1, 2023, 8:07 AM*9 min rea..."
3,90659714,Former FBI Special Agent Nicole Parker testifi...
4,90659803,"An image of Douglas Rushkoff, a man who is kno..."
...,...,...
135320,97895894,A WRAL Weather Alert Day has been issued for S...
135321,97895897,John Allore was riding his bicycle around 7 a....
135322,97895898,"The images, which first appeared on a Facebook..."
135323,97895899,""" The best chance for storms will be from 10 a..."


In [12]:
merged_23_03 = sources_23_03.merge(text_23_03, on = "textID", how = "inner") # join sources dataframe with their corresponding article content using the textIDs, keeping only the articles where text IDs match. How = left to ensure that it is a dataframe and series, not a list
merged_23_03

Unnamed: 0,textID,words,date,country,source,url,headline,body
0,90659700,827,23-03-01,US,YAHOO!News,https://news.yahoo.com/belarus-leader-fully-su...,Belarus leader 'fully supports' China's Ukrain...,Belarus President Alexander Lukashenko told hi...
1,90659707,184,23-03-01,US,YAHOO!News,https://news.yahoo.com/jalen-carter-charged-re...,Jalen Carter charged with reckless driving in ...,On Wednesday at 10:30 AM at the NFL Scouting C...
2,90659710,2344,23-03-01,US,YAHOO!News,https://news.yahoo.com/male-contraceptive-pill...,`The male contraceptive pill is on its way - b...,"Eloise Hendy\nMarch 1, 2023, 8:07 AM*9 min rea..."
3,90659714,415,23-03-01,US,New York Post,https://nypost.com/2023/02/09/ex-fbi-agent-nic...,Ex-FBI agent Nicole Parker: Bureau `politicall...,Former FBI Special Agent Nicole Parker testifi...
4,90659753,441,23-03-01,US,New York Post,https://nypost.com/2023/02/28/brendan-fraser-r...,Brendan Fraser reveals he almost died shooting...,""" I was choked out accidentally, "" said Fraser..."
...,...,...,...,...,...,...,...,...
71009,97896045,331,23-03-31,US,wvnews.com,https://wvnews.com/prestoncountynews/news/tunn...,Tunnelton man charged with making explosive de...,TUNNELTON -- A Tunnelton man was charged with ...
71010,97896046,694,23-03-31,US,wvnews.com,https://wvnews.com/sports/highschool/grafton-g...,Grafton girls track take 7th in season opening...,"BRIDGEPORT, W.Va. ( WV News ) -- The Grafton g..."
71011,97896047,805,23-03-31,US,wvnews.com,https://wvnews.com/sports/highschool/rcb-softb...,RCB softball scores 31 runs in seven innings t...,"CLARKSBURG, W.Va. ( WV News ) -- Robert C. Byr..."
71012,97896048,837,23-03-31,US,wvnews.com,https://wvnews.com/sports/highschool/rcb-withs...,RCB withstands Elkins' late comeback,"CLARKSBURG, W.Va. ( WV News ) -- A day after e..."


In [13]:
# Define the file path where you want to save the CSV
output_file_path = "/work/LauraSørineVoldgaard#8191/data/merged_23_03.csv"

# Save the filtered DataFrame as a CSV file
merged_23_03.to_csv(output_file_path, index=False, encoding='utf-8')

# 24-11

## Sources

In [6]:
import pandas as pd

file_path = "/work/LauraSørineVoldgaard#8191/data/sources/sources-24-11.txt"

# Set maximum rows to process
max_rows_to_process = 313875

try:
    # Read the file line by line and split manually on tab character
    with open(file_path, encoding='ISO-8859-1') as file:
        lines = file.readlines()[:max_rows_to_process]  # Limit rows if needed
    
    # Manually split each line on the tab character
    data = [line.strip().split('\t') for line in lines]

    # Convert to a DataFrame
    raw_data = pd.DataFrame(data)

    # Assign column names (adjust these names based on your data)
    raw_data.columns = ["textID", "words", "date", "country", "source", "url", "headline"]

    # Inspect the DataFrame
    #print(raw_data.head())

except Exception as e:
    print(f"Error processing the file: {e}")

# Filter for rows where country is "US"
sources_24_11 = raw_data[raw_data["country"] == "US"]

# Save the cleaned and filtered DataFrame
sources_24_11.to_csv("filtered-sources-24-11.csv", index=False)

# Display the filtered DataFrame
sources_24_11


Unnamed: 0,textID,words,date,country,source,url,headline
0,116932171,158,24-11-01,US,jpost.com,https://www.jpost.com/israel-news/article-826346\,leaked to the Israeli media
1,116975644,435,24-11-01,US,nbcbayarea.com,https://www.nbcbayarea.com/news/local/san-fran...,United Airlines attack
2,116975645,137,24-11-01,US,nbcbayarea.com,https://www.nbcbayarea.com/news/local/san-fran...,Nima Momeni trial
3,116975650,211,24-11-01,US,nbcbayarea.com,https://www.nbcbayarea.com/decision-2024/harri...,"Harris, Trump campaigns make final West Coast ..."
4,116975651,275,24-11-01,US,nbcbayarea.com,https://www.nbcbayarea.com/news/local/embattle...,Embattled San Jose councilman facing more call...
...,...,...,...,...,...,...,...
107982,118313484,957,24-11-30,US,zdnet.com,https://zdnet.com/article/one-of-my-favorite-b...,One of my favorite big-screen tablets for wat...
107983,118313486,77,24-11-30,US,zdnet.com,https://zdnet.com/article/best-black-friday-be...,The 30+ best live Black Friday Best Buy deals...
107984,118313498,1190,24-11-30,US,zdnet.com,https://zdnet.com/article/the-latest-version-o...,The latest version of my favorite Garmin smar...
107985,118313502,1492,24-11-30,US,zdnet.com,https://zdnet.com/article/the-2-in-1-laptop-i-...,The 2-in-1 laptop I recommend to most people ...


In [11]:
# Exclude rows with specific dates
included_dates = ['24-11-01', '24-11-02', '24-11-03', '24-11-04', '24-11-05']
df_filtered = sources_24_11[sources_24_11['date'].isin(included_dates)]

# Display the filtered dataframe
df_filtered

# Save the filtered dataframe back to a new CSV file
df_filtered.to_csv("filtered_sources-24-11.csv", index=False)

## Texts

In [4]:
import os
import re

# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-11/24-11-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-11/24-11-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-11/24-11-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-11/24-11-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-24-11/24-11-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_24_11 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
print(text_24_11)


Mismatch in IDs and articles in file: /work/LauraSørineVoldgaard#8191/data/text/text-24-11/24-11-us3.txt
Number of IDs: 21463, Number of Articles: 21462
Empty article body detected for textID @@118192649
          textID                                               body
0      116953714  Former President Bill Clinton rallied a crowd ...
1      116953716  Besides delivering gas to homes, the pipelines...
2      116953717  Starting this month, Central Maine Power Co. c...
3      116953718  Lawmakers have scaled back an ambitious propos...
4      116953719  Gun rights advocates hold signs among the gun ...
...          ...                                                ...
85743  118313294  A New Hampshire man is being remembered as a r...
85744  118313297  A woman is OK after she fell asleep at the whe...
85745  118313484  Why you can trust ZDNET: ZDNET independently t...
85746  118313486  Why you can trust ZDNET: ZDNET's expert staff ...
85747  118313498  Why you can trust ZDNET: ZDNET

In [10]:
# Convert 'textID' to integers
text_24_11['textID'] = text_24_11['textID'].astype(int)
sources_24_11['textID'] = sources_24_11['textID'].astype(int)


# Verify the data type
print(text_24_11.dtypes)  # 'textID' should now be int64
print(sources_24_11.dtypes)

textID     int64
body      object
dtype: object
textID       int64
words       object
date        object
country     object
source      object
url         object
headline    object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sources_24_11['textID'] = sources_24_11['textID'].astype(int)


In [11]:
merged_24_11 = sources_24_11.merge(text_24_11, on = "textID", how = "inner") # join sources dataframe with their corresponding article content using the textIDs, keeping only the articles where text IDs match. How = left to ensure that it is a dataframe and series, not a list
merged_24_11

Unnamed: 0,textID,words,date,country,source,url,headline,body
0,116977976,358,24-11-01,US,oregonlive.com,https://www.oregonlive.com/pacific-northwest-n...,Pacific Northwest2 Oregon mushroom hunters res...,Two Oregon mushroom hunters have been rescued ...
1,116977978,319,24-11-01,US,oregonlive.com,https://www.oregonlive.com/crime/2024/11/vanco...,Vancouver Mall shooting was a `targeted attack...,"As of Friday morning, the shooter was still at..."
2,116977979,817,24-11-01,US,oregonlive.com,https://www.oregonlive.com/clark-county/2024/1...,"Halloween shooting at Vancouver Mall kills 1, ...",One person was killed and two injured in a sho...
3,116977981,156,24-11-01,US,oregonlive.com,https://www.oregonlive.com/politics/2024/11/bo...,Boarded windows return to downtown Portland ah...,"Updated: Nov. 01, 2024, 10:59 a.m.\nPublished:..."
4,116977983,658,24-11-01,US,oregonlive.com,https://www.oregonlive.com/ducks/2024/11/orego...,Oregon football vs. Michigan preview: Ducks ma...,"Updated: Nov. 01, 2024, 2:42 p.m.\nPublished: ..."
...,...,...,...,...,...,...,...,...
85164,118313484,957,24-11-30,US,zdnet.com,https://zdnet.com/article/one-of-my-favorite-b...,One of my favorite big-screen tablets for wat...,Why you can trust ZDNET: ZDNET independently t...
85165,118313486,77,24-11-30,US,zdnet.com,https://zdnet.com/article/best-black-friday-be...,The 30+ best live Black Friday Best Buy deals...,Why you can trust ZDNET: ZDNET's expert staff ...
85166,118313498,1190,24-11-30,US,zdnet.com,https://zdnet.com/article/the-latest-version-o...,The latest version of my favorite Garmin smar...,Why you can trust ZDNET: ZDNET independently t...
85167,118313502,1492,24-11-30,US,zdnet.com,https://zdnet.com/article/the-2-in-1-laptop-i-...,The 2-in-1 laptop I recommend to most people ...,Why you can trust ZDNET: ZDNET independently t...


# 20-11

## Texts

In [12]:
import os
import re

# List of file paths to process
file_paths = [
    "/work/LauraSørineVoldgaard#8191/data/text/text-20-11/20-11-us1.txt",  # Replace with your actual file names
    "/work/LauraSørineVoldgaard#8191/data/text/text-20-11/20-11-us2.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-20-11/20-11-us3.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-20-11/20-11-us4.txt",
    "/work/LauraSørineVoldgaard#8191/data/text/text-20-11/20-11-us5.txt",
]

# Initialize an empty list to hold dataframes for each file
all_texts = []

# Loop through the list of files
for file_path in file_paths:
    # Check if the file exists
    if os.path.isfile(file_path) and file_path.endswith('.txt'):
        # Open the file and read its contents into a string
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()  # Read all text data from the file

        # Preprocess the text
        sample = raw_text
        sample = re.sub(r" ([.,?!':])", r"\1", sample)  # Remove spaces before punctuation
        sample = re.sub(r"@ @ @ @ @ @ @ @ @ @", "CENSOREDfrfrfr", sample)  # Replace the chosen censor keyword

        # Split the text into articles based on '@@' markers
        article_ids = re.findall(r"@@(\d+)", sample)  # Extract all article IDs
        articles = re.split(r'"?@@\d+ ', sample)[1:]  # Split articles by article IDs
        articles = [art[art.find("<p> ") + 4:].strip().replace(" <p> ", "\n") for art in articles]  # Process article content

        # Check if the number of IDs matches the number of articles
        if len(article_ids) == len(articles):
            # Create a dataframe with textIDs and article contents
            text = pd.DataFrame(data=dict(textID=article_ids, body=articles))
            text["textID"] = text["textID"].astype(int)  # Convert textID to integer for merging compatibility
            all_texts.append(text)  # Append the dataframe to the list
        else:
            # Handle mismatch between IDs and articles
            print(f"Mismatch in IDs and articles in file: {file_path}")
            print(f"Number of IDs: {len(article_ids)}, Number of Articles: {len(articles)}")

            # Debug the mismatch
            for idx, (article_id, article_body) in enumerate(zip(article_ids, articles)):
                if article_body.strip() == "":  # Check for empty article bodies
                    print(f"Empty article body detected for textID @@{article_id}")
                    break
            else:
                # If no empty bodies, identify extra IDs or articles
                if len(article_ids) > len(articles):
                    print(f"Extra textID found: @@{article_ids[len(articles)]}")
                elif len(articles) > len(article_ids):
                    print(f"Extra article body detected: {articles[len(article_ids)]}")
    else:
        print(f"The file does not exist or is not a valid '.txt' file: {file_path}")

# Concatenate all dataframes from the all_texts list into one dataframe
text_20_11 = pd.concat(all_texts, ignore_index=True)

# Display the final dataframe
print(text_20_11)


          textID                                               body
0       32188714  This Halloween, a new generation of hosts are ...
1       32188715  Twenty-five years before he was elected presid...
2       32188716  Four years ago, I did n't have a plan. Donald ...
3       32188719  The HSE said that while evidence is emerging o...
4       32188800  SAN JOSE -- A San Jose police officer pleaded ...
...          ...                                                ...
132326  86347289  Get the Morning Brief sent directly to your in...
132327  86347291  The Transportation Security Administration, or...
132328  86347296  The new rules are " like booby traps " for the...
132329  86347298  This Rutgers season has arrived to the point t...
132330  86347299  Supporters of President Trump protesting in La...

[132331 rows x 2 columns]


In [13]:
# read csv so you don't have to run all the above again
file_path = "/work/LauraSørineVoldgaard#8191/data/2020_sources/sources-20-11.csv"
sources_20_11 = pd.read_csv(file_path)
sources_20_11

Unnamed: 0,textID,words,date,country,source,url,headline
0,32188665,176,20-11-01,US,theguardian.com,https://amp.theguardian.com/environment/2020/o...,Chameleon last seen a century ago rediscovered...
1,32188666,803,20-11-01,US,theguardian.com,https://amp.theguardian.com/world/live/2020/oc...,"Italian cases jump by 31,000 in a day - as it ..."
2,32188667,756,20-11-01,US,washingtonpost.com,https://www.washingtonpost.com/opinions/what-t...,Opinion | What Trump and Biden's travel schedu...
3,32188714,1269,20-11-01,US,nytimes.com,https://www.nytimes.com/2020/10/30/arts/televi...,TV's Horror Hosts: 70 Years of Screams and Che...
4,32188715,1543,20-11-01,US,nytimes.com,https://www.nytimes.com/2020/10/30/business/tr...,How a Century of Real-Estate Tax Breaks Enrich...
...,...,...,...,...,...,...,...
162005,86347342,128,20-11-30,US,Washington Post,https://www.washingtonpost.com/opinions/letter...,The red wolves can breathe new life into Virgi...
162006,86347343,706,20-11-30,US,Washington Post,https://www.washingtonpost.com/politics/un-pan...,UN: Pandemic to fan surge in humanitarian need...
162007,86347346,1031,20-11-30,US,WLWT,https://www.wlwt.com/article/nbc-announces-lin...,"NBC announces lineup of Christmas, holiday spe..."
162008,86347351,443,20-11-30,US,ZDNet,https://www.zdnet.com/article/ai-approach-coul...,AI approach could solve the problem of ROI for...


In [14]:
# Convert 'textID' to integers
text_20_11['textID'] = text_20_11['textID'].astype(int)
sources_20_11['textID'] = sources_20_11['textID'].astype(int)


# Verify the data type
print(text_20_11.dtypes)  # 'textID' should now be int64
print(sources_20_11.dtypes)

textID     int64
body      object
dtype: object
textID       int64
words        int64
date        object
country     object
source      object
url         object
headline    object
dtype: object


In [15]:
merged_20_11 = sources_20_11.merge(text_20_11, on = "textID", how = "inner") # join sources dataframe with their corresponding article content using the textIDs, keeping only the articles where text IDs match. How = left to ensure that it is a dataframe and series, not a list
merged_20_11

Unnamed: 0,textID,words,date,country,source,url,headline,body
0,32188665,176,20-11-01,US,theguardian.com,https://amp.theguardian.com/environment/2020/o...,Chameleon last seen a century ago rediscovered...,Scientists have found an elusive chameleon spe...
1,32188666,803,20-11-01,US,theguardian.com,https://amp.theguardian.com/world/live/2020/oc...,"Italian cases jump by 31,000 in a day - as it ...",England is expected to go into national lockdo...
2,32188714,1269,20-11-01,US,nytimes.com,https://www.nytimes.com/2020/10/30/arts/televi...,TV's Horror Hosts: 70 Years of Screams and Che...,"This Halloween, a new generation of hosts are ..."
3,32188715,1543,20-11-01,US,nytimes.com,https://www.nytimes.com/2020/10/30/business/tr...,How a Century of Real-Estate Tax Breaks Enrich...,Twenty-five years before he was elected presid...
4,32188716,2424,20-11-01,US,nytimes.com,https://www.nytimes.com/2020/10/31/your-money/...,Investing for the Future in the United States ...,"Four years ago, I did n't have a plan. Donald ..."
...,...,...,...,...,...,...,...,...
130948,86347342,128,20-11-30,US,Washington Post,https://www.washingtonpost.com/opinions/letter...,The red wolves can breathe new life into Virgi...,I am pleased to read there is talk of reintrod...
130949,86347343,706,20-11-30,US,Washington Post,https://www.washingtonpost.com/politics/un-pan...,UN: Pandemic to fan surge in humanitarian need...,"By Associated Press\nDecember 1, 2020 at 12:02..."
130950,86347346,1031,20-11-30,US,WLWT,https://www.wlwt.com/article/nbc-announces-lin...,"NBC announces lineup of Christmas, holiday spe...","NBC has announced its holiday lineup, complete..."
130951,86347351,443,20-11-30,US,ZDNet,https://www.zdnet.com/article/ai-approach-coul...,AI approach could solve the problem of ROI for...,COVID has created CENSOREDfrfrfr\nPhoto: Tom F...
