# RTF Conversion

This is the initial stage, where we convert all the raw rtf files into a .csv file that we can then use with Pandas for later parts of the data processing and analysis.

In [1]:
import os

final_list = []

output_folder = "revised_complete_rtf_files"
directory = r"/Users/RyanChan/Desktop/University/3B-Spring23/DH-412 History and the digital/all-years-only-rtf"
# iterate over files in
# that directory
for filename in os.listdir(directory):
    file = os.path.join(directory, filename)
    if os.path.isfile(file):
        final_list.append(file)

print(final_list[0:5])

['/Users/RyanChan/Desktop/University/3B-Spring23/DH-412 History and the digital/all-years-only-rtf/1993a-201-300-Primer of populist progressives; They claim subtle and complex differences from DFL liberals.rtf', '/Users/RyanChan/Desktop/University/3B-Spring23/DH-412 History and the digital/all-years-only-rtf/2002a-101-200-Pretty Poison.rtf', "/Users/RyanChan/Desktop/University/3B-Spring23/DH-412 History and the digital/all-years-only-rtf/1987-01-100-RECIFE JOURNAL; BRAZIL 'S FLESHPOTS BRING TOURISTS AND A BACKLASH.rtf", '/Users/RyanChan/Desktop/University/3B-Spring23/DH-412 History and the digital/all-years-only-rtf/2010b-01-80-Life in U.S. Brings Success and Visibility For Muslim Women.rtf', '/Users/RyanChan/Desktop/University/3B-Spring23/DH-412 History and the digital/all-years-only-rtf/1990a-201-300-Beating Time Warner at Its Own Game.rtf']


In [2]:
from striprtf.striprtf import rtf_to_text
import re
from datetime import datetime

def rtf_dict(file):
    with open(file, 'r', encoding='utf-8', errors='ignore') as infile:
        content = infile.read()
        text = rtf_to_text(content).strip()
    # print(text)
    # Extract the values for each key
    if "Body" in text:

        sub_list = ["The New York Times", "St. Louis Post-Dispatch (Missouri)", 
            "Christian Science Monitor (Boston, MA)", "Star Tribune (Minneapolis, MN)", 
            "Pittsburgh Post-Gazette (Pennsylvania)", "The Atlanta Journal and Constitution",
            "St. Petersburg Times (Florida)", "The Philadelphia Inquirer", "USA TODAY", 
            "The Tampa Tribune (Florida)", "Journal of Commerce", "Daily News (New York)", 
            "Philadelphia Daily News"]
        
        title = "TITLE NOT FOUND"
        for i in range(len(sub_list)):
            if sub_list[i] in text:
                title = text.split(sub_list[i])[0]

        publisher = "PUBLISHER NOT FOUND"
        for i in range(len(sub_list)):
            if text.find(sub_list[i]) != -1:
                publisher = sub_list[i]
                break

        try:
            match = re.search(r'(?i)\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May?|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) (0?[1-9]|[12][0-9]|3[01]), [0-9]+', text)
            date = match.group()

            date = datetime.strptime(date, '%B %d, %Y')
            
            year = date.strftime("%Y")
            month = date.strftime("%m")
            day = date.strftime("%d")
        except AttributeError:
            year = "0000"
            month = "00"
            day = "00"   
        except ValueError:
            year = "0000"
            month = "00"
            day = "00"

        full_text = "\n".join(text.split("\n")[text.split("\n").index("Body")+1:text.split('\n').index("End of Document")]).strip().strip('\n')


        dictionary = {"title": title, "publisher": publisher, "year": year, "month": month, "day": day, "full text": full_text}
        
        
        return dictionary
    else:
        return None

# Print the dictionary
# print(dictionary)

In [3]:
import os
folder_path = output_folder


dict_lst = []
for file in final_list:
    if "(GRAPHIC ONLY)" in file or "(PHOTO ONLY)" in file:
        continue
    dct = rtf_dict(os.path.join(folder_path, file))
    if dct is not None:
        dict_lst.append(dct)

In [4]:
dict_lst[0]

{'title': 'Primer of populist progressives;\n They claim subtle and complex differences from DFL liberals\n',
 'publisher': 'Star Tribune (Minneapolis, MN)',
 'year': '1993',
 'month': '02',
 'day': '28',
 'full text': 'Lots of Minnesota\'s liberal DFLers these days seem to be calling themselves "progressives" or even "populist progressives."\n In the St. Paul mayoral election, for instance, Mayor Jim Scheibel and U.S. Sen. Paul Wellstone have launched a well-publicized effort to elect not just any Democrat, but a candidate with a "progressive agenda."\n Up the hill in the state House, there\'s a growing subcaucus called the Progressive Study Group, which meets regularly and elects officers.\n The use of the term "progressive" has been growing for at least a decade. But many Minnesotans probably wonder: What is the difference between these progressives and your garden-variety liberal Democrat?\nConservatives and Republicans say not much, except that progressives are more extreme. Their

In [5]:
import csv
output_file = "all_years_raw.csv"

# Write the dictionary to the CSV file
with open(output_file, "w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=dict_lst[0].keys())
    writer.writeheader()
    for dct in dict_lst:
        writer.writerow(dct)
    file.close()

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv(output_file, encoding="latin-1")
df["full text"][0]

'Lots of Minnesota\'s liberal DFLers these days seem to be calling themselves "progressives" or even "populist progressives."\n In the St. Paul mayoral election, for instance, Mayor Jim Scheibel and U.S. Sen. Paul Wellstone have launched a well-publicized effort to elect not just any Democrat, but a candidate with a "progressive agenda."\n Up the hill in the state House, there\'s a growing subcaucus called the Progressive Study Group, which meets regularly and elects officers.\n The use of the term "progressive" has been growing for at least a decade. But many Minnesotans probably wonder: What is the difference between these progressives and your garden-variety liberal Democrat?\nConservatives and Republicans say not much, except that progressives are more extreme. Their critics say "progressive" is nothing more than a euphemism for "radical" or "ultra-liberal."\n But progressives themselves say the difference between themselves and liberals is subtle, complex and important. They also 

In [12]:
# some statistics to get an idea of our raw data
# df.count() # 43165 rows returned
# (df["title"] =="TITLE NOT FOUND").sum() # 1311 titles not found
# (df["publisher"] =="PUBLISHER NOT FOUND").sum() # 1311 publishers not found
# (df["year"] ==0000).sum() # 4 articles where date was not found (year, month, or date)


4

In [None]:
df.sample(n=50)