# Open files in folder, process, save output files


## Initialisation
Load required modules

In [98]:
# initialisation

import pandas as pd
pd.__version__

'0.24.2'

In [99]:
import re
re.__version__

'2.2.1'

In [100]:
import csv
csv.__version__

'1.0'

## Define functions

In [101]:
# define the transformation function for the mapping.

def process_body(body_input):
    '''
    Clean one body text and return the cleaned string.

    The cleansing steps, in order:
        remove anything that looks like a tag (<...>, also across lines)
        replace line breaks and tabs with a space
        remove digits
        replace selected html entities and typographic characters
        remove double quotes
        make everything lower case

    body_input  the raw body text; None or NaN (a missing value) is
                treated as a body with no text
    returns     the cleaned, lower-cased text, '' for a missing body
    '''
    # a missing value is not a string, so re.sub would raise TypeError on it;
    # NaN is the only float that is not equal to itself
    if body_input is None or (
        isinstance(body_input, float) and body_input != body_input
    ):
        return ""

    # remove all tags; [\s\S] also matches newlines so multi-line tags go too
    result = re.sub(r"<[\s\S]+?>", "", body_input)

    # replace line breaks and tabs with a space
    result = re.sub(r"[\n\r\t]", " ", result)

    # remove numbers
    result = re.sub(r"\d", "", result)

    # literal replacements, applied strictly in this order
    # NOTE(review): the literal right single quote ’ is left untouched even
    # though its encoded form &rsquo; is replaced - confirm this is intended
    replacements = (
        # encoded html characters
        ("&nbsp;", " "),
        ("&amp;", "and"),
        # fancy quotes
        ("”", " "),
        ("“", " "),
        ("‘", " "),
        # encoded quotes
        ("&rsquo;", " "),
        ("&lsquo;", " "),
        ("&rdquo;", " "),
        ("&ldquo;", " "),
        # bullet and £ sign
        ("•", " "),
        ("£", " "),
        # % sign becomes the word
        ("%", "percent"),
        # double quotes mess with the csv format
        ('"', ""),
    )
    for old, new in replacements:
        result = result.replace(old, new)

    ## old stories may have other umlauts and accented characters in,
    ## but the new stories don't seem to have this
    ## so they are left as they are for now

    # make everything lower case and return the cleaned result
    return result.lower()

In [102]:
# write out to csv
# make a DataFrame with the clean body column, not the old one
def writefile(df, filename):
    '''
    Save the cleaned data to ./<filename>_out.csv

        df        dataframe providing the bodyclean and Category columns
        filename  base name of the output file, without extension

    The output has two columns: Body (taken from bodyclean) and
    Category (converted to str). No index column is written and all
    non-numeric fields are quoted.
    '''
    # the cleaned text is written under the name of the column it replaces
    cleaned = pd.DataFrame({"Body": df.bodyclean, "Category": df.Category})
    cleaned["Category"] = df.Category.astype(str)

    destination = "".join(("./", filename, "_out.csv"))
    cleaned.to_csv(destination, quoting=csv.QUOTE_NONNUMERIC, index=False)

In [103]:
def processfile(filename):
    '''
    For the provided filename (given without the .csv extension), calls functions to
        open the file     reads ./<filename>.csv into a dataframe
        process the file  cleans the Body column into a new bodyclean column
        save the file     hands the dataframe and filename to writefile
    '''
    df = pd.read_csv("./"+filename+".csv")
    # process_body takes a single value, so it can be mapped directly
    df['bodyclean'] = df.Body.map(process_body)
    writefile(df, filename)

## Set up instance variables
i.e., the list of files to process

In [104]:
# fileList = ["news20190320","news_uncategorised"]
# base names (without the .csv extension) of the files to process
fileList = [
    "news_bodytext_20191002_prep",
    "news_unclassified_prep",
    "news20190320_prep",
]

# show the names, one per line
for file in fileList:
    print(file)

news_bodytext_20191002_prep
news_unclassified_prep
news20190320_prep


## Process the data

In [105]:
# run processfile once for every name in fileList, in list order
for file in fileList:
    processfile(file)