In [23]:
#!/usr/bin/env python
'''
    File name: Df_non_ascii_cleanup.py
    Description: Takes a csv file or a folder of csv files and cleans up non-ASCII characters in each
    Date created: 7/25/2018
    Date last modified: 7/26/2018
    Python Version: 3.6.
'''
__author__= "Shanjeev"
__email__ = "Shanjeev.rajendiran@motorolasolutions.com"

In [1]:
import pandas as pd
import numpy as np
import re
import sys
import os
import time
import traceback

In [2]:
def detect_non_ascii(row,regex_pattern):

    """ Detects non-ascii characters and return True/False for masking

        Parameters:
        param row: the entire row in dataframe; every cell is converted
                   with str() before matching
        param regex_pattern: Regular expression for non-ascii characters

        Returns: True if found, False if otherwise.
    """
    # Cells are concatenated so a single search covers the whole row.
    # str() is applied per cell so missing values (None/NaN) can never
    # reach ''.join as non-string objects.
    joined = ''.join(map(str, row))

    # re.search stops at the first hit; there is no need to collect every
    # match just to learn whether one exists.
    return re.search(regex_pattern, joined) is not None

In [None]:
def csv_finder():
    """ Function to return csv files and exit if found none.

        Reads sys.argv: the single optional argument may be a csv file or
        a folder; without an argument the current working directory is
        searched.

        Returns: List of Files. A file argument is returned as a
        one-element list; files found in a folder argument are prefixed
        with that folder; files found in the current working directory
        are returned as bare names.

        Exits through sys.exit when more than one argument is given, when
        the file argument is not a csv, or when the folder holds no csv
        files.
    """
    args = sys.argv[1:]
    if len(args) > 1:
        sys.exit('The script only accepts one argument ex <filename.csv> or <folder>')

    target = args[0] if args else None

    if target is not None and os.path.isfile(target):
        if not target.endswith('.csv'):
            sys.exit("Please enter a CSV file")
        # A list literal is required here: list(target) would split the
        # path into its individual characters.
        return [target]

    folder = os.getcwd() if target is None else target
    names = [name for name in os.listdir(folder) if name.endswith('.csv')]

    if not names:
        sys.exit("No csv files in the given folder")

    if target is None:
        # Bare names already resolve against the working directory.
        return names

    # os.listdir yields bare names; prefix the folder so the paths resolve
    # even when the folder is not the working directory.
    return [os.path.join(folder, name) for name in names]

In [3]:
def clean(row,regex_pattern):

    """ Looks for non-ascii characters columnwise and replaces with empty string

        Parameters:
        param row: text of one cell of the given column in dataframe
        param regex_pattern: Regular expression for non-ascii characters

        Returns: the text with every match removed; text without any match
        comes back unchanged.
    """
    # re.sub leaves text without matches untouched, so a separate findall
    # pre-check would only scan the same text a second time.
    return re.sub(regex_pattern, '', row)

In [4]:
def main():
    """ Takes up csv file as argument from CLI,
        reads the csv file cleans away the non ascii characters,
        records in 'non_ascii' column at the last
        and saves the final csv arrived.

        For every path returned by csv_finder():
        - the file is read with cp1252 encoding; if reading fails the file
          is renamed to '<name>_<timestamp>_error.csv' and skipped
        - rows matching the non-ascii pattern are flagged 'Y' in the
          'non_ascii' column, all other rows 'N'
        - object (text) columns are stripped of the matching characters
        - the result is written to '<name>_cleaned.csv'
    """
    # The pattern is the same for every file, so it is defined once.
    regex_pattern = r'[^\x00-\x7F]'

    for csv_file in csv_finder():
        # splitext touches only the trailing extension; str.replace would
        # also rewrite a '.csv' occurring earlier in the path.
        base = os.path.splitext(csv_file)[0]
        error_file = base + time.strftime("_%d-%m-%Y_%H-%M-%S") + '_error.csv'
        output_file = base + '_cleaned.csv'

        # Reading the csv file and renaming it if it cannot be read.
        # Exception (not a bare except) keeps Ctrl-C and sys.exit working.
        try:
            df = pd.read_csv(csv_file,encoding='cp1252')
        except Exception:
            os.rename(csv_file,error_file)
            print('Error encoding the csv file -{}'.format(csv_file))
            continue

        # Flagging the corrupted rows in the "non_ascii" column
        df['non_ascii'] = 'N'
        flagged = df.apply(lambda x: detect_non_ascii(x,regex_pattern),axis = 1)
        df.loc[flagged,'non_ascii'] = 'Y'

        # Finding and replacing the non ascii characters in the text columns.
        # Missing values are filled per column: filling the whole frame here
        # would turn numeric columns into object columns mid-iteration.
        for col in df.columns:
            if df[col].dtype == ('O'):
                filled = df[col].fillna('')
                df[col] = filled.apply(lambda x: clean(str(x),regex_pattern))

        # Saving as csv file
        df.to_csv(output_file)

In [5]:
if __name__ == '__main__':
    # Script entry point: run main() and print the traceback of any
    # unexpected error.
    try:
        main()
    except Exception:
        # Catching Exception rather than using a bare except lets
        # SystemExit (raised by sys.exit) and KeyboardInterrupt propagate,
        # so an intentional exit is not reported as a crash.
        print(traceback.format_exc())