In [1]:
import pandas as pd
import re
import unicodedata
import glob
from pathlib import Path


## Create functions for load raw csv as dataframes

In [2]:
class LoadRawDF:
    def __init__(self):
        
        # Function to create raw dataframes

        self.a = 0
        self.pattern_name = re.compile("[A-Z]+[a-z]+")

    def read_csv(self, filename):
        df = pd.read_csv(filename, sep='delimiter', header=None)
        return df

    def set_column_name(self, df, colname):
        df.columns = [colname]
        return df

    def extract_categories(self, df):
        unique = df.data.unique()
        categories = []
        for i in range(len(unique)):
            if ("►" in unique[i]):
                categories.append(unique[i])
            else:
                pass
        return categories

    def _label_race(self, row, categories):
        if row['data'] in categories:
            self.a +=1
            if self.a == 0:
                pass
            elif self.a >1:
                self.a = 0
            else:
                pass
            return self.a
        else:
            return -1

    def create_tag_column(self, df, categories):
        df['tag'] = df.apply (lambda row: self._label_race(row, categories), axis=1)
        return df

    def create_dataframe_set(self, df):
        a = pd.factorize(df['tag'].isin([0,1]).iloc[::-1].cumsum().sort_index())[0]
        dfs = dict(tuple(df.groupby(a)))
        return dfs

    def create_dfs_by_cat(self, dfs, categories):
        new_cats = []
        for i in range(len(dfs)):
            key = dfs[i]["data"].tolist()[-1]
            if key in categories:
                n_c = dict()
                n_c[key] = dfs[i+1]
            else:
                pass
            new_cats.append(n_c)

        return new_cats

    def create_csv(self, new_cats):
        for i, d in enumerate(new_cats):
            for k, v in d.items():
                n_df = pd.DataFrame(v)
                
                if ("/" in k):
                    new_name = k.replace("/","_")
                else:
                    new_name = k

                export_csv = n_df.to_csv (f"data/{new_name}.csv", index = None, header=True) #Don't forget to add '.csv' at the end of the path 
    
    def create_raw_dfs(self, new_cats):
        raw_df=[]
        for i, d in enumerate(new_cats):
            for k, v in d.items():
                n_df = pd.DataFrame(v)
                raw_df.append(n_df)
    

    def remove_accent(self, text):
        
        output = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
        decoded = output.decode('utf-8')
        return decoded
    
    def get_name(self, filename):
        result = self.pattern_name.findall(filename)
        name = result[-1]
        return result
    
    def __call__(self, fileName):
        """
        Return a list of dfs
        """
        output_dfs_list = None
        
        # Load dataframe
        df = self.read_csv(fileName)
        
        # Create column name by using the filename
        
        # Change column name
        column_name = "data"
        df = self.set_column_name(df, column_name)
        
        # Exctrac all the catogories as list
        categories = self.extract_categories(df)
        
        if len(categories) > 0:
            
            # Create Tag column with if -1 is general, 0 if is category,
            # 1 next category and 0 again for next cat.
            df = self.create_tag_column(df, categories)

            # Create dict of dictionaries by categorie where Key is the index 0 or 1 for categories
            dfs = self.create_dataframe_set(df)

            # Replace the keys (0 or 1) by his real value in categoire list.
            new_cats = self.create_dfs_by_cat(dfs, categories)
        
            return new_cats
        else :
            return pd.DataFrame(columns= df.columns.tolist())
        
df_raw_loader = LoadRawDF()



## Functions for clean the data and save as csv

In [3]:
# Functions for clean the data

class CleanerDF:
    
    def __init__(self):

        self.pattern_name = re.compile("[A-Z]+[a-z]+")
        self.pattern_unused_rows = "\[+[0-9]+-[0-9]+\]"

    def _remove_accent(self, text):
        output = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
        decoded = output.decode('utf-8')
        return decoded

    def read_csv(self, file):
        df = pd.read_csv(file)
        return df

    def unused_df_rows(self, dff):
        aux_df = dff[dff['data'].astype(str).str.match(self.pattern_unused_rows)]
        return aux_df

    def get_index_to_filter(self, u_df, dff):
        """
        Return list of indices to remove
        """
        indexNames = u_df.index
        indexUnused = dff[-1:].index

        indexes = indexNames.tolist() + indexUnused.tolist()
        return indexes

    def _drop_rows(self, dff, indexes):
        # Delete these row indexes from dataFrame
        dff.drop(indexes, inplace=True)
        return dff

    def _drop_column(self, dff):
        dff = dff.drop(columns=['tag'])
        return dff

    def _rename_column(self, dff, name):
        dff = dff.rename(columns={"data": name})
        return dff

    def save_clean_df(self, dff, filename):
        dff.to_csv(f'clean_data/{filename}.csv', index = None, header=True)

    def get_name(self, filename):
        """
        Return Column name as string
        """
        result = self.pattern_name.findall(filename)[-1]
        return result


    def clean_df(self, df, fileName):

        f = self._remove_accent(fileName)
        
        column_name = self.get_name(f)

        aux_df = self.unused_df_rows(df)
        
        indexes = self.get_index_to_filter(aux_df, df)
        n_df = self._drop_rows(df, indexes)
        df = self._drop_column(n_df)
        
        
        df = self._rename_column(df, column_name)
        
        return df


cleaner = CleanerDF()



In [4]:
# Obtain all the clean dataframes as a list

def create_clean_df_as_list(raw_dfs_list):
    dfs = []
    for i, df in enumerate(raw_dfs_list):
        for k, v in df.items():
            df = cleaner.clean_df(v, k)
            dfs.append(df)
    return dfs

#dfs = create_clean_df_as_list(raw_dfs_list)

## Create unique dataframe

In [5]:
def create_joint_dataframe(dfs, date):
    """
    Return the concatenate dataframe for all 
    """
    join_df = []

    for df in dfs:
        column_name = df.columns.tolist()[0]
        data = df[column_name].tolist()
        if (len(data) > 0 ):
            df_n = pd.DataFrame(data[1:], columns=["document"])
            df_n["date"] = date
            df_n["label_1"]= column_name
            df_n["label_2"] = data[0]
            join_df.append(df_n)
        else:
            pass

    
    new_df = pd.concat(join_df)
    new_df = new_df.reset_index()
    
    return new_df

In [7]:
def get_files_in_dir(src):

    filenames = []
    for filename in Path(src).glob("**/*.csv"):
        filenames.append(str(filename))
    return filenames

filenames = get_files_in_dir("data")

filenames[:4]


['data/2019-01-01/2019-01-01.csv',
 'data/2017-07-13/2017-07-13.csv',
 'data/2017-07-21/2017-07-21.csv']

In [8]:
df_raw_loader = LoadRawDF()
cleaner = CleanerDF()

def create_huge_df(folder):
    filenames = get_files_in_dir(folder)
    all_df=[]
    for file in filenames:
        print(file)
        date = str(file).split("/")[-1].split(".")[0]       
        raw_dfs_list = df_raw_loader(file)
        try:
            dfs = create_clean_df_as_list(raw_dfs_list)
            new_df = create_joint_dataframe(dfs,  date)
            all_df.append(new_df)
        except:
            print("THIS ", file)

        
    return all_df


all_df_as_list = create_huge_df("../data_raw")



../data_raw/2018-10-02/2018-10-02.csv


  # Remove the CWD from sys.path while we load stuff.


../data_raw/2018-04-18/2018-04-18.csv
../data_raw/2018-05-11/2018-05-11.csv
../data_raw/2018-06-02/2018-06-02.csv
../data_raw/2017-12-13/2017-12-13.csv
../data_raw/2017-07-04/2017-07-04.csv
THIS  ../data_raw/2017-07-04/2017-07-04.csv
../data_raw/2018-05-04/2018-05-04.csv
../data_raw/2018-02-04/2018-02-04.csv
../data_raw/2019-02-28/2019-02-28.csv
../data_raw/2019-03-11/2019-03-11.csv
../data_raw/2017-09-16/2017-09-16.csv
../data_raw/2017-08-30/2017-08-30.csv
../data_raw/2017-11-30/2017-11-30.csv
../data_raw/2018-10-06/2018-10-06.csv
../data_raw/2019-01-18/2019-01-18.csv
../data_raw/2018-02-23/2018-02-23.csv
../data_raw/2018-12-18/2018-12-18.csv
../data_raw/2018-10-15/2018-10-15.csv
../data_raw/2018-11-11/2018-11-11.csv
../data_raw/2017-07-27/2017-07-27.csv
../data_raw/2018-04-16/2018-04-16.csv
../data_raw/2019-07-07/2019-07-07.csv
../data_raw/2019-02-14/2019-02-14.csv
../data_raw/2018-12-04/2018-12-04.csv
../data_raw/2018-03-05/2018-03-05.csv
../data_raw/2017-11-11/2017-11-11.csv
../dat

../data_raw/2017-07-10/2017-07-10.csv
THIS  ../data_raw/2017-07-10/2017-07-10.csv
../data_raw/2019-07-29/2019-07-29.csv
THIS  ../data_raw/2019-07-29/2019-07-29.csv
../data_raw/2018-05-24/2018-05-24.csv
../data_raw/2018-10-01/2018-10-01.csv
../data_raw/2018-04-31/2018-04-31.csv
../data_raw/2019-01-19/2019-01-19.csv
../data_raw/2019-07-05/2019-07-05.csv
../data_raw/2019-03-07/2019-03-07.csv
../data_raw/2019-02-22/2019-02-22.csv
../data_raw/2018-01-06/2018-01-06.csv
../data_raw/2019-04-14/2019-04-14.csv
THIS  ../data_raw/2019-04-14/2019-04-14.csv
../data_raw/2017-07-02/2017-07-02.csv
THIS  ../data_raw/2017-07-02/2017-07-02.csv
../data_raw/2019-05-12/2019-05-12.csv
THIS  ../data_raw/2019-05-12/2019-05-12.csv
../data_raw/2018-03-20/2018-03-20.csv
../data_raw/2017-07-11/2017-07-11.csv
THIS  ../data_raw/2017-07-11/2017-07-11.csv
../data_raw/2018-10-26/2018-10-26.csv
../data_raw/2018-10-23/2018-10-23.csv
../data_raw/2019-02-25/2019-02-25.csv
../data_raw/2018-10-27/2018-10-27.csv
../data_raw/20

../data_raw/2019-05-14/2019-05-14.csv
../data_raw/2018-12-10/2018-12-10.csv
../data_raw/2019-04-19/2019-04-19.csv
../data_raw/2019-07-21/2019-07-21.csv
THIS  ../data_raw/2019-07-21/2019-07-21.csv
../data_raw/2017-08-16/2017-08-16.csv
../data_raw/2017-07-24/2017-07-24.csv
../data_raw/2018-03-10/2018-03-10.csv
../data_raw/2018-05-31/2018-05-31.csv
../data_raw/2018-11-23/2018-11-23.csv
../data_raw/2019-01-10/2019-01-10.csv
../data_raw/2019-01-17/2019-01-17.csv
../data_raw/2019-01-21/2019-01-21.csv
../data_raw/2017-07-20/2017-07-20.csv
../data_raw/2018-03-27/2018-03-27.csv
THIS  ../data_raw/2018-03-27/2018-03-27.csv
../data_raw/2018-01-11/2018-01-11.csv
../data_raw/2019-07-18/2019-07-18.csv
../data_raw/2018-06-08/2018-06-08.csv
../data_raw/2017-09-03/2017-09-03.csv
../data_raw/2019-04-08/2019-04-08.csv
../data_raw/2018-10-04/2018-10-04.csv
../data_raw/2019-02-10/2019-02-10.csv
../data_raw/2019-04-07/2019-04-07.csv
../data_raw/2017-11-10/2017-11-10.csv
../data_raw/2017-07-17/2017-07-17.csv


../data_raw/2017-08-26/2017-08-26.csv
../data_raw/2017-09-15/2017-09-15.csv
../data_raw/2018-01-17/2018-01-17.csv
../data_raw/2019-04-13/2019-04-13.csv
../data_raw/2018-01-25/2018-01-25.csv
../data_raw/2018-09-06/2018-09-06.csv
../data_raw/2017-10-26/2017-10-26.csv
../data_raw/2019-05-11/2019-05-11.csv
THIS  ../data_raw/2019-05-11/2019-05-11.csv
../data_raw/2018-11-18/2018-11-18.csv
../data_raw/2017-11-27/2017-11-27.csv
../data_raw/2018-05-06/2018-05-06.csv
../data_raw/2019-03-30/2019-03-30.csv
../data_raw/2017-09-23/2017-09-23.csv
../data_raw/2018-09-11/2018-09-11.csv
../data_raw/2017-11-07/2017-11-07.csv
../data_raw/2018-12-06/2018-12-06.csv
../data_raw/2018-03-13/2018-03-13.csv
../data_raw/2019-01-02/2019-01-02.csv
../data_raw/2019-05-01/2019-05-01.csv
../data_raw/2018-01-04/2018-01-04.csv
../data_raw/2019-01-07/2019-01-07.csv
../data_raw/2019-07-25/2019-07-25.csv
../data_raw/2019-02-07/2019-02-07.csv
../data_raw/2017-11-03/2017-11-03.csv
../data_raw/2019-05-05/2019-05-05.csv
../dat

In [9]:
new_df = pd.concat(all_df_as_list)
new_df = new_df.reset_index()

new_df.to_csv(f'clean_data/all_data.csv', index = None, header=True)