In [1]:
import os, stat
import re
from datetime import datetime
import chardet
import textract
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
data_path = '../data/t5/'
data_folder = os.listdir(data_path)

In [3]:
files = {}

for ext in {re.search('(\.\w+)', file).group(0) for file in data_folder if (file != '.DS_Store')}:
    filenames = [f for f in data_folder if f.endswith(ext)]
    files[ext] = {
        'count': len(filenames),
        'filenames': filenames
    }

In [4]:
[(key, files[key]['count']) for key in files.keys()]

[('.text', 711),
 ('.xls', 250),
 ('.html', 1093),
 ('.jpg', 362),
 ('.gif', 67),
 ('.doc', 533),
 ('.ppt', 368),
 ('.pdf', 1073)]

In [5]:
images = {key: files[key] for key in files.keys() if (key in ('.jpg', '.gif'))}

text = {key: files[key] for key in files.keys() if (key not in ('.jpg', '.gif', '.ppt'))}

# Text Extraction — Method Assignment

*Images will be ignored on this first iteration.*

## Text files

| Extension | textract | Python | BeautifulSoup |
|:-:|:-:|:-:|:-:|
|.xls| x | | |
|.pdf| x | | |
|.ppt| |||
|.doc| x | | |
|.html| | | x |
|.text| | x | |

### .xls files

Extraction would be partial.

In [6]:
ext_df = []
for ext in text.keys():
    ext_df_obj = pd.DataFrame({'folder': ['../data/t5/'] * text[ext]['count'], 'extension': [ext]*text[ext]['count'], 'filename': text[ext]['filenames']})
    ext_df.append(ext_df_obj)
    
text_df = pd.concat(ext_df)

In [7]:
text_df['absolute_path'] = text_df['folder'] + text_df['filename']
text_df['size_mb'] = text_df['absolute_path'].apply(lambda row: os.stat(row).st_size / 1e+6)
text_df['created_on'] = text_df['absolute_path'].apply(lambda row: datetime.fromtimestamp(os.stat(row).st_birthtime))
text_df['last_modified_on'] = text_df['absolute_path'].apply(lambda row: datetime.fromtimestamp(os.stat(row).st_mtime))
text_df['unix_permission'] = text_df['absolute_path'].apply(lambda row: stat.filemode(os.stat(row).st_mode))

#### Sorted by size

In [8]:
text_df.sort_values(by='size_mb', ascending=False)

Unnamed: 0,folder,extension,filename,absolute_path,size_mb,created_on,last_modified_on,unix_permission
11,../data/t5/,.xls,000043.xls,../data/t5/000043.xls,14.646272,2011-02-08 15:37:48,2011-02-08 15:37:48,-rw-r--r--
2,../data/t5/,.xls,000034.xls,../data/t5/000034.xls,14.257664,2011-02-08 15:37:46,2011-02-08 15:37:46,-rw-r--r--
9,../data/t5/,.doc,000113.doc,../data/t5/000113.doc,14.170112,2011-02-08 15:37:54,2011-02-08 15:37:54,-rw-r--r--
23,../data/t5/,.xls,000055.xls,../data/t5/000055.xls,14.073344,2011-02-08 15:37:50,2011-02-08 15:37:50,-rw-r--r--
218,../data/t5/,.xls,003464.xls,../data/t5/003464.xls,13.533184,2011-02-08 15:53:16,2011-02-08 15:53:16,-rw-r--r--
...,...,...,...,...,...,...,...,...
925,../data/t5/,.html,004195.html,../data/t5/004195.html,0.004110,2011-02-08 15:53:52,2011-02-08 15:53:52,-rw-r--r--
291,../data/t5/,.html,001532.html,../data/t5/001532.html,0.004109,2009-03-05 16:29:44,2009-03-05 16:29:44,-rw-r--r--
383,../data/t5/,.text,002573.text,../data/t5/002573.text,0.004049,2009-03-05 15:59:06,2009-03-05 15:59:06,-rw-r--r--
179,../data/t5/,.text,001353.text,../data/t5/001353.text,0.004047,2009-03-05 16:29:32,2009-03-05 16:29:32,-rw-r--r--


---

# HELPERS

In [9]:
def process_file(row):
    filepath = row['absolute_path']
    ext = row['extension']
    
    return filepath, ext

In [34]:
def extract_raw_text(row, thresh=50000):
    filepath, ext = process_file(row)
    enc = chardet.detect(open(filepath, 'rb').readline())['encoding']
    text = ''
    
    try:
        if ext in ('.xls', '.pdf', '.doc'):
            text = textract.process(filepath, encoding=enc).decode()
        elif ext == '.html':
            text = BeautifulSoup(''.join(open(filepath, 'r', encoding=enc).readlines()).lower(), 'html.parser').get_text(separator=' ')
        elif ext == '.text':
            text = ' '.join(open(filepath, 'r', encoding=enc).readlines())
        else:
            pass
    except:
        pass
    
    if len(text) > thresh:
        text = text[:thresh]
        
    return text

---

In [11]:
text_df['raw_text'] = text_df.apply(lambda row: extract_raw_text(row), axis=1)



## Did the algorithm failed to extract text?

In [36]:
text_df[text_df.raw_text == ''].index

Int64Index([   0,   13,   15,   18,   25,   28,   29,   35,   37,   38,
            ...
             861,  869,  876,  966,  982,  997, 1002, 1049, 1050, 1057],
           dtype='int64', length=411)

In [32]:
raw_text_col = list(text_df.columns).index('raw_text')

In [37]:
for idx in text_df[text_df.raw_text == ''].index:
    text_df.iloc[idx, raw_text_col] = extract_raw_text(text_df.iloc[idx])

In [44]:
text_df.to_csv('../data/text_df.csv', index=False)

In [41]:
input().split(' ')

hola


['hola']