In [2]:
from nltk.stem import SnowballStemmer
import spacy
import string
import re
from tika import parser
import os
import re
from datetime import datetime
from bs4 import BeautifulSoup
import chardet
import textract
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter, defaultdict
from sklearn.utils.extmath import randomized_svd
import pickle
import os
import pymongo
from dotenv import load_dotenv
from datetime import datetime

# spaCy pipeline used by preprocessing() for lemmatization.
nlp = spacy.load('en_core_web_lg')

# Characters replaced with spaces during text cleaning. The literal contains
# no '#': hashtags are removed separately by a regex in the cleaning functions.
punctuation = '!"$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
# Context managers make sure the file handles are closed after loading.
with open('../app/data/stopwords.pkl', 'rb') as stopwords_file:
    custom_sw = pickle.load(stopwords_file)
# presumably custom_sw is a list (it is extended in place) -- verify the pickle.
custom_sw.extend(['want'])

with open('../app/data/formats.pkl', 'rb') as formats_file:
    formats = pickle.load(formats_file)

In [163]:
def get_attributes(filepath, folder):
    """Collect basic filesystem attributes for one entry of a folder.

    Args:
        filepath: Name of the entry relative to ``folder``.
        folder: Folder containing the entry; a trailing '/' is appended
            when missing.

    Returns:
        dict with the keys ``parent_folder``, ``absolute_path``, ``name``,
        ``size_mb``, ``created_on``, ``last_modified_on`` and ``extension``.
        ``extension`` is ``'folder'`` for directories, the text after the last
        '.' for regular files, and ``''`` when the name has no '.' or the
        entry is neither a file nor a directory.

    Raises:
        OSError: If ``os.stat`` fails (e.g. the entry does not exist).
    """
    if not folder.endswith('/'):
        folder = folder + '/'

    file = folder + filepath
    file_stats = os.stat(file)

    # st_birthtime is not available on every platform; fall back to st_ctime
    # instead of raising AttributeError where it is missing.
    created_ts = getattr(file_stats, 'st_birthtime', file_stats.st_ctime)

    attributes = {'parent_folder': folder,
                  'absolute_path': file,
                  'name': filepath,
                  'size_mb': file_stats.st_size / 1e+6,
                  'created_on': datetime.fromtimestamp(created_ts),
                  'last_modified_on': datetime.fromtimestamp(file_stats.st_mtime)}

    if os.path.isdir(file):
        ext = 'folder'
    elif os.path.isfile(file) and '.' in filepath:
        ext = filepath.rsplit('.', 1)[-1]
    else:
        # No dot in the name: there is no extension to report (previously the
        # whole file name was returned as the extension).
        ext = ''

    attributes['extension'] = ext

    return attributes


def blind_categories(format_dict, files):
    """Group file records by format category using only their extension.

    Args:
        format_dict: Mapping of category name -> container of extensions.
        files: Iterable of records, each with an ``'extension'`` key.

    Returns:
        dict mapping every category of ``format_dict`` to the records whose
        extension belongs to it, plus a ``'folders'`` entry holding the
        records whose extension is ``'folder'``.
    """
    grouped = {
        category: [record for record in files if record['extension'] in extensions]
        for category, extensions in format_dict.items()
    }
    grouped['folders'] = [
        record for record in files if record['extension'] == 'folder']
    return grouped


def extract_text(record, thresh=50000):
    """Extract and lightly clean the text content of a file record.

    Args:
        record: Mapping with at least ``'extension'`` and ``'absolute_path'``.
        thresh: Maximum number of characters to return.

    Returns:
        The cleaned text truncated to ``thresh`` characters, or ``''`` when the
        extension is not a supported text format or the parser produced no
        content.
    """
    text_ext = ['ppt', 'pptx', 'doc', 'docx', 'odt', 'pdf',
                'rtf', 'text', 'txt', 'wpd', 'ods']

    # Unsupported formats used to fall through and hit an unbound `text`.
    if record['extension'] not in text_ext:
        return ''

    text = parser.from_file(record['absolute_path'], service='text').get('content')
    # The parser can yield no content (None); there is nothing to clean then.
    if not text:
        return ''

    text = text.replace('\n', ' ')
    text = re.sub(r'[#]\w+', ' ', text)  # drop hashtag-like tokens
    text = re.sub(r'\d+', '', text)      # drop digit runs
    for p in punctuation:
        text = text.replace(p, ' ')

    # Slicing already handles texts shorter than thresh.
    return text[:thresh]


def labels(corpus, nclusters=10):
    """Cluster documents with TF-IDF features and KMeans.

    Args:
        corpus: Iterable of document strings.
        nclusters: Number of KMeans clusters.

    Returns:
        The ``labels_`` attribute of the fitted KMeans model, or the integer
        sentinel ``404`` when vectorizing or clustering fails.
    """
    params = {
        'vectorizer': {
            'analyzer': 'word',
            'stop_words': stopwords.words('english'),
            'ngram_range': (1, 1),
            'token_pattern': '[a-z]{3,}',
            'max_df': 0.1,
            'lowercase': True
        },
        'km_params': {
            'random_state': 30,
            'n_clusters': nclusters,
        }
    }

    tfidf = TfidfVectorizer(**params['vectorizer'])
    try:
        X = tfidf.fit_transform(corpus)
        km = KMeans(**params['km_params']).fit(X)
        return km.labels_
    except Exception:
        # Keep the 404 sentinel callers rely on, but no longer swallow
        # KeyboardInterrupt / SystemExit as the bare `except:` did.
        return 404


def name_cluster_labels_common(sw, topic_files, words=8):
    """Name a cluster after its most frequent non-stopword tokens.

    Args:
        sw: Container of stopwords; tokens are compared in lowercase.
        topic_files: Iterable of records, each with a ``'preproc_text'`` key.
        words: Maximum number of tokens used in the name.

    Returns:
        The most common tokens (runs of at least four ASCII letters) joined
        with '-'; an empty string when no token qualifies.
    """
    combined = ' '.join(entry['preproc_text'] for entry in topic_files)
    counts = Counter(
        token for token in re.findall('[a-zA-Z]{4,}', combined)
        if token.lower() not in sw
    )
    top_tokens = [token for token, _ in counts.most_common(words)]
    return '-'.join(top_tokens)


def get_topic_files(files):
    """Group file records by their ``'topic'`` value.

    Args:
        files: Iterable of records, each with a ``'topic'`` key.

    Returns:
        A plain dict mapping each topic to the list of its records, in the
        order the records were encountered.
    """
    grouped = {}
    for record in files:
        grouped.setdefault(record['topic'], []).append(record)
    return grouped



def preprocessing(text):
    """Clean a raw text and replace every token with its lemma.

    Newlines become spaces, hashtag-like tokens and digit runs are removed,
    every character of the module-level ``punctuation`` string becomes a
    space, and the result is run through the module-level spaCy pipeline
    ``nlp``.

    Args:
        text: Raw document text.

    Returns:
        The token lemmas joined with single spaces, with any '-PRON-'
        placeholder removed.
    """
    cleaned = text.replace('\n', ' ')
    cleaned = re.sub(r'\d+', '', re.sub(r'[#]\w+', ' ', cleaned))
    for mark in punctuation:
        cleaned = cleaned.replace(mark, ' ')

    lemmas = [token.lemma_ for token in nlp(cleaned)]
    return " ".join(lemmas).replace('-PRON-', '')


def stemming(text, all_stopwords=custom_sw):
    """Stem every whitespace-separated word of ``text`` with Snowball.

    Args:
        text: Text to stem.
        all_stopwords: Stopwords assigned to the stemmer, which is created
            with ``ignore_stopwords=True``.

    Returns:
        The stemmed words joined with single spaces.
    """
    snowball = SnowballStemmer('english', ignore_stopwords=True)
    # Override the stemmer's stopword list with the caller-supplied one.
    snowball.stopwords = all_stopwords
    return ' '.join(snowball.stem(token) for token in text.split())

# Access database

In [154]:
# NOTE(review): hard-coded absolute local path; this only works on the
# original author's machine and should be made configurable.
project_folder = os.path.expanduser(
    '/Users/faustina/METIS/BOOTCAMPWORK/Project5/controlledchaos/data/')  # the folder of your project
load_dotenv(os.path.join(project_folder, '.env'))

MONGODBNAME = os.getenv("MONGODBNAME")
# Fail early with a clear message instead of an obscure error when the
# variable is missing and `client[None]` is evaluated.
if not MONGODBNAME:
    raise RuntimeError(
        'MONGODBNAME is not set; define it in the environment or in the .env file')

client = pymongo.MongoClient()
db = client[MONGODBNAME]
curr_files_db = db['curr_files']

In [155]:
# Folder whose entries are analysed in this notebook.
folder = '../data/test_case1/'
# os.listdir returns entries in arbitrary order; sort them so the records are
# inserted (and later processed) in the same order on every run.
test_case = sorted(os.listdir(folder))

# Load files

In [156]:
# Start from an empty collection so re-running the cell does not duplicate records.
curr_files_db.delete_many({})
file_records = [get_attributes(file, folder) for file in test_case]
# insert_many rejects an empty document list, so skip it for an empty folder.
if file_records:
    curr_files_db.insert_many(file_records)
cursor = curr_files_db.find({})
entries = list(cursor)

# Categorization

In [157]:
files_w_attributes = entries

# One database query per format category, matching records by extension.
blind_cats = {
    category: list(curr_files_db.find({"extension": {"$in": extensions}}))
    for category, extensions in formats.items()
}
# Order the categories from most to fewest matching files.
blind_cats = dict(sorted(blind_cats.items(), key=lambda pair: len(pair[1]), reverse=True))

In [158]:
def blind_categories(format_dict, collection):
    """Group the documents of a collection by format category.

    Args:
        format_dict: Mapping of category name -> list of extensions.
        collection: Object exposing ``find(query)``; it is queried once per
            category with ``{"extension": {"$in": extensions}}``.

    Returns:
        dict mapping each category to its matching documents, with the
        ``'_id'`` and ``'text'`` fields removed, ordered from the category
        with the most documents to the one with the fewest.
    """
    blind_classifier = {}
    for category, extensions in format_dict.items():
        docs = list(collection.find({"extension": {"$in": extensions}}))
        for doc in docs:
            # Remove both fields unconditionally. The previous
            # `pop('_id') and pop('text')` short-circuited and kept 'text'
            # whenever '_id' was missing or falsy.
            doc.pop('_id', None)
            doc.pop('text', None)
        blind_classifier[category] = docs

    # Stable sort: categories with equally many documents keep their order.
    return dict(sorted(blind_classifier.items(),
                       key=lambda item: len(item[1]), reverse=True))

# Text

1. Extraction
2. Preprocessing

    1. Remove punctuation marks
    2. Replace newline and tab characters with common spaces
    3. Lemmatize

3. Stemming (Snowball)
    1. Preserve stopwords
    2. Stem non-stopword words
    
4. Build corpus from the stemmed text (stored in the `lemma_text` field)
5. Convert corpus to a matrix of TF-IDF features
    1. Unigrams and bigrams
    2. Lowercase words with at least 3 letters
    3. Terms present in at least 2 documents
    4. Terms present in at most 10 documents

In [159]:
# Extensions treated as text documents in the database queries below.
# The same list is also defined locally inside extract_text().
text_ext = ['ppt', 'pptx', 'doc', 'docx', 'odt', 'pdf',
                'rtf', 'text', 'txt', 'wpd', 'ods']

In [165]:
# Extract the text of every record with a text-like extension and store it in
# the record's "text" field.
for file in curr_files_db.find({'extension': {"$in": text_ext}}):
    file_text = extract_text(file)
    # Records with no extracted text are left without a "text" field.
    if file_text:
        curr_files_db.update_one({'_id': file['_id']}, {"$set": {"text": file_text}})

In [162]:
file

{'_id': ObjectId('5f5fa71af48baabc28c6c7ba'),
 'parent_folder': '../data/test_case1/',
 'absolute_path': '../data/test_case1/000464.text',
 'name': '000464.text',
 'size_mb': 0.094528,
 'created_on': datetime.datetime(2011, 2, 8, 15, 38, 16),
 'last_modified_on': datetime.datetime(2011, 2, 8, 15, 38, 16),
 'extension': 'text',
 'text': None}

In [166]:
# Store the lemmatized version of every extracted text as "preproc_text".
for file in curr_files_db.find({'extension': {"$in": text_ext}}):
    raw_text = file.get('text')
    # Skip records without extracted text. The update filter alone did not
    # protect against this, because preprocessing(file['text']) was evaluated
    # before the filter was applied (KeyError / crash on None).
    if raw_text is None:
        continue
    curr_files_db.update_one({'_id': file['_id']}, {"$set": {"preproc_text": preprocessing(raw_text)}})

In [167]:
# Store the stemmed version of every preprocessed text as "lemma_text".
for file in curr_files_db.find({'extension': {"$in": text_ext}}):
    preproc_text = file.get('preproc_text')
    # Skip records that were never preprocessed. file['preproc_text'] used to
    # be evaluated before the update filter could exclude them (KeyError).
    if preproc_text is None:
        continue
    curr_files_db.update_one({'_id': file['_id']}, {"$set": {"lemma_text": stemming(preproc_text)}})

In [168]:
# Corpus of stemmed texts; records without a "lemma_text" field are skipped
# instead of raising KeyError.
corpus = [file['lemma_text']
          for file in curr_files_db.find({'extension': {"$in": text_ext}})
          if 'lemma_text' in file]

In [189]:
# TF-IDF settings, kept in a dict so they can be inspected or reused later.
params = {
    'vectorizer': dict(
        analyzer='word',
        stop_words=stopwords.words('english'),
        ngram_range=(1, 2),
        token_pattern='[a-z]{3,}',
        min_df=2,
        max_df=10,
        lowercase=True,
    ),
}

tfidf = TfidfVectorizer(**params['vectorizer'])

In [190]:
# Fit the vectorizer on the stemmed corpus and build the TF-IDF matrix.
X_tfidf = tfidf.fit_transform(corpus)

In [199]:
def labels(corpus, vectorizer=tfidf, nclusters=10, random_state=30):
    """Cluster documents with KMeans on the features of ``vectorizer``.

    Args:
        corpus: Iterable of document strings.
        vectorizer: Vectorizer exposing ``fit_transform``; it is re-fitted on
            ``corpus`` (so a shared instance is modified by this call).
        nclusters: Number of KMeans clusters.
        random_state: Seed passed to KMeans so repeated runs give the same
            labels.

    Returns:
        The ``labels_`` attribute of the fitted KMeans model, or the integer
        sentinel ``404`` when vectorizing or clustering fails.
    """
    try:
        # Use the vectorizer that was passed in; the argument used to be
        # ignored in favour of the global `tfidf`.
        X = vectorizer.fit_transform(corpus)
        km = KMeans(n_clusters=nclusters, random_state=random_state).fit(X)
        return km.labels_
    except Exception:
        # Keep the 404 sentinel, but let KeyboardInterrupt / SystemExit through.
        return 404

# SVD

1. Calculate matrix decomposition to get sigma values
2. Calculate minimum optimal k to produce clusters

In [191]:
# Number of singular values requested from the decomposition.
k = 100
# The sparse TF-IDF matrix is passed directly (no dense copy is needed), and
# a fixed seed makes the singular values reproducible between runs.
U, Sigma, VT = randomized_svd(X_tfidf,
                              n_components=k,
                              n_iter=5,
                              random_state=30)

In [192]:
# Choose opt_k as the first index at which three consecutive singular values
# share the same first four characters of their string representation.
# NOTE(review): this is a string-prefix comparison, not a numeric tolerance;
# confirm it behaves as intended for values printed in a different format.
# opt_k stays 0 when no such run of three values is found.
opt_k = 0

for idx, sig in enumerate(Sigma):
    # Only indices that still have two successors can be compared.
    if (idx + 2) < len(Sigma):
        #print(sig, Sigma[idx + 1])
        if (str(sig)[:4] == str(Sigma[idx + 1])[:4] == str(Sigma[idx + 2])[:4]):
            print(sig, Sigma[idx + 1])
            print(idx)
            opt_k = idx
            break

1.0388972767079476 1.0352513808115358
15


# Clustering

In [193]:
# Cluster every record that has extracted text, using the raw text.
entries = list(curr_files_db.find({"text": {"$exists": True}}))
raw_docs = [file['text'] for file in entries]
doc_labels = labels(raw_docs, nclusters=opt_k)
# labels() reports failure with the integer sentinel 404. Stop here with a
# clear message instead of failing later when the labels are indexed.
if isinstance(doc_labels, int):
    raise RuntimeError(
        'Clustering failed for {} documents with nclusters={}'.format(len(raw_docs), opt_k))

# Update database with labels

In [194]:
# Write each record's cluster label back to the database as 'topic<N>'.
for idx, entry in enumerate(entries):
    topic_label = 'topic{}'.format(doc_labels[idx])
    record_filter = {'_id': entry['_id']}
    curr_files_db.update_one(record_filter, {"$set": {"label": topic_label}})

# Aggregation

In [195]:
# Map a human-readable topic name to the records labelled with that topic.
topic_files = {}

for i in range(opt_k):
    # The query previously listed the "label" key twice; in a dict literal the
    # second value replaces the first, so only the equality test was ever used.
    topic_entries = list(curr_files_db.find({"label": 'topic{}'.format(i)}))
    # An unused label must not stop the loop: later topics may still have files.
    if not topic_entries:
        continue
    topic_name = name_cluster_labels_common(custom_sw, topic_entries, words=5)
    # Two clusters can produce the same name; merge them instead of letting
    # the later one overwrite the earlier one.
    topic_files.setdefault(topic_name, []).extend(topic_entries)

In [196]:
# Remove database ids and bulky text fields before the result is displayed
# and saved. Plain loops replace the nested list comprehensions that were used
# only for side effects, and the loop no longer rebinds the global `k` that
# holds the number of SVD components.
for topic_docs in topic_files.values():
    for topic_doc in topic_docs:
        for file_key in ('_id', 'text', 'preproc_text', 'lemma_text'):
            topic_doc.pop(file_key, None)

In [197]:
topic_files

{'DIRS-system-event-sequence-section': [{'parent_folder': '../data/test_case1/',
   'absolute_path': '../data/test_case1/000487.text',
   'name': '000487.text',
   'size_mb': 0.030667,
   'created_on': datetime.datetime(2011, 2, 8, 15, 38, 18),
   'last_modified_on': datetime.datetime(2011, 2, 8, 15, 38, 18),
   'extension': 'text',
   'label': 'topic0'},
  {'parent_folder': '../data/test_case1/',
   'absolute_path': '../data/test_case1/001751.text',
   'name': '001751.text',
   'size_mb': 0.061148,
   'created_on': datetime.datetime(2009, 3, 5, 16, 29, 34),
   'last_modified_on': datetime.datetime(2009, 3, 5, 16, 29, 34),
   'extension': 'text',
   'label': 'topic0'},
  {'parent_folder': '../data/test_case1/',
   'absolute_path': '../data/test_case1/002087.text',
   'name': '002087.text',
   'size_mb': 0.230421,
   'created_on': datetime.datetime(2009, 3, 5, 15, 59, 6),
   'last_modified_on': datetime.datetime(2009, 3, 5, 15, 59, 6),
   'extension': 'text',
   'label': 'topic0'},
  {'

In [198]:
# Persist the grouped files. The context manager guarantees the file is
# flushed and closed once the dump completes.
with open('../app/data/test1.pkl', 'wb') as output_file:
    pickle.dump(topic_files, output_file)