In [None]:
from csv import DictReader
from collections import Counter
from functools import partial
from itertools import chain
from math import isnan
from pathlib import Path
import re
import sys

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
from pandas import DataFrame, read_csv, concat
import seaborn as sns

from lang import english, clean_stems


%matplotlib notebook
%precision 4

BASEDIR = Path('../data')

In [None]:
sys.version_info

In [None]:
# Load the raw PRS export from BASEDIR / 'prs.csv' into `df`.
# low_memory=False asks pandas to infer each column's dtype from the whole file
# instead of chunk by chunk.
with open(BASEDIR / 'prs.csv') as fd:
    df = read_csv(fd, low_memory=False)
# (rows, columns) of the raw table
df.shape

In [None]:
labels = np.array(df.columns)

In [None]:
# Columns that hold user IDs.
# NOTE(review): the censoring mask below scans every column of `df`, not only
# these ones -- confirm that this is intended.
author_columns = ['AssigneeUserID', 'OriginatorUserID', 'ResponsibleEditorUserId', 'SignersUserID']
censored_authors = [43660., 75254., 75426., 76744., 78939., 80050., 82903., 83760.]
# Flag every row in which any cell equals a censored ID. DataFrame.isin performs
# the same element-wise membership test as applymap(lambda i: i in ...), but
# vectorised and without relying on the deprecated applymap API.
idx = df.isin(censored_authors).any(axis=1)
# Show the rows that are about to be removed, then remove them.
display(df[idx])
df = df[~idx]
df.head()

In [None]:
# Anomaly IDs excluded from the analysis, grouped by report type (see the
# trailing comment on each line).
censored_anomalies = [
    55793, 39583, 39582, 22103, 8789, 5011, #PFR
    54721, 36880, 16113, 13705, 13622, 5640, 4521, 4371, 4045, 3151, 2385, 2207, #DPFR
    41963, 37011, 28842, 23095, 22818, 17127, 14871, 13982 #ISA
]
# Drop every row whose Anomaly_ID is in the exclusion list.
idx = df.Anomaly_ID.isin(censored_anomalies)
df = df[~idx]

In [None]:
def get_rep_num(s):
    """Split a report title into ``(report_number, title)``.

    Titles carrying a report number have the form ``"<number> :: <title>"``.
    Only the first ``" :: "`` separates the two parts; any later occurrence
    stays in the title.

    Parameters
    ----------
    s : str
        Raw title.

    Returns
    -------
    tuple of (str, str)
        ``(report_number, title)``. ``report_number`` is ``''`` when ``s``
        contains no ``" :: "`` separator. A tuple is returned in both cases
        so callers always get the same type.
    """
    rep_num, separator, rep_title = s.partition(" :: ")
    if separator:
        return rep_num, rep_title
    return '', s


def rm_dup_pfr(pfr):
    """Remove training, duplicate and ISA-companion rows from a PFR/DPFR table.

    Parameters
    ----------
    pfr : DataFrame
        Reports with at least the ``Project_Name``, ``Title`` and
        ``Description`` columns.

    Returns
    -------
    DataFrame
        A new frame holding the remaining rows, with ``Title`` replaced by
        the title part returned by ``get_rep_num`` (report-number prefix
        removed). The input frame is not modified.
    """
    # remove dummy test data from PRS training/dev
    pfr = pfr[pfr['Project_Name'] != 'PRS Training Project']
    pfr = pfr.drop(columns=[col for col in pfr.columns
                            if 'PRS Training Project' in col])

    # separate report numbers from titles that are migrated from legacy system.
    # The (number, title) pairs are read with .str[i]; unpacking by iterating
    # the .str accessor itself is not supported by current pandas.
    title_parts = pfr['Title'].apply(get_rep_num)
    pfr = pfr.assign(rep_num=title_parts.str[0], rep_title=title_parts.str[1])

    # remove one long report number which is duplicate of an ISA report
    pfr = pfr[pfr.rep_num.str.len() < 7]

    # remove duplicates in report description
    # Reports may be generated multiple times due to system glitches in PRS
    pfr = pfr[~pfr.Description.str.lower().duplicated()]

    # use keywords in report titles to remove shadow/companion D/PFR for ISA
    title_dup_key = ['duplicate ', 'shadow d?pfr', 'companion d?pfr']
    pfr = pfr[~pfr.rep_title.str.lower()
              .str.contains('|'.join(title_dup_key), na=False)]

    # use keywords in report descriptions to remove duplicates
    # (raw strings: the patterns contain regex escapes such as \w and \d)
    desc_dup_key = ['coordinating pfr', 'duplicate of', 'administrative',
                    'identical to d?pfr', 'copied (?:directly )?from',
                    'direct copy', 'inherited from', r'isa[\w\s\-#]*\d{4,5}',
                    'copy from (?:smap )?pfr', 'created twice', 'companion d?pfr',
                    r'pfr[\w\s]*carried over from']
    # na=True: rows with a missing description count as matches and are dropped
    pfr = pfr[~pfr.Description.str.lower()
              .str.contains('|'.join(desc_dup_key), na=True)]

    # keep the cleaned title and discard the helper columns; assign() returns
    # a new frame, so no filtered slice is written to in place
    pfr = pfr.assign(Title=pfr['rep_title'])
    return pfr.drop(columns=['rep_title', 'rep_num'])



def rm_dup_isa(isa):
    """Remove dev-team, duplicate and test rows from an ISA table.

    Parameters
    ----------
    isa : DataFrame
        Reports with at least the ``OriginatorUserID``, ``Project_Name``,
        ``Title`` and ``Description`` columns.

    Returns
    -------
    DataFrame
        A new frame holding the remaining rows, with ``Title`` replaced by
        the title part returned by ``get_rep_num`` (report-number prefix
        removed). The input frame is not modified.
    """
    # remove dummy test data from PRS 2.0 dev team
    dev_team = [43660, 75254, 75426, 76744, 78939, 80050, 82903, 83760]
    isa = isa[~isa.OriginatorUserID.isin(dev_team)]
    isa = isa[~isa.Project_Name.isin(['DHahn Project', 'DHahn1'])]

    # separate report numbers from titles that are migrated from legacy system.
    # The (number, title) pairs are read with .str[i]; unpacking by iterating
    # the .str accessor itself is not supported by current pandas. assign()
    # returns a new frame, so the filtered slice above is not written in place.
    title_parts = isa['Title'].apply(get_rep_num)
    isa = isa.assign(rep_num=title_parts.str[0], rep_title=title_parts.str[1])

    # remove duplicates in report description
    # Reports may be generated multiple times due to system glitches in PRS
    # NOTE(review): duplicated() treats missing descriptions as equal, so only
    # the first row without a description survives -- confirm this is wanted.
    isa = isa[~isa.Description.str.lower().duplicated()]

    # use keywords in report titles to remove duplicates
    title_dup_key = ['pfr', 'accidental duplicate']
    isa = isa[~isa.rep_title.str.lower()
              .str.contains('|'.join(title_dup_key), na=False)]

    # remove test ISA reports: "test" in the title and a description shorter
    # than 30 characters (a missing description never satisfies the length test)
    is_test = (isa.rep_title.str.lower().str.contains('test', na=False) &
               (isa.Description.str.len() < 30))
    isa = isa[~is_test]

    # originally opened as pfr, duplicates, etc
    # na=False keeps rows whose description is missing
    isa = isa[~isa.Description.str.lower()
              .str.contains('administrative', na=False)]

    # keep the cleaned title and discard the helper columns
    isa = isa.assign(Title=isa['rep_title'])
    return isa.drop(columns=['rep_title', 'rep_num'])

In [None]:
rm_dup = {'PFR':rm_dup_pfr, 'DPFR':rm_dup_pfr, 'ISA':rm_dup_isa}

In [None]:
# Split the table by report type, drop the columns that are entirely empty for
# that type, then apply the type-specific de-duplication function from rm_dup.
# A ReportType value without an entry in rm_dup raises KeyError here.
report_types = df.ReportType.unique()
documents_by_type = \
    {t: rm_dup[t](df.loc[df.ReportType==t].dropna(axis=1, how='all'))
     for t in report_types}

In [None]:
# Write one '<type>.csv' per report type; the relative path means the files
# land in the current working directory.
for t in documents_by_type:
    documents_by_type[t].to_csv(f'{t}.csv')

In [None]:
# Column names of each per-type table, as numpy arrays so that they can be
# indexed with boolean masks.
labels_by_type = \
    {t:np.array(documents_by_type[t].columns)
     for t in report_types}

In [None]:
documents_by_type['ISA']

In [None]:
report_types

In [None]:
# find free fields

def get_label_diversity(df):
    """Return, per column, the share of non-null values that are distinct.

    The result is a numpy array in column order; each entry is the number of
    unique non-null values divided by the number of non-null values.
    """
    ratios = []
    for column in df.columns:
        values = df[column]
        ratios.append(values.nunique() / values.dropna().count())
    return np.array(ratios)

def get_label_population(df):
    """Return, per column, the fraction of rows that hold a non-null value.

    The result is a numpy array in column order.
    """
    n_rows, _n_cols = df.shape
    filled_per_column = df.count()
    return np.array(filled_per_column / n_rows)

# A column is a free-field "suspect" when its diversity (distinct / non-null
# values) exceeds diverse_thresh and its population (non-null / rows) exceeds
# population_thresh.
diverse_thresh = .4
population_thresh = .1
suspects_by_type = dict()

for t in report_types:
    # NOTE: rebinds the notebook-level `labels` assigned in an earlier cell
    labels = labels_by_type[t]
    documents = documents_by_type[t]
    rows, cols = documents.shape    

    label_diversity = get_label_diversity(documents)
    ddx = (label_diversity > diverse_thresh) #& (label_diversity < 1.0)

    label_population = get_label_population(documents)
    pdx = label_population > population_thresh

    # keep the columns that pass both thresholds
    idx = ddx & pdx

    suspects_by_type[t] = labels[idx]
    # one line per suspect column: (label, population, diversity)
    print(f'{t} ({rows}) [label, population, diversity]:', *zip(
        labels[idx],
        np.round(label_population[idx], 3),
        np.round(label_diversity[idx], 3)
    ), sep='\n  ')


In [None]:
for t in report_types:
    display(documents_by_type[t][suspects_by_type[t]].dropna().head(n=1))

In [None]:
# Suspect columns that are excluded from the target columns.
censored = \
    'Anomaly_ID', 'CountID', 'DateClosed', 'LastProcessed', \
    'OriginationDate', 'ProblemFailureDate', 'FlightSWVersion', \
    'Procedure'

targets_by_type = dict()

for t in report_types:
    suspects = suspects_by_type[t]
    # targets = suspects minus the censored column names
    targets = suspects[~np.isin(suspects, censored)]
    targets_by_type[t] = targets
    # preview: first row in which every target column is filled in
    display(documents_by_type[t][targets].dropna().head(n=1))

In [None]:
# For every report type, show the largest whitespace-separated word count found
# in each target column.
for t in documents_by_type:
    # Named explicitly instead of `_`, which IPython uses for the last cell output.
    target_docs = documents_by_type[t][targets_by_type[t]]
    display(f'-- {t} --')
    display(
        # Column-wise Series.map replaces the deprecated DataFrame.applymap;
        # non-string cells (e.g. NaN) are counted as missing.
        target_docs.apply(
            lambda col: col.map(lambda x: len(x.split())
                                if type(x) is str else float('nan'))
        ).max())


In [None]:
targets_by_type

In [None]:
# Names of the derived columns that hold the concatenated target text before
# and after regex cleaning.
DOCPRE = 'GLOMPRE'
DOCPOST = 'GLOMPOST'
import string

# Regular expressions whose matches regprocess replaces with a single space,
# in this order: first the entries of the escape_char column of
# html_escape_characters.csv, then the hand-written patterns below.
filters = list(read_csv(BASEDIR / 'html_escape_characters.csv').escape_char) + [
        r'[\s]',                              # any whitespace character
        r'=[^\s]*',                           # '=' and the non-space run after it
        # NOTE(review): the '.' before the last \d+ is unescaped and matches any
        # character -- confirm whether a literal '\.' was intended.
        r'(\d+-\d+t)?\d+:\d+:\d+(.\d+)?',     # d:d:d runs with optional 'd-dt' prefix
        r'<.+?>',                             # shortest '<...>' span
        r'\d+\.\d+\.\d+',                     # dotted number triples
        r'(\()?\d+/\d+/\d+(\))?',             # slash-separated number triples
        r'\s\d+\s',                           # stand-alone numbers
        r'\*',
        r'\d\d\d\d-\d\d-\d\d',                # dddd-dd-dd
        r'[\(\),\'\"\.:]',
        r'\s.\s',                             # any single character between whitespace
        r'\[from.+?\]',
        # raw string: same value as before, without invalid escape sequences
        r',|:|;|\(|\)|\[|\]|{|}|<|>|"|=|\?|/|@',
        r' \d+ ',
        r' - ',
        r' \| ',
    ]

In [None]:
#for f in filters:
#    print(f)

In [None]:
def regprocess(s, patterns=None):
    """Lower-case ``s``, pad it with spaces and blank out every pattern match.

    Parameters
    ----------
    s : str
        Text to clean.
    patterns : iterable of str, optional
        Regular expressions applied in order; each match is replaced with a
        single space. Defaults to the notebook-level ``filters`` list.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are not collapsed.
    """
    if patterns is None:
        patterns = filters
    # The surrounding spaces let patterns anchored on whitespace match tokens
    # at the very start or end of the text.
    s = ' ' + s.lower() + ' '

    for pattern in patterns:
        s = re.sub(pattern, ' ', s)
    return s


# Build, for every report type, the raw concatenated text (DOCPRE) and its
# regex-cleaned version (DOCPOST).
for t in documents_by_type:
    # Column-wise Series.map(str) replaces the deprecated DataFrame.applymap(str).
    # NOTE(review): missing values become the literal token 'nan' in the text.
    target_text = documents_by_type[t][targets_by_type[t]].apply(lambda col: col.map(str))
    # join the target columns of each row with single spaces
    documents_by_type[t][DOCPRE] = target_text.apply(list, axis=1).apply(' '.join)
    documents_by_type[t][DOCPOST] = documents_by_type[t][DOCPRE].apply(regprocess)

In [None]:
filters

In [None]:
re.sub('\s.\s', ' ', ' + sam + am + ')

In [None]:
documents_by_type[t].GLOMPRE.iloc[7]

In [None]:
documents_by_type[t].GLOMPOST.iloc[7]

In [None]:
from reverstem import PorterStemmer

PorterStemmer.mro()

In [None]:
from operator import attrgetter
from collections import Counter

stemmer = PorterStemmer()


def tostems(doc):
    """Stem each whitespace-separated token of ``doc`` and join the stems.

    Tokens for which ``stemmer.stem`` returns a falsy result are skipped;
    for the others the ``unit`` attribute of the result is kept.
    """
    units = []
    for token in doc.split():
        stemmed = stemmer.stem(token)
        if stemmed:
            units.append(stemmed.unit)
    return ' '.join(units)


for t in documents_by_type:
    documents_by_type[t]['GLOMSTEM'] = documents_by_type[t].GLOMPOST.apply(tostems)


def lookup(key):
    """Return the most common entry stored under ``key`` in ``stemmer._lookup``.

    Presumably the lookup maps each stem to a counter of the original words
    that produced it -- verify against the stemmer implementation.
    Raises IndexError when the entry for ``key`` is empty.
    """
    # most_common(1) avoids sorting the whole counter just to read one entry
    return stemmer._lookup[key].most_common(1)[0][0]


def unstem(doc):
    """Replace every whitespace-separated stem in ``doc`` by its ``lookup`` result.

    Falsy lookup results are dropped; the rest are joined with single spaces.
    """
    return ' '.join(word for word in map(lookup, doc.split()) if word)

for t in documents_by_type:
    documents_by_type[t]['GLOMUNSTEM'] = documents_by_type[t].GLOMSTEM.apply(unstem)



In [None]:
documents_by_type[t].GLOMUNSTEM.iloc[7]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer()
x = v.fit(concat(documents_by_type.values()).GLOMSTEM)

In [None]:
stemmer._lookup['follow']

In [None]:
# get_feature_names() was removed from recent scikit-learn releases in favour of
# get_feature_names_out(); use whichever one the installed version provides.
_get_feature_names = getattr(x, 'get_feature_names_out', None) or x.get_feature_names
# Vocabulary ordered by ascending IDF weight.
stems = np.array(list(_get_feature_names()))[x.idf_.argsort()]

In [None]:
stems.shape

In [None]:
#stemmer._lookup[stems[:100]]

In [None]:
'''stems = sorted(
    stemmer.rare_stems(10),
    key = lambda stem: sum(stemmer._lookup[stem].values()),
    reverse=True
)

'''

# Most common entry of stemmer._lookup for each stem, in the order of `stems`;
# stems whose lookup entry is empty are skipped.
words = [
    stemmer._lookup[s].most_common(1)[0][0]
    for s in stems
    if stemmer._lookup[s].most_common(1)
]
# sizes of the word list, the stem list and the stemmer lookup table
len(words), len(stems), len(stemmer._lookup)

In [None]:
#raise "pause"

In [None]:
from reverstem import stopwords as _old_stopwords


In [None]:
# Number of leading entries of `words` to inspect.
top = 160


# fraction of the word list covered by the first `top` entries
print(top / len(words))
# leading words that are not already in the old stop-word list
tump = set(words[:top]).difference(_old_stopwords)
tump

In [None]:
'yourselves' in stems

In [None]:
offset = sum(bool(stemmer._lookup[s]) for s in stems[:top])
from collections import ChainMap



In [None]:
sorted(ChainMap(*(stemmer._lookup[s] for s in tump)).keys())

In [None]:
#raise "pause"

In [None]:
'''
nltkwords = ["i", "me", "my", "myself", "we", "our", "ours", 
"ourselves", "you", "your", "yours", "yourself", "yourselves", 
"he", "him", "his", "himself", "she", "her", "hers", "herself", 
"it", "its", "itself", "they", "them", "their", "theirs", 
"themselves", "what", "which", "who", "whom", "this", "that", 
"these", "those", "am", "is", "are", "was", "were", "be", "been", 
"being", "have", "has", "had", "having", "do", "does", "did", 
"doing", "a", "an", "the", "and", "but", "if", "or", "because", 
"as", "until", "while", "of", "at", "by", "for", "with", "about", 
"against", "between", "into", "through", "during", "before", 
"after", "above", "below", "to", "from", "up", "down", "in", 
"out", "on", "off", "over", "under", "again", "further", "then", 
"once", "here", "there", "when", "where", "why", "how", "all", 
"any", "both", "each", "few", "more", "most", "other", "some", 
"such", "no", "nor", "not", "only", "own", "same", "so", "than", 
"too", "very", "s", "t", "can", "will", "just", "don", "should", 
"now"]

_stopwords = ChainMap(*(stemmer._lookup[s]
                        for s in chain(stems[:top+14], 
                                       [stemmer.stem(s).unit for s in nltkwords]
                                      )
                       ))

stopwords = sorted(_stopwords.keys())
'''
pass

In [None]:
#stopwords

In [None]:
#len(stopwords)


In [None]:
# Per report type: flag documents whose un-stemmed text kept less than the given
# fraction of the words of the raw concatenated text. The masks are only
# recorded here; nothing is dropped in this cell.
word_drop_by_type = {'ISA':.2, 'PFR':.1, 'DPFR':.3}
idx_drop_by_type = {}
for t in documents_by_type:
    data = documents_by_type[t][['GLOMUNSTEM', 'GLOMPRE']]
    # whitespace-separated word count of every cell; column-wise Series.map
    # replaces the deprecated DataFrame.applymap
    info = data.apply(
        lambda col: col.map(lambda s: len(str(s).split()))
    )
    idx = info.GLOMUNSTEM / info.GLOMPRE < word_drop_by_type[t]
    idx_drop_by_type[t] = idx
    #print(t, data[idx].shape)
    #display(data[idx])
    #documents_by_type[t] = documents_by_type[t][~idx]

In [None]:
for t in documents_by_type:
    display(t)
    display(list(documents_by_type[t][idx_drop_by_type[t]].Anomaly_ID))

In [None]:
# Drop, for every report type, the documents whose un-stemmed text has fewer
# than 10 words, printing how many rows are removed.
for t in documents_by_type:
    # whitespace-separated word count of every cell; column-wise Series.map
    # replaces the deprecated DataFrame.applymap
    info = documents_by_type[t][['GLOMUNSTEM', 'GLOMPRE']].apply(
        lambda col: col.map(lambda s: len(str(s).split()))
    )
    idx = info.GLOMUNSTEM < 10
    # vectorised count of True values instead of iterating with built-in sum
    print(idx.sum())
#    display(documents_by_type[t][idx].GLOMPRE)
    documents_by_type[t] = documents_by_type[t][~idx]

In [None]:
out = Path('./../output')

In [None]:
import pickle

# Persist the stemmer object with pickle.
# NOTE(review): the file is a pickle despite its '.pyo' extension, and the
# `out` directory must already exist for open() to succeed.
with open(out / 'stemmer.pyo', 'wb') as fd:
    pickle.dump(file=fd, obj=stemmer)

In [None]:



# Write the selected columns of every report type to <out>/norm_<type>.csv,
# without the index column.
for t in documents_by_type:
    documents_by_type[t][
        ['Anomaly_ID', 'GLOMUNSTEM', 'Project_Code', 'GLOMPRE'] #+ author_columns[:-1]
    ].to_csv(out / f'norm_{t}.csv', index=False)

In [None]:

# Round-trip check: reload the pickled stemmer and inspect its lookup table.
# pickle.load can execute arbitrary code from the file -- only load files that
# this notebook wrote itself.
# NOTE: rebinds `tump`, which an earlier cell used for a set of words.
with open(out / 'stemmer.pyo', 'rb') as fd:
    tump = pickle.load(fd)
    
tump._lookup