In [None]:
from csv import DictReader
from collections import Counter
from functools import partial
from math import isnan
from pathlib import Path
import re
import sys

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
from pandas import DataFrame, read_csv, concat, isna
import seaborn as sns


# Select matplotlib's interactive "notebook" backend for figures drawn below.
%matplotlib notebook
# Set the float precision IPython uses when echoing values to 4.
%precision 4

# Directory the input CSVs are read from and the processed author table is written to.
BASEDIR = Path('../data')

In [None]:
# Report table -> df.
with (BASEDIR / 'prs.csv').open() as reports_file:
    df = read_csv(reports_file, low_memory=False)

# Signature table -> ef.
with (BASEDIR / 'prs_signatures.csv').open() as signatures_file:
    ef = read_csv(signatures_file, low_memory=False)

# Echo both shapes as the cell output.
df.shape, ef.shape

In [None]:
# Number of signature rows whose Middle_Name is missing.
ef['Middle_Name'].isna().sum()

In [None]:
# IDs whose rows are removed from both tables.  A row is dropped when ANY of
# its cells equals one of these values.  Integers and floats holding the same
# value compare equal, so one spelling per ID also matches float-typed cells.
censored_authors = [
    43660, 75254, 75426, 76744, 78939, 80050, 82903, 83760,
]

# `DataFrame.isin` performs the membership test for every cell in one
# vectorised call (replaces the deprecated, per-cell `applymap`).
idx = ef.isin(censored_authors).any(axis=1)
display(ef[idx])   # show what is about to be removed
ef = ef[~idx]

idx = df.isin(censored_authors).any(axis=1)
display(df[idx])   # show what is about to be removed
df = df[~idx]

In [None]:
report_types = df.ReportType.unique()


def _rows_of_type(frame, report_type):
    """Rows of `frame` with the given ReportType, minus columns that are entirely null."""
    matching = frame.loc[frame.ReportType == report_type]
    return matching.dropna(axis=1, how='all')


# Per report type: the report rows, the signature rows, and the names of the
# report columns that survived the all-null column drop.
documents_by_type = {}
signers_by_type = {}
labels_by_type = {}
for report_type in report_types:
    documents_by_type[report_type] = _rows_of_type(df, report_type)
    signers_by_type[report_type] = _rows_of_type(ef, report_type)
    labels_by_type[report_type] = np.array(documents_by_type[report_type].columns)

In [None]:
# Nothing is excluded here: the exclusion list is empty.  The commented-out
# entries are kept for reference:
#    ['AlertConcern', 'CauseCodes', 'DateClosed',
#     'Description', 'FailureCause', 'CountID',
#     'Disposition', 'ExecutiveSummary', 'GroundSWVersion',
#     'FlightSWVersion',
#    ]
censored = []

# Print every non-excluded column name, grouped under its report type.
for t, type_labels in labels_by_type.items():
    candidates = (label for label in type_labels if label not in censored)
    print(f'{t} ::', *candidates, sep='\n  ')

In [None]:
# find free fields

def get_label_diversity(df):
    """Return, for each column of `df`, distinct values / non-null values.

    Parameters
    ----------
    df : DataFrame
        Table whose columns are scored.

    Returns
    -------
    numpy.ndarray
        One float per column, in column order.  A column with no non-null
        values yields NaN.
    """
    # `nunique` and `count` both ignore nulls, so no explicit dropna is needed.
    distinct = df.nunique().to_numpy(dtype=float)
    filled = df.count().to_numpy(dtype=float)

    # 0 distinct / 0 filled (an all-null column) becomes NaN without a
    # RuntimeWarning.
    with np.errstate(divide='ignore', invalid='ignore'):
        diversity = distinct / filled
    return diversity

def get_label_population(df):
    """Return, for each column of `df`, the fraction of rows that are non-null.

    The result is a numpy array with one entry per column, in column order.
    """
    n_rows = len(df)
    filled_fraction = df.count() / n_rows
    return np.array(filled_fraction)

# A label is a "suspect" when its diversity lies strictly between the two
# bounds and its population is strictly above the threshold.
diverse_lower = .00
diverse_upper = 1.
population_thresh = .0

suspects_by_type = {}
for t in report_types:
    documents = documents_by_type[t]
    labels = labels_by_type[t]

    label_diversity = get_label_diversity(documents)
    label_population = get_label_population(documents)

    in_diversity_band = (diverse_lower < label_diversity) & (label_diversity < diverse_upper)
    populated_enough = label_population > population_thresh

    suspects_by_type[t] = labels[in_diversity_band & populated_enough]

In [None]:
# Show which distinct column dtypes occur in each report type's table.
for t, typed_documents in documents_by_type.items():
    print(typed_documents.dtypes.unique())

In [None]:
# Columns that are definitely not author identifiers.  The cells below drop
# them (with errors='ignore', so names absent from a table are skipped) before
# displaying what is left.
censored = [
    'CorrectiveAction', 'CorrectiveActionHrs', 'DateClosed',
    'Description', 'Effectivity', 'FailureEffectRating',
    'FailureCause', 'CauseCodes', 'AlertConcern', 
    'Status', 'FlightSWVersion', 'LastProcessed',
    'HardwareSafety', 'Mail_Stop', 'MainItemAffected',
    'LessonsLearned', 'MissionCriticalFailure', 'MissionPhaseAffected',
    'SpecificEnvironment', 'SafetyReviewStatus', 'SuspectedProblemArea',
    'Telephone_Number', 'Title', 'VerificationAnalysis', 
    'ReportingLocation', 'ReportType', 'REV',
    'Project_Name', 'TestVerification', 'ProjectPhase',
    'WorkstationName', 'ExecutiveSummary', 'GroundSWVersion',
    'OriginationDate', 'SuspectedCause', 'Issues',
    'Reproducible', 'CogEClosurePlan', 'assignElement',
    'ProblemFailureDate', 'Phase', 'PersonnelSafety',
    'ProblemType', 'Procedure', 'ProblemFailureNotedDuring',
    'Project_Code', 'Disposition', 'OperatingSystemVersion',
    'Paragraph', 'OperatingSystem', 'AnalysisImpacts',
    'SystemContractor', 'CmdFileError', 'ResidualRisk',
    'SubsystemName', 'Rationale', 'ProgramName',
    'anpro', 'Priority', 'MissionActivity',
    'Location', 'MissionPhase', 'InitialCritValue',
    'InitialCrit', 'ISACauseCodes2016', 'TestResultsVerification',
    'VerificationActHours', 'FlightProjectConcurrence', 'MultipleTeams',
    'LessonsLearnedCandidate', 'DateOfIncident', 'CritRating',
    'CmdFileCategory', 'CmdFileCorrectiveAction', 'CmdFileErrorCauses',
    'CmdFileUplinkProcessLocations', 'CmfFileErrorDescription', 'CmfFileRootCause',
    'ISACauseCodes', 'DateRequiredBy', 'HWSWItem',
    'CommandProcessRelated', 'CmfFileCorrectiveAction', 'CmfFileProximateCause',
    'CmfFileContributingCause', 'AdministrativeComment', 'Project_ID'
]
# For each report type (last one first) show the first 50 rows of the
# object-dtype columns that are not in `censored`.
for t in reversed(list(documents_by_type)):
    object_columns = documents_by_type[t].select_dtypes(include=['O'])
    uncensored = object_columns.drop(columns=censored, errors='ignore')
    display(uncensored.head(50))


In [None]:
# Look at the non-null `fullname` values among the uncensored object-dtype
# columns of the 'ISA' reports.
t = 'ISA'
isa_object_columns = documents_by_type[t].select_dtypes(include=['O'])
tump = isa_object_columns.drop(columns=censored, errors='ignore')
tump.fullname.dropna()

In [None]:
# One remaining column name per line.
print('\n'.join(map(str, tump.columns)))

In [None]:
# dtypes treated as numeric when selecting columns below.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Extend the exclusion list with further column names before the numeric view.
censored = [
    *censored,
    'CognizantSection', 'CountID', 'EstimatedHrsToComplete',
    'FailureCauseValue', 'FailureEffectRatingValue', 'TestActHours',
    'VerificationAnalysisHours', 'CritRatingValue', 'AnalysisImpactsHours',
]

# Preview the remaining numeric columns of every report type.
for t in documents_by_type:
    numeric_columns = documents_by_type[t].select_dtypes(include=numerics)
    display(numeric_columns.drop(columns=censored, errors='ignore').head())



In [None]:
# Preview, per report type, the rows that have no nulls once the censored
# columns are removed.
for t in documents_by_type:
    remaining = documents_by_type[t].drop(columns=censored, errors='ignore')
    display(remaining.dropna().head())


In [None]:
# NOTE(review): `t` is not assigned in this cell; it carries whatever value an
# earlier cell left in it, so the table shown depends on execution order.
signers_by_type[t].head(n=5)

In [None]:
authors_by_type = dict()

# Report columns used here: the anomaly identifier plus the three user-ID columns.
Key = ['Anomaly_ID', 'ResponsibleEditorUserId', 'OriginatorUserID', 'AssigneeUserID']

# Role label given to each user-ID column of the report table.
ROLE_BY_COLUMN = {
    'ResponsibleEditorUserId': 'RESPONSIBLE EDITOR',
    'AssigneeUserID': 'ASSIGNEE',
    'OriginatorUserID': 'ORIGINATOR',
}

# Roles left out of the author table (as informed by Bruce).
EXCLUDED_ROLES = ['DEVELOPER', 'EDITOR', 'ORIGINATOR', 'CONDITIONAL APPROVER']

# Columns taken from the signature table.
SIGNER_COLUMNS = ['Anomaly_ID', 'Users_ID', 'UserRoleName']


def _collect_authors(documents, signers):
    """Build the (anomaly, user, role) table for one report type.

    Parameters
    ----------
    documents : DataFrame
        Report rows; must contain the columns listed in `Key`.
    signers : DataFrame
        Signature rows; when non-empty it must contain `SIGNER_COLUMNS`.

    Returns
    -------
    DataFrame
        One row per (Anomaly_ID, Users_ID, UserRoleName) assignment, sorted by
        Users_ID then Anomaly_ID, without the excluded roles, plus the string
        columns `_Anomaly_ID` ('A' + id) and `_Users_ID` ('U' + id).
    """
    # Wide -> long: one row per (anomaly, user-ID column); rows with a missing
    # value are dropped.
    from_documents = documents[Key].melt(
        ['Anomaly_ID'],
        var_name='UserRoleName',
        value_name='Users_ID',
    ).dropna()
    from_documents = from_documents.assign(
        Users_ID=from_documents['Users_ID'].astype(int),
        # Relabel only the role column rather than every column of the frame.
        UserRoleName=from_documents['UserRoleName'].replace(ROLE_BY_COLUMN),
    )

    frames = [from_documents]
    # A report type with no signature rows contributes nothing; skipping it
    # avoids selecting columns from an empty table.
    if not signers.empty:
        frames.append(signers[SIGNER_COLUMNS])

    combined = concat(frames, sort=True).sort_values(['Users_ID', 'Anomaly_ID'])

    # .copy() gives an independent frame, so adding columns below neither
    # triggers SettingWithCopyWarning nor touches `combined`.
    kept = combined[~combined['UserRoleName'].isin(EXCLUDED_ROLES)].copy()

    # Prefixed string IDs keep anomaly and user identifiers distinct.
    kept['_Anomaly_ID'] = 'A' + kept['Anomaly_ID'].map(str)
    kept['_Users_ID'] = 'U' + kept['Users_ID'].map(str)
    return kept


for t in documents_by_type:
    authors_by_type[t] = _collect_authors(documents_by_type[t], signers_by_type[t])

    # XXX PMR: Apparently there are a lot of duplicate assignments. This now
    #  ignores the 'UserRoleName'
    #authors_by_type[t].drop_duplicates(['Anomaly_ID', 'Users_ID'], inplace=True)

    print(
        t,
        f'AUTHORS   - {authors_by_type[t].shape[0]}',
        f'ANOMALIES - {authors_by_type[t].Anomaly_ID.unique().shape[0]}',
        sep='\n  :'
    )
    print(
        'Author Types',
        *authors_by_type[t].UserRoleName.unique(),
        sep='\n  :'
    )
    display(authors_by_type[t].head())


In [None]:
# Write one '<report type>_experts.csv' per report type into the working directory.
for t, type_authors in authors_by_type.items():
    export = type_authors[['Anomaly_ID', 'UserRoleName', 'Users_ID']]
    export.to_csv(f'{t}_experts.csv', index=False)

In [None]:
# Authors with fewer rows than this in a report type's author table are dropped.
MINIMUM_DOCUMENTS = 0

for document_type in authors_by_type:
    authors = authors_by_type[document_type]

    # Rows per author; each row of the author table is one assignment.
    rows_per_author = Counter(authors['Users_ID'])
    sufficient = {
        author
        for author, count in rows_per_author.items()
        if count >= MINIMUM_DOCUMENTS
    }
    authors = authors[authors['Users_ID'].isin(sufficient)]
    authors_by_type[document_type] = authors

    # Report on the type being processed in THIS iteration (`document_type`),
    # not on a loop variable left over from another cell.
    print(
        document_type,
        f'AUTHORS   - {authors.shape[0]}',
        f'ANOMALIES - {authors.Anomaly_ID.unique().shape[0]}',
        sep='\n  :'
    )
    print(
        'Author Types',
        *authors.UserRoleName.unique(),
        sep='\n  :'
    )
    display(authors.head())
    

In [None]:
# Tag every author table with its report type.  `assign` builds a new frame
# instead of inserting a column into a filtered frame in place, which would
# emit SettingWithCopyWarning.
for t in authors_by_type:
    authors_by_type[t] = authors_by_type[t].assign(ReportType=t)

# Stack all report types into one table and write it next to the input data.
concat(authors_by_type.values()).to_csv(BASEDIR / 'processed_authors.csv', index=False)

In [None]:
# Number of distinct anomalies per report type.
for t, type_authors in authors_by_type.items():
    print(t, type_authors.Anomaly_ID.unique().size)

In [None]:
import networkx as nx
# `product` is no longer used in this cell; it stays imported so that any
# other code relying on this import keeps working.
from itertools import combinations, product

# Report type -> graph whose nodes are users; two users are linked when both
# appear on at least one common anomaly.
graph_by_type = {}

for t in authors_by_type:
    # Start from the user <-> anomaly graph.  `from_pandas_edgelist` returns
    # an undirected `nx.Graph` by default, so no conversion is needed.
    G = nx.from_pandas_edgelist(
        authors_by_type[t],
        source='_Users_ID',
        target='_Anomaly_ID',
        edge_attr='UserRoleName',
    )

    #display(nx.draw(G, with_labels=False, node_size=1))

    anomalies = authors_by_type[t]['_Anomaly_ID'].unique()

    for a in anomalies:
        users = list(G.neighbors(a))
        # Link each unordered pair of distinct users exactly once.
        # `product(users, users)` would also add a self-loop on every user.
        G.add_edges_from(combinations(users, 2))

    # Dropping the anomaly nodes leaves only users and user-user edges.
    G.remove_nodes_from(anomalies)

    graph_by_type[t] = G


In [None]:
# Draw one figure per report type and save it as '<report type>_authorship_graph.png'.
for t in graph_by_type:
    fig = plt.figure()
    # `title` is not an `nx.draw` keyword (recent networkx rejects unknown
    # keywords with ValueError); the title is set through matplotlib instead.
    nx.draw(graph_by_type[t], node_size=1)
    plt.title(t)
    fig.savefig(f'{t}_authorship_graph.png')
