In [None]:
from __future__ import print_function, division#, unicode_literals

import re
import json
from pathlib2 import Path

import pandas as pd
import numpy as np
import scipy
import nltk

import matplotlib
import matplotlib.pyplot as plt

%matplotlib notebook
matplotlib.style.use('ggplot')
%precision 4

#NASA color palette
nasa = {'red':'#fc3d21','blue':'#0b3d91','grey':'#79797c','black':'#000000'}

BASEDIR = Path('../data')

# Data

### Import PRS data (CSV export)

In [None]:
df_prs = pd.read_csv(BASEDIR / 'prs.csv', low_memory=False)

# Format and subset data

In [None]:
#Freeform text columns that potentially contain "safing"
# cols_ff_text = ['title','description','correctiveAction','verificationAnalysis','issues','relatedDocuments',
#                 'analysisImpacts','attachedFiles','testVerification','executiveSummary','procedure','rev',
#                 'cogEClosurePlan','paragraph','rationale','cmfFileErrorDescription','cmfFileContributingCause',
#                 'cmfFileProximateCause','cmfFileCorrectiveAction','cmfFileRootCause']

cols_ff_text = ['Title','Description']

### Combine the text in free-form text fields

In [None]:
#df_prs.columns = df_prs.columns.str.replace('Title', 'title').replace('Description', 'Description')
df_prs.MainItemAffected

In [None]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

def get_words(row, cols=None):
    """Concatenate a report's free-form text fields into a single string.

    Parameters
    ----------
    row : mapping or pandas.Series
        One report, indexable by column name.
    cols : iterable of str, optional
        Columns to join, in order. Defaults to the notebook-level
        ``cols_ff_text`` so ``df.apply(get_words, axis=1)`` keeps working.

    Returns
    -------
    str
        Every usable field prefixed with one space (a non-empty result
        therefore starts with a space); ``''`` when no field is usable.
    """
    if cols is None:
        cols = cols_ff_text

    #Join all text in a report into a single string
    parts = []
    for col in cols:
        try:
            #Values that cannot be added to a str (e.g. the float NaN of an
            #empty cell) raise TypeError and are skipped
            parts.append(' ' + row[col])
        except TypeError:
            continue
    return ''.join(parts)

'''
    #Replace any punctuation, special characters, etc. with whitespace     
    #words = re.sub('\\r|\\n|<br>|&quot;|[0-9]|\.|,|:|;|\(|\)|\[|\]|{|}|<|>|"|=|\*+|- +',' ',words)
    #Split words based on whitespace
    words = words.split()
    #Push words to lowercase
    #words = [word.lower() for word in words]
    #Set stemmer and use it to stem individual words
    #st = LancasterStemmer()
    #words = [st.stem(word) for word in words]
    #Remove stopwords
    #words = [word for word in words if word not in stopwords.words('english')]
    
    #Return a string of cleaned words
    return ' '.join(words)
'''

#Apply to main subset
df_prs['words'] = df_prs.apply(get_words, axis=1)

df_prs.shape

### Create dataframe to model on which includes tiered structure

In [None]:
#Load tier structure data for MSL and M2020
df_msl = pd.read_csv('../Data/PRS_MSL_Tier_Structure_160609.csv')
print(df_msl.shape)

df_m2020 = pd.read_csv('../Data/PRS_M2020_Tier_Structure_160609.csv')
print(df_m2020.shape)

#Create simplified 'item_number' feature by removing the project prefix.
#The separator class is written '[ _-]' (hyphen last) so that it matches
#exactly a space, an underscore or a hyphen. Written as '[ -_]' it is a
#character *range* from ' ' to '_', which also matches digits, upper-case
#letters and most punctuation.
df_msl['item_number'] = df_msl['Item_Number'].apply(lambda number: re.sub('MSL[ _-]', '', number))
df_m2020['item_number'] = df_m2020['Item_Number'].apply(lambda number: re.sub('M2020[ _-]', '', number))

#Create columns for tiers
def create_tier_features(df,tier_max):
    """Split ``Item_Acronym`` on '-' into per-tier columns, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Item_Acronym``.
    tier_max : int
        Number of ``tier_<i>`` columns (``tier_0`` .. ``tier_<tier_max - 1>``)
        created up front, each initialised to ``''``.

    Returns
    -------
    None
    """
    #Create empty tier columns
    for i in range(tier_max):
        df['tier_{0}'.format(i)] = ''

    #Fill in tier columns with values from "Item_Acronym".
    #.loc is label based, which matches the labels yielded by iterrows();
    #the .ix indexer used previously is deprecated and missing from current pandas.
    for index,row in df.iterrows():
        for i,tier in enumerate(re.split('-',row['Item_Acronym'])):
            df.loc[index,'tier_{0}'.format(i)] = tier
    
create_tier_features(df_msl, 8)
create_tier_features(df_m2020, 8)

#Only PFR reports are joined to the tier structure
is_pfr = df_prs['ReportType']=='PFR'

#Subset PRS for MSL data (.loc replaces the removed .ix indexer)
df_prs_msl = df_prs.loc[(df_prs['Project_Name']=='Mars Science Lab')&is_pfr].copy()

#Values to drop from df_msl; ensure that the join is 1-to-1
#Option 1 - drop rows with duplicate "Lifecycle_ID" via "item_number"
#Option 2 - guess, check with Leslie later
drop_item_numbers = ['MSL 2000_FS','MSL 2009CABL']
drop_item_acronym = ['FS-AVS-MCA-RMCA-BTE']
drop_item_life_cycle_id = [16592]

#Subset tier structure to remove values that produce duplicates when join is performed
df_msl = df_msl[(~df_msl['Item_Number'].isin(drop_item_numbers))&
                (~df_msl['Item_Acronym'].isin(drop_item_acronym))&
                (~df_msl['Lifecycle_ID'].isin(drop_item_life_cycle_id))]

#Perform join
df_prs_msl = pd.merge(left=df_prs_msl,right=df_msl,how='left',left_on='MainItemAffected',right_on='Item_name')
print(df_prs_msl.shape)

#Subset PRS for M2020 data
df_prs_m2020 = df_prs.loc[(df_prs['Project_Name']=='MARS 2020')&is_pfr].copy()

#Perform join
df_prs_m2020 = pd.merge(left=df_prs_m2020,right=df_m2020,how='left',left_on='MainItemAffected',right_on='Item_name')
print(df_prs_m2020.shape)

#Concat the dfs and give the result a fresh 0..n-1 index
df_modeling = pd.concat([df_prs_msl,df_prs_m2020]).reset_index(drop=True)

print(df_modeling.shape)

#NOTE(review): a stale re-assignment of df_modeling used to follow here. It
#filtered df_prs on 'projectName'/'reportType' (the rest of this cell uses
#'Project_Name'/'ReportType') through the removed .ix indexer, and replaced
#the merged frame built just above, dropping the joined tier-structure
#columns. The merged frame is kept instead; confirm this is the intended input.
df_modeling.shape

# Set up for LDA

### Determine how many topics to select based on perplexity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, LatentDirichletAllocation

import lda as LatentDA

In [None]:
#Get MSL ISAs that have a description.
#The index is reset so that row labels equal row positions: rows of the term
#matrix built from this frame are later addressed as tf[<row label>].
is_msl_isa = (df_prs['Project_Name']=='Mars Science Lab')&(df_prs['ReportType']=='ISA')
df_modeling = df_prs[is_msl_isa&(~df_prs['Description'].isnull())].reset_index(drop=True)

In [None]:
#Add additional words to the list of English stop words
additional_stop_words = ['test','tests','testing','tested','pfr','isa','quot','jpl','msl',
                         '08','09','10','11','2010','2011','2014']
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)

#Use tf (raw term count) features for LDA.
#The stop words are handed over as a plain list.
#NOTE(review): recent scikit-learn releases appear to validate stop_words as
#str/list/None and reject a frozenset - confirm against the installed version.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=2000,
                                stop_words=sorted(stop_words))
tf = tf_vectorizer.fit_transform(df_modeling['words'])

In [None]:
#Specify parameters: {max_iter: [topic counts to try]}
grid_lda = {5:[3] + list(np.arange(5,101,5)),
            25:sorted([3, 5, 15] + list(np.arange(10,101,10))),
            100: sorted([3, 5, 10, 15] + list(np.arange(20,101,20))),
            250:sorted([3, 5, 15] + list(np.arange(10,101,10))),
            500:sorted([3, 5, 15] + list(np.arange(10,101,10)))}

#Collect one record per fit and build the frame once at the end. This avoids
#re-copying the frame on every iteration and yields a unique 0..n-1 index.
#Iterating over sorted keys makes the run order deterministic.
perplexity_records = []
for iters in sorted(grid_lda):
    for topics in grid_lda[iters]:
        print("Fitting {0} topics over {1} iterations...".format(topics, iters))
        #NOTE(review): newer scikit-learn releases appear to call this argument
        #n_components instead of n_topics - confirm against the installed version
        lda = LatentDirichletAllocation(n_topics=topics, max_iter=iters,learning_method='online', learning_offset=50.,random_state=0)
        lda.fit(tf)
        perplexity_records.append((iters, topics, lda.perplexity(tf)))

#Create base df
df_perplexity = pd.DataFrame(perplexity_records, columns=["iters","topics","perplexity"])

In [None]:
#Plot perplexity against the number of topics, one line per iteration budget,
#and mark the minimum of every line
ax = None
for i in [5,25,100,250,500]:
    df_run = df_perplexity[df_perplexity['iters']==i]
    df_line = df_run.loc[:,['topics','perplexity']]
    if ax is None:
        #The first line creates the axes that all later plots share
        ax = df_line.plot(x='topics', figsize=(15,10), alpha=0.7)
    else:
        df_line.plot(x='topics', ax=ax, alpha=0.7)
    #Minimum is looked up within this run only, so an equal perplexity value
    #from another run cannot be picked up
    df_temp_min = df_run[df_run['perplexity']==df_run['perplexity'].min()]
    df_temp_min.plot(kind='scatter',x='topics',y='perplexity',ax=ax, s=150)
    print("Min perplixity of {0:0.02f} at {1} topics with {2} iterations".format(df_temp_min['perplexity'].values[0], df_temp_min['topics'].values[0], df_temp_min['iters'].values[0]))

In [None]:
#Save perplexity df
# df_perplexity.to_csv('msl_description_perplexity.csv', index=False)

### Fit topic model 

In [None]:
#Define attributes for the LDA fit: number of topics and maximum number of
#iterations (both are passed to LatentDirichletAllocation below)
n_topics = 30
max_iter = 500

#Fit scikit-learn's LatentDirichletAllocation on the term-count matrix `tf`.
#random_state is fixed at 0.
#NOTE(review): newer scikit-learn releases appear to call the first argument
#n_components instead of n_topics - confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=max_iter,learning_method='online', learning_offset=50.,random_state=0)
lda.fit(tf)

In [None]:
for idx, topic in enumerate(lda.components_):
    print(idx, topic.argsort()[:-5 - 1:-1])

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` this prints a ``Topic #<i>:``
    header, then the ``n_top_words`` terms with the largest weights (largest
    first, separated by spaces), then an empty line.
    """
    for topic_number, weights in enumerate(model.components_):
        print("Topic #{0}:".format(topic_number))
        #argsort is ascending, so reverse it and keep the leading entries
        best_first = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in best_first]
        print(" ".join(top_terms))
        print("")

#Print topics
print( "\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
#print_top_words(lda, tf_feature_names, n_top_words)
print_top_words(lda, tf_feature_names, 50)
print( lda.perplexity(tf))

### Set parameters

In [None]:
#Set Anomaly ID of interest (earlier candidates kept for reference)
#anomaly_id = 57049
#anomaly_id = 57069 #Carry over from - 13068
#anomaly_id = 57070 #Carry over from - 14612
anomaly_id = 54892 #Carry over from - 15426

#Rows of the modeling frame that belong to the anomaly of interest
is_target = df_modeling['anomalyID']==anomaly_id

#Get index
pfr_index = df_modeling[is_target].index[0]

#Top N reports to keep
top_n = 3

#Distance columns
dist_cols = ['euclidean_dist_to_{0}'.format(anomaly_id),
             'cosine_dist_to_{0}'.format(anomaly_id),
             'kullback_dist_to_{0}'.format(anomaly_id)]

print(anomaly_id)
print(pfr_index)
print(df_modeling.loc[is_target,'Item_Acronym'].values[0])

### Top n closest documents - no restrictions

In [None]:
#Create base condition to build off of
df_temp = df_modeling[df_modeling['projectName']=='Mars Science Lab']

cols = ['anomaly_id','tier','item_acronym'] + dist_cols

#Topic distribution of the reference report. It does not change inside the
#loop, so it is computed once. ravel() gives the 1-D vectors that the scipy
#distance functions are defined for.
ref_topics = lda.transform(tf[pfr_index]).ravel()

#Iterate through subsetted data, collecting one record per report
dist_records = []
for index,row in df_temp.iterrows():
    doc_topics = lda.transform(tf[index]).ravel()
    #Calculate distances
    euclidean_dist = scipy.spatial.distance.euclidean(ref_topics, doc_topics)
    cosine_dist = scipy.spatial.distance.cosine(ref_topics, doc_topics)
    kullback_dist = scipy.stats.entropy(ref_topics, doc_topics)
    #No tier restriction in this section, hence the missing tier value
    dist_records.append([row['anomalyID'], np.nan, row['Item_Acronym'], euclidean_dist, cosine_dist, kullback_dist])

#Create distance df in one go (keeps its columns even when no report matched)
df_dist = pd.DataFrame(dist_records, columns=cols)

In [None]:
#Subset distance df: top n closest reports under each distance measure
df_dist_sub = pd.concat([df_dist.sort_values(col).head(top_n) for col in dist_cols])

#Create linkable url field
url_template = "https://prs.jpl.nasa.gov/NET/PFRReadOnly.aspx?smode=pop&iAnomalyID={0}"
df_dist_sub['url'] = df_dist_sub['anomaly_id'].apply(url_template.format)

#print() call form: the notebook imports print_function, so the statement form does not compile
for index,row in df_dist_sub.iterrows():
    print("{0}:\t{1}".format(row['anomaly_id'] ,row['url']))

df_dist_sub

### Top n closest documents - traverse tier structure

In [None]:
#Get tier structure string based on provided Anomaly ID
item_acronym = df_modeling.loc[df_modeling['anomalyID']==anomaly_id,"Item_Acronym"].values[0]
item_acronym_list = re.split('-',item_acronym)

cols = ['anomaly_id',
        'tier',
        'item_acronym',
        'euclidean_dist_to_{0}'.format(anomaly_id),
        'cosine_dist_to_{0}'.format(anomaly_id),
        'kullback_dist_to_{0}'.format(anomaly_id)]

#Topic distribution of the reference report (loop invariant, computed once)
ref_topics = lda.transform(tf[pfr_index]).ravel()

#Base condition to build off of. The filter is kept as a boolean mask rather
#than a string handed to eval(), so tier values taken from the data are never
#executed as code and cannot break the expression (e.g. by containing a quote).
tier_mask = df_modeling['projectName']=='Mars Science Lab'

#Traverse tier structure, increasing the specificity of conditions with each additional tier
tier_records = []
for i,tier in enumerate(item_acronym_list):
    tier_mask = tier_mask&(df_modeling['tier_{0}'.format(i)]==tier)
    df_temp = df_modeling[tier_mask]
    #Create item_acronym based on location in traverse
    temp_item_acronym = "-".join(item_acronym_list[:i+1])

    #Iterate through subsetted data
    for index,row in df_temp.iterrows():
        doc_topics = lda.transform(tf[index]).ravel()
        #Calculate distances
        euclidean_dist = scipy.spatial.distance.euclidean(ref_topics, doc_topics)
        cosine_dist = scipy.spatial.distance.cosine(ref_topics, doc_topics)
        kullback_dist = scipy.stats.entropy(ref_topics, doc_topics)
        tier_records.append([row['anomalyID'], i, temp_item_acronym, euclidean_dist, cosine_dist, kullback_dist])

#Create distance df in one go
df_dist_tier = pd.DataFrame(tier_records, columns=cols)

In [None]:
print(anomaly_id)

#Subset distance df: for every tier prefix keep the top_n closest reports
#under each distance measure
dist_cols = ['euclidean_dist_to_{0}'.format(anomaly_id),'cosine_dist_to_{0}'.format(anomaly_id),'kullback_dist_to_{0}'.format(anomaly_id)]

top_frames = []
for item in df_dist_tier['item_acronym'].unique():
    df_temp = df_dist_tier[df_dist_tier['item_acronym']==item]
    for col in dist_cols:
        top_frames.append(df_temp.sort_values(col).head(top_n))

#pd.concat refuses an empty list, so fall back to an empty frame
df_dist_tier_sub = pd.concat(top_frames) if top_frames else pd.DataFrame()

#df_dist_tier_sub['url'] = df_dist_tier_sub.apply(lambda row:"https://prs.jpl.nasa.gov/NET/PFRReadOnly.aspx?smode=pop&iAnomalyID={0}".format(row['anomaly_id']), axis=1)
df_dist_tier_sub

### Environment and when failure happened -- THIS DOESN'T WORK, LOOK AT 57070 AND THE ENV FAILURE RETURN VALUES

In [None]:
#TEST
#Environment and failure phase recorded for the anomaly of interest
is_target = df_modeling['anomalyID']==anomaly_id
environment = df_modeling.loc[is_target,'specificEnvironment'].values[0]
failed_during = df_modeling.loc[is_target,'problemFailureNotedDuring'].values[0]

print("Anomaly ID: {0}".format(anomaly_id))
print("Environment: {0}".format(environment))
print("Failure Noted During: {0}".format(failed_during))

In [None]:
#Base condition plus the two optional restrictions, kept as boolean masks
#rather than strings handed to eval(): values read from the data are compared
#as data and cannot break or inject into the expression
is_msl = df_modeling['projectName']=='Mars Science Lab'
same_env = df_modeling['specificEnvironment']==environment
same_during = df_modeling['problemFailureNotedDuring']==failed_during

#(label, filter) for the three subsets that are compared
subsets = [('env', is_msl&same_env),
           ('during', is_msl&same_during),
           ('env&during', is_msl&same_env&same_during)]

cols = ['anomaly_id',
        'tier',
        'item_acronym',
        'euclidean_dist_to_{0}'.format(anomaly_id),
        'cosine_dist_to_{0}'.format(anomaly_id),
        'kullback_dist_to_{0}'.format(anomaly_id)]

#Topic distribution of the reference report (loop invariant, computed once)
ref_topics = lda.transform(tf[pfr_index]).ravel()

env_records = []
for i,(code,subset_mask) in enumerate(subsets):
    df_temp = df_modeling[subset_mask]

    #Iterate through subsetted data
    for index,row in df_temp.iterrows():
        doc_topics = lda.transform(tf[index]).ravel()
        #Calculate distances
        euclidean_dist = scipy.spatial.distance.euclidean(ref_topics, doc_topics)
        cosine_dist = scipy.spatial.distance.cosine(ref_topics, doc_topics)
        kullback_dist = scipy.stats.entropy(ref_topics, doc_topics)
        #The subset label is stored in the 'item_acronym' column
        env_records.append([row['anomalyID'], i, code, euclidean_dist, cosine_dist, kullback_dist])

#Create distance df in one go
df_dist_env = pd.DataFrame(env_records, columns=cols)

In [None]:
print(anomaly_id)

top_n = 1

#Subset distance df: for every subset label keep the top_n closest reports
#under each distance measure
dist_cols = ['euclidean_dist_to_{0}'.format(anomaly_id),'cosine_dist_to_{0}'.format(anomaly_id),'kullback_dist_to_{0}'.format(anomaly_id)]

top_frames = []
for item in df_dist_env['item_acronym'].unique():
    df_temp = df_dist_env[df_dist_env['item_acronym']==item]
    for col in dist_cols:
        top_frames.append(df_temp.sort_values(col).head(top_n))

#pd.concat refuses an empty list, so fall back to an empty frame with the expected columns
df_dist_env_sub = pd.concat(top_frames) if top_frames else df_dist_env.head(0).copy()

url_template = "https://prs.jpl.nasa.gov/NET/PFRReadOnly.aspx?smode=pop&iAnomalyID={0}"
df_dist_env_sub['url'] = df_dist_env_sub['anomaly_id'].apply(url_template.format)

for index,row in df_dist_env_sub.iterrows():
    print("{0}:\t{1}\t{2}".format(row['anomaly_id'] , row['item_acronym'], row['url']))

df_dist_env_sub

In [None]:
#MARS 2020 reports that carry a failure effect rating, highest rating first
#(.loc replaces the removed .ix indexer)
has_rating = ~df_modeling['failureEffectRatingValue'].isnull()
df_modeling.loc[(df_modeling['projectName']=='MARS 2020')&has_rating,
                ['anomalyID','failureEffectRatingValue','projectName','title']].sort_values('failureEffectRatingValue',ascending=False)

In [None]:
#anomaly_id is an integer, so it is formatted into the name instead of being added to a str
['{0}{1}'.format(col, anomaly_id) for col in ['euclidean_dist_to_','cosine_dist_to_','kullback_dist_to_']]

In [None]:
#Anomaly ID to compare against
anomaly_id = 57069
#anomaly_id = 58481

#Get the index for the report with that anomaly ID
pfr_index = df_modeling.loc[df_modeling['anomalyID']==anomaly_id].index[0]

cols = ['anomaly_id',
        'euclidean_dist_to_{0}'.format(anomaly_id),
        'cosine_dist_to_{0}'.format(anomaly_id),
        'kullback_dist_to_{0}'.format(anomaly_id)]

#Topic distribution of the reference report. It is loop invariant, so it is
#computed once. ravel() gives the 1-D vectors the scipy distance functions
#are defined for.
ref_topics = lda.transform(tf[pfr_index]).ravel()

#One record per report; the frame is built once after the loop
dist_records = []
for index,row in df_modeling.iterrows():
    doc_topics = lda.transform(tf[index]).ravel()
    euclidean_dist = scipy.spatial.distance.euclidean(ref_topics, doc_topics)
    cosine_dist = scipy.spatial.distance.cosine(ref_topics, doc_topics)
    kullback_dist = scipy.stats.entropy(ref_topics, doc_topics)
    dist_records.append([row['anomalyID'], euclidean_dist, cosine_dist, kullback_dist])

df_dist = pd.DataFrame(dist_records, columns=cols)

In [None]:
df_dist.sort_values('euclidean_dist_to_{0}'.format(anomaly_id)).reset_index(drop=True).head(10)

In [None]:
df_dist.sort_values('cosine_dist_to_{0}'.format(anomaly_id)).reset_index(drop=True).head(10)

In [None]:
df_dist.sort_values('kullback_dist_to_{0}'.format(anomaly_id)).reset_index(drop=True).head(10)

# TEST

In [None]:
#TEST
#print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
#print_top_words(lda, tf_feature_names, n_top_words)
#print(lda.perplexity(tf))

#Vocabulary terms present in the first document of the term matrix
d1 = tf[0]
print([tf_feature_names[i] for i in d1.indices])

#tf[0] is the first row by position, so the matching report is fetched by
#position as well (this holds whatever the frame's index labels are)
df_modeling.iloc[0]

In [None]:
#Load tier structure data for MSL and M2020
#(print() call form: the notebook imports print_function)
df_msl = pd.read_csv('../Data/PRS_MSL_Tier_Structure_160609.csv')
print(df_msl.shape)

df_m2020 = pd.read_csv('../Data/PRS_M2020_Tier_Structure_160609.csv')
print(df_m2020.shape)

In [None]:
#Create simplified 'item_number' feature by removing the project prefix.
#'[ _-]' (hyphen last) matches exactly a space, an underscore or a hyphen.
#'[ -_]' is a character range from ' ' to '_' and would also match digits,
#upper-case letters and most punctuation.
df_msl['item_number'] = df_msl['Item_Number'].apply(lambda number: re.sub('MSL[ _-]', '', number))
df_m2020['item_number'] = df_m2020['Item_Number'].apply(lambda number: re.sub('M2020[ _-]', '', number))

In [None]:
#Function that provides high-level descriptive counts 
def project_tier_structure_description(df, project_name, report_type, prs=None):
    """Print how well a project's PRS items are covered by a tier structure.

    Parameters
    ----------
    df : pandas.DataFrame
        Tier structure; must contain an ``Item_name`` column.
    project_name : str
        Value matched against the PRS ``projectName`` column.
    report_type : str
        Value matched against the PRS ``reportType`` column.
    prs : pandas.DataFrame, optional
        Problem reports with ``projectName``, ``reportType`` and
        ``mainItemAffected`` columns. Defaults to the notebook-level ``df_prs``.

    Returns
    -------
    list of str
        Sorted items named by the project's reports (of any report type) that
        are absent from the tier structure; empty when every item is covered.
    """
    if prs is None:
        prs = df_prs

    in_project = prs['projectName']==project_name
    is_report_type = prs['reportType']==report_type
    project_items = prs.loc[in_project,'mainItemAffected']

    print("---- {0} ----".format(project_name))
    print("Unique items in tier structure: {0}".format(df['Item_name'].nunique()))
    #NOTE(review): this count covers every report type of the project, although
    #the label mentions report_type only - confirm which one is intended
    print("Unique items in {0}: {1}".format(report_type, project_items.nunique()))

    #Unique, non-null items from the project's PRS reports, as strings
    items_prs_set = set(str(item) for item in project_items.dropna().unique())

    #Unique, non-null items from the tier structure
    items_ts_set = set(df['Item_name'].dropna().unique())

    #Items not in tier structure. Always defined, so the count and the return
    #value also work when the tier structure covers everything.
    x = sorted(items_prs_set.difference(items_ts_set))

    #Check for superset/subset
    print("\nAll items in PRS project contained in tier structure: {0}".format(not x))
    print("Number of unique items not in tier structure: {0}".format(len(x)))

    #Total number of reports for a project
    total_reports = int((in_project&is_report_type).sum())

    #Total number of reports for a project that reference a part in the tier structure
    has_ts_item = prs['mainItemAffected'].isin(list(items_ts_set))
    total_reports_with_ts_item = int((in_project&is_report_type&has_ts_item).sum())

    print("\nTotal {0} {1}s: {2}".format(project_name, report_type, total_reports))
    print("Total {0} {1}s with item in Tier Structure: {2}".format(project_name, report_type, total_reports_with_ts_item))
    print("Total {0} {1}s with item NOT in Tier Structure: {2}".format(project_name, report_type, total_reports - total_reports_with_ts_item))
    print()

    return x
    
x = project_tier_structure_description(df_msl, "Mars Science Lab", "PFR")
#project_tier_structure_description(df_msl, "Mars Science Lab", "DPFR")
#project_tier_structure_description(df_msl, "Mars Science Lab", "ISA")

#MARS 2020 reports are compared against the M2020 tier structure (df_m2020)
#rather than the MSL one
x2 = project_tier_structure_description(df_m2020, "MARS 2020", "PFR")
#project_tier_structure_description(df_m2020, "MARS 2020", "DPFR")
#project_tier_structure_description(df_m2020, "MARS 2020", "ISA")

In [None]:
x

In [None]:
#Anomaly IDs of MSL PFRs whose main item is the 3/4 Pyro Valve
#(.loc replaces the removed .ix indexer)
df_prs.loc[(df_prs['projectName']=="Mars Science Lab")&
           (df_prs['reportType']=="PFR")&
           (df_prs['mainItemAffected']=="3/4 Pyro Valve"), 'anomalyID']

### Item_Acronym - Tier Structure

In [None]:
#Sizes of the two tier structures
print(df_msl.shape)
print(df_m2020.shape)
print()

#Number of distinct acronyms in each
print(df_msl['Item_Acronym'].nunique())
print(df_m2020['Item_Acronym'].nunique())
print()

#Rows of each structure whose acronym also occurs in the other one
msl_acronyms = df_msl['Item_Acronym'].unique()
m2020_acronyms = df_m2020['Item_Acronym'].unique()
print(df_msl[df_msl['Item_Acronym'].isin(m2020_acronyms)].shape)
print(df_m2020[df_m2020['Item_Acronym'].isin(msl_acronyms)].shape)

#M2020 rows whose acronym does not occur in the MSL structure
df_m2020[~df_m2020['Item_Acronym'].isin(msl_acronyms)].head()

In [None]:
df_m2020[df_m2020['Item_Acronym'].str.contains("- .*")]

In [None]:
df_m2020[df_m2020['Item_Acronym'].str.contains("BRLA")]

In [None]:
#Create columns for tiers
def create_tier_features(df,tier_max):
    """Split ``Item_Acronym`` on '-' into per-tier columns, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Item_Acronym``.
    tier_max : int
        Number of ``tier_<i>`` columns (``tier_0`` .. ``tier_<tier_max - 1>``)
        created up front, each initialised to ``''``.

    Returns
    -------
    None
    """
    #Create empty tier columns
    for i in range(tier_max):
        df['tier_{0}'.format(i)] = ''

    #Fill in tier columns with values from "Item_Acronym".
    #.loc is label based, which matches the labels yielded by iterrows();
    #the .ix indexer used previously is deprecated and missing from current pandas.
    for index,row in df.iterrows():
        for i,tier in enumerate(re.split('-',row['Item_Acronym'])):
            df.loc[index,'tier_{0}'.format(i)] = tier
    
create_tier_features(df_msl, 8)
create_tier_features(df_m2020, 8)

In [None]:
#Only PFR reports are joined to the tier structure
is_pfr = df_prs['reportType']=='PFR'

#Subset PRS for MSL data (.loc replaces the removed .ix indexer)
df_prs_msl = df_prs.loc[(df_prs['projectName']=='Mars Science Lab')&is_pfr].copy()

#Values to drop from df_msl; ensure that the join is 1-to-1
#Option 1 - drop rows with duplicate "Lifecycle_ID" via "item_number"
#Option 2 - guess, check with Leslie later
drop_item_numbers = ['MSL 2000_FS','MSL 2009CABL']
drop_item_acronym = ['FS-AVS-MCA-RMCA-BTE']
drop_item_life_cycle_id = [16592]

#Subset tier structure to remove values that produce duplicates when join is performed
df_msl = df_msl[(~df_msl['Item_Number'].isin(drop_item_numbers))&
                (~df_msl['Item_Acronym'].isin(drop_item_acronym))&
                (~df_msl['Lifecycle_ID'].isin(drop_item_life_cycle_id))]

#Perform join
df_prs_msl = pd.merge(left=df_prs_msl,right=df_msl,how='left',left_on='mainItemAffected',right_on='Item_name')

print(df_prs_msl.shape)
#NOTE(review): df_temp is not assigned in this cell; whatever frame an earlier
#cell left under that name is printed here - confirm this is intended
print(df_temp.shape)

#Subset PRS for M2020 data
df_prs_m2020 = df_prs.loc[(df_prs['projectName']=='MARS 2020')&is_pfr].copy()

#Perform join
df_prs_m2020 = pd.merge(left=df_prs_m2020,right=df_m2020,how='left',left_on='mainItemAffected',right_on='Item_name')

print(df_prs_m2020.shape)
print(df_prs_m2020.shape)

#Concat the dfs and give the result a fresh 0..n-1 index
df_prs_msl_m2020 = pd.concat([df_prs_msl,df_prs_m2020]).reset_index(drop=True)

print(df_prs_msl_m2020.shape)

In [None]:
#Traverse tier structure to give more relevant results

#Set anomaly ID of interest
anomaly_id = df_prs_msl_m2020.loc[4671,"anomalyID"]

#Traverse tier structure, increasing the specificity of conditions with each additional tier.
#The filter is a boolean mask narrowed tier by tier instead of a string handed
#to eval(), so tier values read from the data are never executed as code.
item_acronym = df_prs_msl_m2020.loc[df_prs_msl_m2020['anomalyID']==anomaly_id,"Item_Acronym"].values[0]
tier_mask = df_prs_msl_m2020['projectName']=='Mars Science Lab'
for i,tier in enumerate(re.split('-',item_acronym)):
    print(i,tier)
    tier_mask = tier_mask&(df_prs_msl_m2020['tier_{0}'.format(i)]==tier)
    #Shape of the subset that matches every tier seen so far
    print(df_prs_msl_m2020[tier_mask].shape)

In [None]:
#TEST
#Only the value of the last expression is displayed by the notebook
df_prs_msl_m2020[(df_prs_msl_m2020['projectName']=='Mars Science Lab')&
                 (df_prs_msl_m2020['tier_1']=='ICheMin')]
df_prs_msl_m2020[df_prs_msl_m2020['tier_1']=='ICheMin']
df_prs_msl_m2020['tier_1'].unique()
#.loc replaces the removed .ix indexer
df_prs_msl_m2020.loc[df_prs_msl_m2020['anomalyID']==13274,'mainItemAffected']


In [None]:
#Option 1 - drop rows with duplicate "Lifecycle_ID" via "item_number"
#Option 2 - guess, check with Leslie later

print(df_msl.shape)
print(df_msl['Item_name'].nunique())

#Tier-structure rows whose Item_name occurs more than once
gb_msl = df_msl.groupby('Item_name').count()
duplicated_names = gb_msl[gb_msl['Lifecycle_Name']>1].index
x  = df_msl[df_msl['Item_name'].isin(duplicated_names)]

drop_item_numbers = ['MSL 2000_FS','MSL 2009CABL']
drop_item_acronym = ['FS-AVS-MCA-RMCA-BTE']
drop_item_life_cycle_id = [16592]

#Duplicates that remain after applying the drop lists (value is not displayed:
#only the last expression of a cell is)
x[(~x['Item_Number'].isin(drop_item_numbers))&(~x['Item_Acronym'].isin(drop_item_acronym))&(~x['Lifecycle_ID'].isin(drop_item_life_cycle_id))]

sorted(df_msl['Item_name'].unique())

In [None]:
df_temp[(df_temp['tier_0']=='FS')&(df_temp['tier_1']=='AVS')&(df_temp['tier_2']=='MCA')&(df_temp['tier_3'].isin(['DMCA','RMCA']))].groupby('tier_3').count()

df_temp[df_temp['Item_Acronym'].isin(['FS-AVS-MCA-DMCA-BTE','FS-AVS-MCA-RMCA-BTE'])].groupby('tier_3').count()

In [None]:
df_prs_msl[df_prs_msl['mainItemAffected'].isin(gb_msl[gb_msl['Lifecycle_Name']>1].index)].groupby('mainItemAffected').count()

In [None]:
df_prs_msl[df_prs_msl['mainItemAffected']=='MCA Bench Test Equipment']

### Item_name - part?

In [None]:
df_m2020[df_m2020['Item_name'].str.contains("Actuators")]

In [None]:
df_msl[df_msl['Item_Acronym'].str.contains("FS-")]
df_msl[df_msl['Item_name'].str.contains("Actuators")]

In [None]:
#How many item numbers carry the project prefix followed by a separator.
#'[ _-]' (hyphen last) matches exactly a space, an underscore or a hyphen;
#'[ -_]' is a character range from ' ' to '_' and would match far more.
print(df_msl.shape)
print(df_msl[df_msl['Item_Number'].str.contains("MSL[ _-]")].shape)
#print(df_msl.loc[~df_msl['Item_Number'].str.contains("MSL "),'Item_Number'])

print(df_m2020.shape)
has_m2020_prefix = df_m2020['Item_Number'].str.contains("M2020[ _-]")
print(df_m2020[has_m2020_prefix].shape)
print(df_m2020.loc[~has_m2020_prefix,'Item_Number'])



In [None]:
set(df_m2020['item_number'].unique()).difference(set(df_msl['item_number'].unique()))

In [None]:
df_prs[df_prs['projectName']=='Mars 2020']  #.groupby('reportType').count()['_id']
sorted(df_prs['projectName'].unique())

In [None]:
anomaly_id = 49580
df_modeling.ix[df_modeling['anomalyID']==anomaly_id,'words'].values[0]

In [None]:
#Subset data to provide labels for
#NOTE(review): df_part_regex is not assigned anywhere in the visible part of
#this notebook - confirm where it comes from
df_prs_topics = df_prs[df_prs['projectName'].isin(df_part_regex['project_name'].unique())].copy()
print(df_prs_topics.shape)
print(df_prs_topics[df_prs_topics['_id'].isin(df_modeling['_id'])].shape)

#Vectorize the subset with the already fitted vectorizer
tf_new = tf_vectorizer.transform(df_prs_topics['words'])

#Get topics for subset and scale every row so that it sums to 1
topics_lda_new = lda.transform(tf_new)
topics_lda_new = topics_lda_new / topics_lda_new.sum(axis=1, keepdims=True)

#Create a new column for each topic. The count is read from the matrix so it
#always agrees with the fitted model.
for i in range(topics_lda_new.shape[1]):
    df_prs_topics["topic_{0}".format(i)] = topics_lda_new[:,i]

df_prs_topics.shape

In [None]:
sorted(df_prs['causeCodes'].unique())