In [1]:
import pyodbc
import pandas as pd
from datetime import datetime
from scipy.stats import weibull_min
from scipy.stats import power_divergence

from settings import DEBUG,DATADIR,DATAFILE,DSTYPE,MINGAPSIZE_DAYS,MINGAPSIZE_HOURS
from helper import sql_data, save_df, shorten_rfg

ALL_DATA = pd.DataFrame()
ALL_DATA.name = "ALL_DATA"
if DSTYPE == 'File' :
    ALL_DATA = pd.read_csv(DATADIR + DATAFILE)  #Read all the data
elif DSTYPE == 'Database':
    ALL_DATA = sql_data()

try:
   ALL_DATA = ALL_DATA.drop(columns = ['RELEVANT_BEG_AGE.1','Unnamed: 0'])  # store as pkl and remove extra RELEVANT_BEG_AGE from sql
except:
    print("Superfluous columns removed already")
ALL_DATA.EVENT_DATE_TIME = pd.to_datetime(ALL_DATA.EVENT_DATE_TIME)


ALL_DATA.sort_values(by=['EI_SN','EVENT_DATE_TIME','RELEVANT_BEG_AGE'],inplace=True,ignore_index=True)   ## best bet on Sort by as all the bad data will end up at the 'end' of the day
ALL_DATA['EI_SN'] = ALL_DATA['EI_SN'].astype(str) # This just naturally wants to be an integer.  Thought about Pre-Pending a TN on it.
save_df(ALL_DATA)



# Use the Existing ALL_DATA frame to filter the result set instead of going back to the DB.
FILTERED_DATA = ALL_DATA[
                            (ALL_DATA['SCD2'] != 'X') & 
                            (ALL_DATA['SCD3'] != 'N') & 
                            (ALL_DATA['SCD5'] != 'N')
                        ]
FILTERED_DATA.name = "FILTERED_DATA"  # used to be the AH64E_2013toPres_SCORED.csv  
save_df(FILTERED_DATA)

#  No need for these to be pulled from the SQL.   
AH64E_TailNumList = FILTERED_DATA['EI_SN'].unique()        
AH64E_RFGList     = FILTERED_DATA['RFG'].unique()    #** maybe
AH64E_SCD1List    = FILTERED_DATA['SCD1'].unique()



################################
# Get Gaps per Tail Number
################################
GapFrame = ALL_DATA.copy() #work with a copy, not the orig
GapFrame = GapFrame.drop(columns = ['MAL_EFF', 'CORR_DATE_TIME', 'EI_CORR_AGE', 
                                    'TMMH', 'TMEN', 'TIMH', 'in_phase', 'in_qc', 
                                    'RFG', 'SCD1', 'SCD2', 'SCD3', 'SCD4', 'SCD5', 
                                    'SCD6', 'SCD7', 'SCD8', 'SCD9', 'PRIMARY_EVENT'])
# These next lines can be safely removed
# The original had extra Relevant Beg Age in the query, and storing as CSV gives unnamed instead of index
try:
   GapFrame = GapFrame.drop(columns = ['RELEVANT_BEG_AGE.1','Unnamed: 0'])  # store as pkl and remove extra RELEVANT_BEG_AGE from sql will remove this need
except:
    print("Superfluous columns removed already")

# Convert to Datetime to simplify the calculation.   
GapFrame.EVENT_DATE_TIME = pd.to_datetime(GapFrame.EVENT_DATE_TIME) #Seems to work without error on this dataset.  Will see if it continues

# Let's find the gaps on a per-aircraft basis.  groupby lets us perform the operations by the aircraft
# Add a date difference column and give the difference between this date and the last

GapFrame['datediff']  = GapFrame.groupby('EI_SN')['EVENT_DATE_TIME'].diff() 
# Same for the Flight Hours
GapFrame['hoursdiff'] = GapFrame.groupby('EI_SN')['RELEVANT_BEG_AGE'].diff()

# Add previous hours and date to this row so that we can visually compare.  Not strictly necessary
# but we can use this for debugging later if we want.
GapFrame['previous_date'] = GapFrame.groupby('EI_SN')['EVENT_DATE_TIME'].shift(1)
GapFrame['previous_hours'] = GapFrame.groupby('EI_SN')['RELEVANT_BEG_AGE'].shift(1)

# Filter the GapFrame by min Gap Size *and* min Gap hours (Settings.py).   Save the result to a pkl (saves more of the structure of the Dataframe with column types) 
# Add/change the save types by adjusting the helper library for save_df
TN_GAP_FRAME = GapFrame[
               ((GapFrame.datediff >= pd.Timedelta(str(MINGAPSIZE_DAYS) +  ' days')) & 
                (abs(GapFrame.hoursdiff) >= MINGAPSIZE_HOURS)) ]

GAPS = TN_GAP_FRAME[['EI_SN','previous_date','EVENT_DATE_TIME']]
GAPS = GAPS.rename(columns={"previous_date":'StartGap','EVENT_DATE_TIME':'EndGap'})
GAPS.reset_index(inplace=True) # I throw this in just about everywhere
GAPS.name = 'GAPS'
save_df(GAPS)

#################################################
# Get Usable Intervals from Gaps per Tail Number
#################################################

Intervals = pd.DataFrame()
groups = TN_GAP_FRAME.groupby('EI_SN').groups.keys() #Tail Numbers are the groups
for group in groups :
    # Filter the original dataset to just the tailnumbers we care about and then add a row.
    # We're inverting the gaps to get usable interval for each AC.
    # Gaps = 
    # [previous_date1,EVENT_DATE_TIME1], [previous_date2,EVENT_DATE_TIME2],[previous_date3,EVENT_DATE_TIME3].....
    # So Non-Gaps = 
    # [Min(EVENT_DATE_TIME),previous_date1],[EVENT_DATE_TIME1,previous_date2],[EVENT_DATE_TIME2,previous_date3],[EVENT_DATE_TIME3,max(EVENT_DATE_TIME)].....
    #
    tmpdf = TN_GAP_FRAME[TN_GAP_FRAME['EI_SN']==group]        # Filter by TN
    extrarowdf = tmpdf.iloc[-1:]                              # Copy last row
    tmpdf = pd.concat([tmpdf,extrarowdf], ignore_index=True)  # Add the copy to the end
    tmpdf['StartDtTm'] = tmpdf['EVENT_DATE_TIME'].shift(1)    # Shift the EVENT_DATE_TIME back one.
    tmpdf['EndDtTm']   = tmpdf['previous_date']               # 
    tmpdf['TailNumber'] = tmpdf['EI_SN'].astype('string')
    
    tmpdf.loc[0,'StartDtTm'] = GapFrame[GapFrame['EI_SN'] == group]['EVENT_DATE_TIME'].min() #             Min(EVENT_DATE_TIME)
    tmpdf.loc[tmpdf.index[-1],'EndDtTm'] = GapFrame[GapFrame['EI_SN'] == group]['EVENT_DATE_TIME'].max() # Max(EVENT_DATE_TIME)

    Intervals = pd.concat([Intervals,tmpdf[['TailNumber','StartDtTm','EndDtTm']]],ignore_index=True)  #add each TN Non-Gaps to dataframe.


# Every TN without a gap also needs to be added.  
# For every TN that didn't have a gap, get the min/max time.
leftovers = list(set(AH64E_TailNumList) - set(groups))
for continuous_tn in leftovers:
    tmpdf = ALL_DATA[ALL_DATA['EI_SN']==continuous_tn] 
    tmpdf2 = {'TailNumber': continuous_tn, 'StartDtTm': tmpdf['EVENT_DATE_TIME'].min() , 'EndDtTm': tmpdf['EVENT_DATE_TIME'].max()}  
    Intervals = Intervals.append(tmpdf2, ignore_index=True)

Intervals.reset_index(inplace=True)
Intervals.drop('index',axis=1,inplace=True)
Intervals.name = 'Intervals'
save_df(Intervals)


Superfluous columns removed already


In [3]:
#################################################
# Group Usable Intervals and TNs (Treat each Interval/TN as a separate Helicopter)
# Create an 'alternate tn' so we have no gaps to work around
#################################################


TN_Intervals = ALL_DATA.copy()
#For each interval where we have data
for interval in Intervals.itertuples():
    # Find all records which match the tail and interval in the intervals
    # The syntax is fairly straightforward, but the line gets so long its better to write it like SQL
    TN_Intervals.loc[
        (TN_Intervals['EI_SN'].astype(str) == str(interval.TailNumber)) &           
        (interval.StartDtTm <= TN_Intervals['EVENT_DATE_TIME'] ) & 
        (TN_Intervals['EVENT_DATE_TIME'] <= interval.EndDtTm)         
                               , 'NewTailNumber'] = str(interval.TailNumber) + "_" + str(interval.Index)  #Create another TN but just for separation sake

# Now we have a different 'Tail Number' (NewTailNumber) for each TN/continuous time.  So we no longer have to worry about Gaps in the data.
TN_Intervals = TN_Intervals.sort_values(['NewTailNumber','EVENT_DATE_TIME','RELEVANT_BEG_AGE'])
TN_Intervals.reset_index(drop=True)

save_df(TN_Intervals)

In [12]:
###############################################
# Attemp to correct the data a bit.  


#Find when time was accidentally increased and then decreased to compensate.  
TN_Intervals['timedelta'] = TN_Intervals.groupby('NewTailNumber')['RELEVANT_BEG_AGE'].diff()  # Find all the timedeltas between events
TN_Intervals['dd'] = TN_Intervals.groupby('NewTailNumber')['timedelta'].rolling(2).sum().reset_index(0,drop=True)  # This will find every instance where a time bounced forward/back within 2 events. eith +n/-n or -n/+n
TN_Intervals = TN_Intervals.reset_index(drop=True) # Apply liberally to the forehead
TN_Intervals[(TN_Intervals['timedelta'] < 0) ]  # Should be 1393 are negative on first pass
# TN_Intervals.iloc[945 : 950]
#TN_Intervals.iloc[567661: 567667]   


Unnamed: 0,KEY13,EI_SN,EVENT_DATE_TIME,RELEVANT_BEG_AGE,MAL_EFF,CORR_DATE_TIME,EI_CORR_AGE,TMMH,TMEN,TIMH,...,SCD4,SCD5,SCD6,SCD7,SCD8,SCD9,PRIMARY_EVENT,NewTailNumber,timedelta,dd
948,AH-64E-1009002-20170922-A-00001,1009002,2017-09-22,5730.700195,1,2017-09-29 17:00:00,5737.899902,6.0,2.0,0.0,...,D,S,N,N,N,N,,1009002_152,-4.199707,-4.199707
3454,AH-64E-1009002-20190516-A-00001,1009002,2019-05-16,5938.100098,1,2019-05-20 10:54:00,5953.899902,0.1,1.0,0.0,...,E,S,N,N,N,N,,1009002_152,-2.299805,-2.299805
8256,AH-64E-1009003-20140323-A-00001,1009003,2014-03-23,3569.800049,1,2014-03-24 03:23:00,3569.800049,0.3,1.0,0.2,...,C,S,N,N,N,N,,1009003_1,-0.199951,-0.199951
8262,AH-64E-1009003-20140324-A-00003,1009003,2014-03-24,3569.800049,2,2014-03-24 02:38:00,3569.800049,0.2,1.0,0.3,...,D,S,N,N,N,N,AH-64E-1009003-20140320-A-00007,1009003_1,-0.199951,-0.199951
8282,AH-64E-1009003-20140327-A-00001,1009003,2014-03-27,3570.000000,1,2014-03-28 07:56:00,3570.000000,0.4,1.0,0.1,...,C,S,N,N,N,N,,1009003_1,-5.800049,-1.600098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558160,AH-64E-1803222-20191026-A-00110,1803222,2019-10-26,38.900002,1,2019-10-26 05:18:00,43.299999,1.0,1.0,0.0,...,C,S,N,N,N,N,,1803222_231,-1.899998,-0.500000
564035,AH-64E-1803241-20200508-A-00100,1803241,2020-05-08,188.100006,1,2020-05-08 09:19:00,0.000000,1.0,1.0,0.0,...,D,N,N,N,N,N,AH-64E-1803241-20200504-A-00380,1803241_145,-4.000000,-4.000000
567729,AH-64E-1903251-20200213-A-00100,1903251,2020-02-13,86.099998,1,2020-02-13 07:25:00,86.099998,0.5,1.0,0.0,...,C,S,N,N,N,N,,1903251_163,-3.000000,-3.000000
567867,AH-64E-1903256-20200106-A-00101,1903256,2020-01-06,16.400000,1,2020-01-06 17:22:00,0.000000,0.4,1.0,0.1,...,D,S,N,N,N,N,,1903256_210,-8.700001,-8.700001


In [10]:

# Find all events where the timedelta > 0 (increased hours between events) but the sum of the last 2 = 0  (negative, then corrective positive)
pos_indexes = TN_Intervals[(TN_Intervals['dd'] == 0) & (TN_Intervals['timedelta'] > 0)].index  
#print(pos_indexes)
for idx in pos_indexes :
     TN_Intervals.loc[idx,'RELEVANT_BEG_AGE'] = TN_Intervals.loc[(idx-1),'RELEVANT_BEG_AGE']  #set n = n -1 

# Find all events where the timedelta < 0 (increased hours between events) but the sum of the last 2 = 0  (positive, then corrective negative)
neg_indexes = TN_Intervals[(TN_Intervals['dd'] == 0) & (TN_Intervals['timedelta'] < 0)].index  
#print(neg_indexes)
for idx in neg_indexes :
     TN_Intervals.loc[(idx-1),'RELEVANT_BEG_AGE'] = TN_Intervals.loc[(idx),'RELEVANT_BEG_AGE'] #set n -1 = n.

#After this, it looks like we've corrected all by about 796 entries.  As you will see, some of the intervals just 'moved', so we could rerun/recorrect, but we'll proceed for now.

In [14]:
# Let's examine a couple entries to make sure things are just weird...

TN_Intervals.iloc[568020 : 568030]    # 4 bad entries on 2/24   We can clean these up as well, but we'll let them go for now
#TN_Intervals.iloc[945: 955]   # looks like it was 2 errors 

#Iterating the above sum->diff->reindex->replace routine will eventually remove them all.


Unnamed: 0,KEY13,EI_SN,EVENT_DATE_TIME,RELEVANT_BEG_AGE,MAL_EFF,CORR_DATE_TIME,EI_CORR_AGE,TMMH,TMEN,TIMH,...,SCD4,SCD5,SCD6,SCD7,SCD8,SCD9,PRIMARY_EVENT,NewTailNumber,timedelta,dd
568020,AH-64E-1903256-20200213-W-00100,1903256,2020-02-13,48.099998,1,2020-02-13 16:24:00,48.099998,0.6,2.0,0.0,...,C,S,N,N,N,N,,1903256_210,0.0,0.0
568021,AH-64E-1903256-20200213-A-00100,1903256,2020-02-13,69.699997,1,2020-02-24 10:24:00,48.099998,0.5,1.0,0.0,...,C,S,N,N,N,N,,1903256_210,21.599998,21.599998
568022,AH-64E-1903256-20200213-A-00110,1903256,2020-02-13,69.699997,1,2020-02-24 10:22:00,48.099998,0.5,1.0,0.0,...,D,S,N,N,N,N,,1903256_210,0.0,21.599998
568023,AH-64E-1903256-20200213-A-00130,1903256,2020-02-13,69.699997,1,2020-02-18 11:57:00,48.099998,0.1,1.0,0.1,...,C,A,N,N,N,N,,1903256_210,0.0,0.0
568024,AH-64E-1903256-20200213-A-00160,1903256,2020-02-13,69.699997,1,2020-02-13 05:20:00,0.0,0.3,1.0,0.0,...,D,S,N,N,N,N,,1903256_210,0.0,0.0
568025,AH-64E-1903256-20200218-A-00100,1903256,2020-02-18,48.099998,1,2020-02-18 08:34:00,48.099998,0.3,1.0,0.2,...,D,S,N,N,N,N,,1903256_210,-21.599998,-21.599998
568026,AH-64E-1903256-20200218-W-00100,1903256,2020-02-18,48.099998,1,2020-02-19 14:07:00,48.099998,1.0,1.0,0.0,...,C,S,N,N,N,N,,1903256_210,0.0,-21.599998
568027,AH-64E-1903256-20200219-A-00100,1903256,2020-02-19,48.099998,1,2020-02-20 10:19:00,48.099998,1.0,2.0,0.0,...,C,S,N,N,N,N,AH-64E-1903256-20200218-W-00100,1903256_210,0.0,0.0
568028,AH-64E-1903256-20200220-A-00100,1903256,2020-02-20,48.099998,1,2020-02-27 09:53:00,49.099998,0.3,1.0,0.0,...,B,S,N,N,N,N,,1903256_210,0.0,0.0
568029,AH-64E-1903256-20200221-A-00100,1903256,2020-02-21,48.099998,1,2020-02-26 12:24:00,48.099998,0.3,1.0,0.0,...,B,S,N,N,N,N,,1903256_210,0.0,0.0


In [15]:
#################################################
# Add Event Class to the Dataframe
#  This process ends up copying events that fall into each eventclass (ema,ma,maf,schedmaint,uma,unschedmaint)
#################################################

TN_Intervals_EventClass = TN_Intervals.copy()
try:
   TN_Intervals_EventClass = TN_Intervals_EventClass.drop(columns = ['RELEVANT_BEG_AGE.1','Unnamed: 0'])  # store as pkl and remove extra RELEVANT_BEG_AGE from sql
except:
    print("Superfluous columns removed already")
# Set all EventClasses according to SCD Rules
# Filter and save dataframe for each EventClass
#Essential Maintenance Action
TN_Intervals_EventClass.loc[((TN_Intervals_EventClass.SCD2 != 'N') & (TN_Intervals_EventClass.SCD2 != 'P') & (TN_Intervals_EventClass.SCD2 != 'X') & (TN_Intervals_EventClass.SCD2 != 'Z') & (TN_Intervals_EventClass.SCD2 != '')) & (TN_Intervals_EventClass.SCD3 == 'C') & (TN_Intervals_EventClass.SCD8 != 'N') & (TN_Intervals_EventClass.RFG != '36B'),'EventClass'] = 'EMA'
ema = TN_Intervals_EventClass[TN_Intervals_EventClass['EventClass'] == 'EMA']

#Mission Abors
TN_Intervals_EventClass.loc[(TN_Intervals_EventClass.SCD3 == 'C') & (TN_Intervals_EventClass.SCD4 != 'O') & (TN_Intervals_EventClass.SCD4 != 'H') & ((TN_Intervals_EventClass.SCD2 == 'J') | (TN_Intervals_EventClass.SCD2 == 'K') | (TN_Intervals_EventClass.SCD2 == 'C') | (TN_Intervals_EventClass.SCD2 == 'S') | (TN_Intervals_EventClass.SCD2 == 'W') | (TN_Intervals_EventClass.SCD2 == 'Q') | (TN_Intervals_EventClass.SCD2 == 'U')) & ((TN_Intervals_EventClass.SCD5 == '1') | (TN_Intervals_EventClass.SCD5 == '2') | (TN_Intervals_EventClass.SCD5 == '4')) & (TN_Intervals_EventClass.RFG != '36B'),'EventClass'] = 'MA'
ma = TN_Intervals_EventClass[TN_Intervals_EventClass['EventClass'] == 'MA']

#Mission Affecting Failure
TN_Intervals_EventClass.loc[((TN_Intervals_EventClass.SCD2 != 'D') & (TN_Intervals_EventClass.SCD2 != 'N') & (TN_Intervals_EventClass.SCD2 != 'P') & (TN_Intervals_EventClass.SCD2 != 'X') & (TN_Intervals_EventClass.SCD2 != 'Z') & (TN_Intervals_EventClass.SCD2 != '')) & (TN_Intervals_EventClass.SCD3 == 'C') & (TN_Intervals_EventClass.SCD8 != 'N') & (TN_Intervals_EventClass.SCD9 != 'N') & (TN_Intervals_EventClass.RFG != '36B'),'EventClass'] = 'MAF'
maf = TN_Intervals_EventClass[TN_Intervals_EventClass['EventClass'] == 'MAF']

TN_Intervals_EventClass.loc[(TN_Intervals_EventClass.SCD2 != 'X') & (TN_Intervals_EventClass.SCD3 == 'C') & (TN_Intervals_EventClass.SCD5 == 'S') & (TN_Intervals_EventClass.RFG != '36B'),'EventClass'] = 'SchedMaint'
schedmaint = TN_Intervals_EventClass[TN_Intervals_EventClass['EventClass'] == 'SchedMaint']

TN_Intervals_EventClass.loc[((TN_Intervals_EventClass.SCD2 != 'X') & (TN_Intervals_EventClass.SCD2 != 'Z')) & (TN_Intervals_EventClass.SCD3 == 'C') & ((TN_Intervals_EventClass.SCD5 != 'M') & (TN_Intervals_EventClass.SCD5 != 'R') & (TN_Intervals_EventClass.SCD5 != 'S')) & (TN_Intervals_EventClass.RFG != '36B'),'EventClass'] = 'UMA'
uma = TN_Intervals_EventClass[TN_Intervals_EventClass['EventClass'] == 'UMA']

TN_Intervals_EventClass.loc[(TN_Intervals_EventClass.SCD5 != 'S') & (TN_Intervals_EventClass.SCD2 != 'X') & (TN_Intervals_EventClass.SCD3 == 'C'),'EventClass'] = 'UnschedMaint'
unschedmaint = TN_Intervals_EventClass[TN_Intervals_EventClass['EventClass'] == 'UnschedMaint']

# Concatinate all the EventClasses into a big frame and remove the old frame
# Lots of events are in several of these lists, so we end up with more data/rows
TN_Intervals_EventClass = pd.concat([ema,ma,maf,schedmaint,uma,unschedmaint],ignore_index=True)

# The following section isn't needed since these fields aren't used, but was used to validate that the above code worked the same.
TN_Intervals_EventClass['Key13 / RFG / EventClass'] = TN_Intervals_EventClass.apply(lambda y: str(y.KEY13[:15] + y.RFG + '-' + y.EventClass + y.KEY13[14:]),axis=1)
TN_Intervals_EventClass = TN_Intervals_EventClass.drop(['MAL_EFF', 'CORR_DATE_TIME', 'EI_CORR_AGE', 'in_phase', 'in_qc', 'SCD1','SCD2', 'SCD3', 'SCD4', 'SCD5', 'SCD6', 'SCD7', 'SCD8', 'SCD9', 'PRIMARY_EVENT'], axis=1)
TN_Intervals_EventClass = TN_Intervals_EventClass.sort_values(['Key13 / RFG / EventClass'])
TN_Intervals_EventClass = TN_Intervals_EventClass.reset_index(drop=True)
TN_Intervals_EventClass['lengthRFG'] = TN_Intervals_EventClass.apply(lambda y :len(y['RFG']), axis=1)
TN_Intervals_EventClass = TN_Intervals_EventClass.rename(columns={'EI_SN':'TailNumber'})

# This field is needed in the following section
TN_Intervals_EventClass['tn-class'] = TN_Intervals_EventClass['NewTailNumber'].astype(str) + "-" + TN_Intervals_EventClass['EventClass']


TN_Intervals_EventClass = TN_Intervals_EventClass[['Key13 / RFG / EventClass','KEY13','TailNumber','NewTailNumber','tn-class','EVENT_DATE_TIME','RELEVANT_BEG_AGE','TMMH','TMEN','TIMH','RFG','EventClass','lengthRFG']]
TN_Intervals_EventClass.name = "TN_Intervals_EventClass"
# Validation file save
# TN_Intervals_EventClass.to_csv(output_dir + 'AH64E_NewId_2013toPres_SCORED.csv')
save_df(TN_Intervals_EventClass)

Superfluous columns removed already


In [16]:
################################################
# Shorten RFGs
#    For each TN/EventClass/Interval do: 
#      Count RFGs
#      While Exists RFG with 1 value :
#         For each RFG with only 1 value, shorten
#  This is the long pole in the tent as it has to be shortened per-RFG/class/TN so we have an interval
#  
#  This is the longest run.  10m 11 seconds last run.

pd.options.mode.chained_assignment = None  # Supress Warnings.
Shortened_RFGs = TN_Intervals_EventClass.copy()

#  Here we take each 'Tail Number' and we look over the RFGs.
#  If we find some single RFGs, then we reduce and see if we find match 'above' us in the RFG Chain.   We continue to do this until we are either 2 digits *or* we have more than 1 (1 good interval!)
print("Total tailnumbers & classes : " + str(len(Shortened_RFGs["tn-class"].unique())))
i = 1
for tn in Shortened_RFGs["tn-class"].unique() :  
    tmpframe = Shortened_RFGs[Shortened_RFGs['tn-class']==tn]  #Create a temporary frame with just the tail number/interval I'm interested in
    # This while loop has 2 conditions.  
    #    a)  There existing at least 1 RFG in the tmpframe with only 1 entry 
    #    b)  The max length of RFG(s) with one entry is greater than 2 (if the max length was 2, then we can no longer trim)
    print ("Shortening  for TN:  " + str(tn) + " " + "loop count" + str(i))
    while ((tmpframe[tmpframe['RFG'].map(tmpframe['RFG'].value_counts()) == 1 ].shape[0] > 0) & (tmpframe[tmpframe['RFG'].map(tmpframe['RFG'].value_counts()) == 1 ]['RFG'].map(len).max() > 2 )):
        # Shorten the RFGs!
        tmpframe.loc[tmpframe['RFG'].map(tmpframe['RFG'].value_counts()) == 1, 'RFG'] = tmpframe.loc[tmpframe['RFG'].map(tmpframe['RFG'].value_counts()) == 1]['RFG'].apply(lambda y : shorten_rfg(y)) # over in helper methods
    Shortened_RFGs.update(tmpframe) #Update our Shortened RFGs
    Shortened_RFGs['tn-class'] = Shortened_RFGs['NewTailNumber'].astype(str) + "-" + Shortened_RFGs['EventClass']
    i = i + 1

#################################
#  So there is an edge case where shortened RFGs can converge to a non-existent RFG (assuming not all chains shorten the same way) however, verifying that the RFGs converge to 
#  *something* or the 2 digit starting Functional group, should make it rare if at all.  This can be changed by simply building the RFG structure and decrimenting down the tree which will be marginally slower (maybe)

save_df(Shortened_RFGs)

Total tailnumbers & classes : 1490
Shortening  for TN:  1009002_152-SchedMaint loop count1
Shortening  for TN:  1009002_152-UMA loop count2
Shortening  for TN:  1009002_152-UnschedMaint loop count3
Shortening  for TN:  1009002_152-EMA loop count4
Shortening  for TN:  1009002_152-MA loop count5
Shortening  for TN:  1009002_152-MAF loop count6
Shortening  for TN:  1009003_2-EMA loop count7
Shortening  for TN:  1009003_2-MA loop count8
Shortening  for TN:  1009003_2-MAF loop count9
Shortening  for TN:  1009003_0-SchedMaint loop count10
Shortening  for TN:  1009003_1-SchedMaint loop count11
Shortening  for TN:  1009003_2-SchedMaint loop count12
Shortening  for TN:  1009003_0-UMA loop count13
Shortening  for TN:  1009003_1-UMA loop count14
Shortening  for TN:  1009003_2-UMA loop count15
Shortening  for TN:  1009003_0-UnschedMaint loop count16
Shortening  for TN:  1009003_1-UnschedMaint loop count17
Shortening  for TN:  1009003_2-UnschedMaint loop count18
Shortening  for TN:  1009003_0-EMA l

In [17]:
# Looking at the Data generated. So we can verify it does what we think it does.
# Note now, the only classes with 1 event have been shortened all the way to 2 digits.  Everything else converged somewhere up the chain.
Shortened_RFGs[(Shortened_RFGs['tn-class']=="1009003_2-UnschedMaint")]['RFG'].value_counts()

05A01H    59
09W       26
02B22     17
02C15     17
04A       17
          ..
09C04A     2
36         1
15         1
12         1
83         1
Name: RFG, Length: 220, dtype: int64

In [18]:
####################################################
# Find the time between events *of the same class*
# Add those diff's to the frame


TimeDiffs = Shortened_RFGs.copy()

# Set an identifier for a TailNumber, RFG and EventClass combined
TimeDiffs['tn_rfg_class'] = TimeDiffs['NewTailNumber'].astype(str)+"_" + TimeDiffs['RFG'] + "_" + TimeDiffs['EventClass']
TimeDiffs.sort_values(by=['tn_rfg_class','EVENT_DATE_TIME','RELEVANT_BEG_AGE'],ignore_index=True,inplace=True) 

# Groupby the Identifier and then take the time differences.  
TimeDiffs['TimeSinceLast'] = TimeDiffs.groupby(['tn_rfg_class'])['RELEVANT_BEG_AGE'].diff()  # Now that they're grouped by TN/RFG/CLASS and sorted by Date/Hours... I can find the difference intervals between those events
save_df(TimeDiffs)

In [None]:
# Debugging/validation section. Uncomment/change one at a time to take a look/browse the data.
# 
# Random TailNumber
TimeDiffs[TimeDiffs['TailNumber'].astype(str) == '1009002']

# Random EventClass
# TimeDiffs[(TimeDiffs['TailNumber'].astype(str) == '1009002') & (TimeDiffs['EventClass'] == 'UnschedMaint')]

# Random RFG
# TimeDiffs[(TimeDiffs['TailNumber'].astype(str) == '1009002') & (TimeDiffs['EventClass'] == 'UnschedMaint') & (TimeDiffs['RFG'] == '02A15C')]

# Also note you could have just
# TimeDiffs[TimeDiffs['tn_rfg_class'] == '1009002_152_02A15C_UnschedMaint']

# The Math checks out...  (TimeSinceLast is accurate)
# Note the dups (the Zeros...... )

# Also note Negatives :/  
# TimeDiffs[TimeDiffs['TimeSinceLast']<0]

# Pick one  
# TimeDiffs[TimeDiffs['tn_rfg_class'] == '1903256_210_09A01_SchedMaint']  #Ugh.  But we'll deal and move on.



In [37]:
###########################
#   Let's filter out the start times and 'mistake' times (where time is negative)  (As we can tell above this is going to give 'some' bad data.  We can fix this)

Weibulldata = TimeDiffs.copy()

#Weibulldata = Weibulldata.reset_index(drop=True)  # just keep mashing that button.
Weibulldata = Weibulldata[(Weibulldata['TimeSinceLast'].notnull()) & (Weibulldata['TimeSinceLast'] > 0)]  # ouch.. just included the outliers
Weibulldata = Weibulldata.reset_index(drop=True)


# Here is where we start bringing together the RFG/Classes so now we can see the size of the datasets
Weibulldata['rfg-class'] = Weibulldata['RFG'].astype(str) + "_" + Weibulldata['EventClass'].astype(str)


#Weibulldata = Weibulldata[Weibulldata['rfg-class'].map(Weibulldata['rfg-class'].value_counts()) >= 5] 
sample = Weibulldata['rfg-class'].value_counts()
Weibulldata = Weibulldata.loc[Weibulldata['rfg-class'].isin(sample[sample.gt(4)].index)] # Greater than or equal to 5

#Weibulldata = Weibulldata[Weibulldata['TimeSinceLast'] > 0] # Thinking this isn't the best answer but it cleans the data some
Weibulldata = Weibulldata.reset_index(drop=True)
Weibulldata['rfg-class'].value_counts() # Tada

00_SchedMaint         22193
05A02_SchedMaint      10360
16S_SchedMaint         7995
18A02_SchedMaint       7819
04A_SchedMaint         6617
                      ...  
15C07_UMA                 5
15C07_UnschedMaint        5
05B01_EMA                 5
19D02_UMA                 5
17_EMA                    5
Name: rfg-class, Length: 1591, dtype: int64

In [48]:
# Examine just a bit
#
# Random RFG-Class
Weibulldata[Weibulldata['rfg-class'] == '04A_SchedMaint'] 

# Pick one of the events and map it back to the helicopter data
TimeDiffs[TimeDiffs['tn_rfg_class'] == '1009002_152_04A_SchedMaint']

# Take it all the way back to ALL_DATA
#ALL_DATA[(ALL_DATA['EI_SN']=='1009002') & (ALL_DATA['RFG'] == '04A')]




Unnamed: 0,Key13 / RFG / EventClass,KEY13,TailNumber,NewTailNumber,tn-class,EVENT_DATE_TIME,RELEVANT_BEG_AGE,TMMH,TMEN,TIMH,RFG,EventClass,lengthRFG,tn_rfg_class,TimeSinceLast
2563,AH-64E-1009002-04A-SchedMaint-20170104-A-00002,AH-64E-1009002-20170104-A-00002,1009002,1009002_152,1009002_152-SchedMaint,2017-01-04,5631.200195,0.1,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,
2564,AH-64E-1009002-04A-SchedMaint-20170117-A-00001,AH-64E-1009002-20170117-A-00001,1009002,1009002_152,1009002_152-SchedMaint,2017-01-17,5639.600098,0.1,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,8.399902
2565,AH-64E-1009002-04A-SchedMaint-20170127-A-00002,AH-64E-1009002-20170127-A-00002,1009002,1009002_152,1009002_152-SchedMaint,2017-01-27,5646.299805,1.0,1.0,0.1,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,6.699707
2566,AH-64E-1009002-04A-SchedMaint-20170209-A-00001,AH-64E-1009002-20170209-A-00001,1009002,1009002_152,1009002_152-SchedMaint,2017-02-09,5647.100098,1.0,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,0.800293
2567,AH-64E-1009002-04A-SchedMaint-20170318-A-00002,AH-64E-1009002-20170318-A-00002,1009002,1009002_152,1009002_152-SchedMaint,2017-03-18,5683.700195,0.5,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,36.600098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2673,AH-64E-1009002-04A-SchedMaint-20210205-A-00100,AH-64E-1009002-20210205-A-00100,1009002,1009002_152,1009002_152-SchedMaint,2021-02-05,6206.799805,3.0,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,3.099609
2674,AH-64E-1009002-04A-SchedMaint-20210221-A-00100,AH-64E-1009002-20210221-A-00100,1009002,1009002_152,1009002_152-SchedMaint,2021-02-21,6206.799805,0.3,1.0,0.3,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,0.000000
2675,AH-64E-1009002-04A-SchedMaint-20210306-A-00200,AH-64E-1009002-20210306-A-00200,1009002,1009002_152,1009002_152-SchedMaint,2021-03-06,6209.299805,0.3,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,2.500000
2676,AH-64E-1009002-04A-SchedMaint-20210318-A-00110,AH-64E-1009002-20210318-A-00110,1009002,1009002_152,1009002_152-SchedMaint,2021-03-18,6228.399902,0.2,1.0,0.0,04A,SchedMaint,3.0,1009002_152_04A_SchedMaint,19.100098


In [44]:
############################################################
# Add Some stats!
###########################################################

lstSHAPE = []       # Initialize the list
lstLOCATION = []    # Initialize the list
lstSCALE = []       # Initialize the list
lstMEAN = []        # Initialize the list
lstVAR = []         # Initialize the list
lstSTDEV = []       # Initialize the list
lstDelete = []      # Initialize the list
lstDiff = []        # Initialize the list
lstStat_Pearson = []        # Initialize the list
lstPVal_Pearson = []        # Initialize the listde
lstStat_LogLike = []        # Initialize the list
lstPVal_LogLike = []        # Initialize the list
lstStat_FreeTuk = []        # Initialize the list
lstPVal_FreeTuk = []        # Initialize the list
lstStat_ModLogLike = []        # Initialize the list
lstPVal_ModLogLike = []        # Initialize the list
lstStat_Neyman = []        # Initialize the list
lstPVal_Neyman = []        # Initialize the list
lstStat_CRead = []        # Initialize the list
lstPVal_CRead = []        # Initialize the list
datapoints = []  
lstrfg_class = []   
WeibullDataCount = pd.DataFrame()
round = 1
print ("There will be " + str(len(Weibulldata['rfg-class'].unique())) + ' rounds' )
for rfg_class in Weibulldata['rfg-class'].unique():
    print('Round: ' + str(round))
    round = round + 1
    lstDiff = Weibulldata[Weibulldata['rfg-class']==rfg_class]['TimeSinceLast'].values 
    # print(lstDiff)
    # print(lstDiff.size)
    datapoints.append(lstDiff.size)
    shape, loc, scale = weibull_min.fit(lstDiff)
    Mean, Variance = weibull_min.stats(shape, scale=scale, moments='mv')
    StDev = weibull_min.std(shape, scale=scale)
    Pearson = power_divergence(lstDiff, lambda_='pearson')
    LogLike = power_divergence(lstDiff, lambda_='log-likelihood')
    FreeTuk = power_divergence(lstDiff, lambda_='freeman-tukey')
    ModLogLike = power_divergence(lstDiff, lambda_='mod-log-likelihood')
    Neyman = power_divergence(lstDiff, lambda_='neyman')
    CRead = power_divergence(lstDiff, lambda_='cressie-read')
    lstrfg_class.append(rfg_class)
    lstSHAPE.append(shape)
    lstLOCATION.append(loc)
    lstSCALE.append(scale)
    lstMEAN.append(Mean)
    lstVAR.append(Variance)
    lstSTDEV.append(StDev)
    lstStat_Pearson.append(Pearson[0])
    lstPVal_Pearson.append(Pearson[1])
    lstStat_LogLike.append(LogLike[0])
    lstPVal_LogLike.append(LogLike[1])
    lstStat_FreeTuk.append(FreeTuk[0])
    lstPVal_FreeTuk.append(FreeTuk[1])
    lstStat_ModLogLike.append(ModLogLike[0])
    lstPVal_ModLogLike.append(ModLogLike[1])
    lstStat_Neyman.append(Neyman[0])
    lstPVal_Neyman.append(Neyman[1])
    lstStat_CRead.append(CRead[0])
    lstPVal_CRead.append(CRead[1])
WeibullDataCount['RFG-Class'] = lstrfg_class
WeibullDataCount['Datapoints'] = datapoints
WeibullDataCount['Shape'] = lstSHAPE
WeibullDataCount['Location'] = lstLOCATION
WeibullDataCount['Scale'] = lstSCALE
WeibullDataCount['Mean'] = lstMEAN
WeibullDataCount['Variance'] = lstVAR
WeibullDataCount['St. Dev.'] = lstSTDEV
#WeibullDataCount['Zero Count'] = listZeroCounts     I can add this back in but getting this out there for the time being.
WeibullDataCount['Pearson_CRstat'] = lstStat_Pearson
WeibullDataCount['Pearson_Pvalue'] = lstPVal_Pearson
WeibullDataCount['LogLikelihood_CRstat'] = lstStat_LogLike
WeibullDataCount['LogLikelihood_Pvalue'] = lstPVal_LogLike
WeibullDataCount['FreemanTukey_CRstat'] = lstStat_FreeTuk
WeibullDataCount['FreemanTukey_Pvalue'] = lstPVal_FreeTuk
WeibullDataCount['ModLogLikelihood_CRstat'] = lstStat_ModLogLike
WeibullDataCount['ModLogLikelihood_Pvalue'] = lstPVal_ModLogLike
WeibullDataCount['Neyman_CRstat'] = lstStat_Neyman
WeibullDataCount['Neyman_Pvalue'] = lstPVal_Neyman
WeibullDataCount['CressieRead_CRstat'] = lstStat_CRead
WeibullDataCount['CressieRead_Pvalue'] = lstPVal_CRead
#WeibullDataCount = WeibullDataCount.drop(WeibullParams.index[lstDelete]) # This (hopefully) will do nothing, due to data preprocessing
WeibullDataCount = WeibullDataCount.sort_values(['RFG-Class'])
WeibullDataCount = WeibullDataCount.reset_index(drop=True)
WeibullDataCount.name = 'WeibullDataCount'
save_df(WeibullDataCount)

There will be 1591 rounds
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Round: 40
Round: 41
Round: 42
Round: 43
Round: 44
Round: 45
Round: 46
Round: 47
Round: 48
Round: 49
Round: 50
Round: 51
Round: 52
Round: 53
Round: 54
Round: 55
Round: 56
Round: 57
Round: 58
Round: 59
Round: 60
Round: 61
Round: 62
Round: 63
Round: 64
Round: 65
Round: 66
Round: 67
Round: 68
Round: 69
Round: 70
Round: 71
Round: 72
Round: 73
Round: 74
Round: 75
Round: 76
Round: 77
Round: 78
Round: 79
Round: 80
Round: 81
Round: 82
Round: 83
Round: 84
Round: 85
Round: 86
Round: 87
Round: 88
Round: 89
Round: 90
Round: 91
Round: 92
Round: 93
Round: 94
Round: 95
Round: 96
Round: 97
Round: 98
Rou

In [104]:
WeibullDataCount.head(15)

Unnamed: 0,RFG-Class,Datapoints,Shape,Location,Scale,Mean,Variance,St. Dev.,Pearson_CRstat,Pearson_Pvalue,LogLikelihood_CRstat,LogLikelihood_Pvalue,FreemanTukey_CRstat,FreemanTukey_Pvalue,ModLogLikelihood_CRstat,ModLogLikelihood_Pvalue,Neyman_CRstat,Neyman_Pvalue,CressieRead_CRstat,CressieRead_Pvalue
0,00_EMA,96,0.823802,5.0,94.863585,105.31962455506394,16548.356525509884,128.640416,13393.79,0.0,10778.495616,0.0,11223.222045,0.0,13132.945928,0.0,27454.855717,0.0,11959.97,0.0
1,00_SchedMaint,22193,0.990323,0.019958,3.80786,3.82374066868372,14.908528149449294,3.861156,8943086.0,0.0,117293.38512,0.0,62395.746893,0.0,54674.143633,0.0,76588.79387,0.0,1439260.0,0.0
2,00_UMA,535,0.82374,0.5,57.058191,63.35040095502978,5988.303925578744,77.384132,55722.97,0.0,41281.268885,0.0,42155.853695,0.0,50015.455017,0.0,144968.846707,0.0,48180.99,0.0
3,00_UnschedMaint,535,0.82374,0.5,57.058191,63.35040095502978,5988.303925578744,77.384132,55722.97,0.0,41281.268885,0.0,42155.853695,0.0,50015.455017,0.0,144968.846707,0.0,48180.99,0.0
4,02A01C02_SchedMaint,6,0.833811,2.0,8.376116,9.22537524804229,123.74466423244874,11.124058,39.61576,1.784934e-07,38.263364,3.340378e-07,39.924227,1.546841e-07,43.7481,2.605591e-08,62.753307,3.274599e-12,38.54838,2.927484e-07
5,02A01C02_UMA,6,0.283532,26.599976,79.464027,959.6360784820848,34562860.36544973,5879.018657,297.4784,3.4895590000000003e-62,288.776009,2.589707e-60,298.008098,2.68466e-62,318.08963,1.289576e-66,403.580355,5.0185409999999993e-85,290.764,9.682911e-61
6,02A01C02_UnschedMaint,6,0.283532,26.599976,79.464027,959.6360784820848,34562860.36544973,5879.018657,297.4784,3.4895590000000003e-62,288.776009,2.589707e-60,298.008098,2.68466e-62,318.08963,1.289576e-66,403.580355,5.0185409999999993e-85,290.764,9.682911e-61
7,02A01_UMA,8,0.481531,3.0,7.028999,15.106543515937483,1269.6394952905043,35.632001,196.5958,6.034135e-39,208.150262,2.152745e-41,240.445428,2.987983e-48,308.735114,8.239108e-63,750.221516,1.019029e-157,194.85,1.4129769999999998e-38
8,02A01_UnschedMaint,8,0.481531,3.0,7.028999,15.106543515937483,1269.6394952905043,35.632001,196.5958,6.034135e-39,208.150262,2.152745e-41,240.445428,2.987983e-48,308.735114,8.239108e-63,750.221516,1.019029e-157,194.85,1.4129769999999998e-38
9,02A02B04_UMA,9,0.321126,3.599609,35.183802,244.0620104527946,1312084.1413857865,1145.462414,1530.277,0.0,1305.736085,1.353113e-276,1409.923488,4.0476190000000004e-299,1852.696108,0.0,8776.537409,0.0,1402.893,1.340749e-297


In [50]:

Weibulldata['rfg-class'] = Weibulldata['RFG'].astype(str) + "_" + Weibulldata['EventClass'].astype(str)
Weibulldata[Weibulldata['rfg-class'] == '02A01C02_UMA']
#TN_Intervals[TN_Intervals['EI_SN'] == 1603086]

Unnamed: 0,Key13 / RFG / EventClass,KEY13,TailNumber,NewTailNumber,tn-class,EVENT_DATE_TIME,RELEVANT_BEG_AGE,TMMH,TMEN,TIMH,RFG,EventClass,lengthRFG,tn_rfg_class,TimeSinceLast,rfg-class
13451,AH-64E-1009005-02A01C02-UMA-20160328-A-00006,AH-64E-1009005-20160328-A-00006,1009005,1009005_7,1009005_7-UMA,2016-03-28,4682.5,2.0,1.0,0.1,02A01C02,UMA,8.0,1009005_7_02A01C02_UMA,166.100098,02A01C02_UMA
69781,AH-64E-1109020-02A01C02-UMA-20180905-A-00025,AH-64E-1109020-20180905-A-00025,1109020,1109020_42,1109020_42-UMA,2018-09-05,6539.5,0.2,1.0,0.0,02A01C02,UMA,8.0,1109020_42_02A01C02_UMA,32.200195,02A01C02_UMA
136546,AH-64E-1603084-02A01C02-UMA-20200526-A-00250,AH-64E-1603084-20200526-A-00250,1603084,1603084_77,1603084_77-UMA,2020-05-26,774.0,4.0,4.0,1.0,02A01C02,UMA,8.0,1603084_77_02A01C02_UMA,197.599976,02A01C02_UMA
144963,AH-64E-1603090-02A01C02-UMA-20200202-A-00130,AH-64E-1603090-20200202-A-00130,1603090,1603090_87,1603090_87-UMA,2020-02-02,883.200012,0.2,1.0,0.0,02A01C02,UMA,8.0,1603090_87_02A01C02_UMA,77.299988,02A01C02_UMA
162634,AH-64E-1603101-02A01C02-UMA-20200303-A-00200,AH-64E-1603101-20200303-A-00200,1603101,1603101_95,1603101_95-UMA,2020-03-03,784.299988,4.0,1.0,1.0,02A01C02,UMA,8.0,1603101_95_02A01C02_UMA,44.299988,02A01C02_UMA
172610,AH-64E-1603112-02A01C02-UMA-20200331-A-00310,AH-64E-1603112-20200331-A-00310,1603112,1603112_167,1603112_167-UMA,2020-03-31,484.799988,0.1,1.0,0.2,02A01C02,UMA,8.0,1603112_167_02A01C02_UMA,26.599976,02A01C02_UMA


In [55]:
lstDiff = Weibulldata[Weibulldata['rfg-class']=='02A01C02_UMA']['TimeSinceLast'].values
print(weibull_min.fit(lstDiff))
print(weibull_min.fit(lstDiff, loc=0, method="MLE"))

(0.28353150373941416, 26.599975585937496, 79.46402743869263)
(0.6104557313290632, 26.599975585937493, 59.49420299160256)
