# Importing libraries

In [1]:
import json
import pandas as pd
from scipy import sparse

import sys
sys.path.append('../')
from clustering_logs.log2matrix import create_binary_matrix, create_tf_matrix, create_tfidf_matrix

# Defining functions

In [2]:
def get_and_save_matrices(representations, selections, log):
    """Build and persist the matrices for every representation/selection pair.

    For each combination, ``log`` is restricted to the ``number`` column plus
    the columns of the selection, rows with a missing value in any of those
    columns are dropped, and the result is handed to
    ``create_and_save_matrices`` together with the output prefix
    ``matrices/<representation>_<selection>_``.

    Parameters
    ----------
    representations : iterable of str
        Representation names, forwarded unchanged to
        ``create_and_save_matrices``.
    selections : dict[str, list[str]]
        Maps a selection name to the list of ``log`` columns it uses.
    log : pandas.DataFrame
        Event log; must contain ``number`` and every selected column.

    Returns
    -------
    None
        The matrices are only written to disk.
    """
    for representation in representations:
        for selection, columns in selections.items():
            print('\n\n------\nIniciando processo para:', representation, selection)
            current = log[['number'] + columns].dropna()
            # create_and_save_matrices has no return value, so its result is
            # deliberately not bound to a name.
            create_and_save_matrices(current, representation, columns,
                                     'number', 'feature',
                                     'matrices/%s_%s_' % (representation, selection))

    
def create_and_save_matrices(df, representation, cols, index_col, feature_col, filename,
                             max_csv_columns=1000):
    """Create the binary, TF and TF-IDF matrices for ``df`` and write them to disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to represent; must contain ``index_col`` and the columns in ``cols``.
    representation : str
        ``'individual'`` builds the features with ``get_individual_val_repres``,
        ``'combined'`` with ``get_combined_val_repres``. Any other value leaves
        ``df`` untouched (presumably it must then already hold ``feature_col``).
    cols : list of str
        Attribute columns passed on to the representation builder.
    index_col : str
        Column identifying the rows of the resulting matrices.
    feature_col : str
        Column holding the feature strings after the representation step.
    filename : str
        Output path prefix; ``binary``, ``tf`` or ``tfidf`` plus an extension
        is appended to it.
    max_csv_columns : int, optional
        Matrices with more columns than this are written in sparse form
        instead of CSV. Defaults to 1000, the previously hard-coded value.

    Returns
    -------
    None

    Notes
    -----
    A matrix with more than ``max_csv_columns`` columns is stored as three
    files: ``<stem>.npz`` (values as a SciPy CSR matrix), ``<stem>-cols.json``
    (column labels) and ``<stem>-rows.json`` (row labels). Narrower matrices
    are stored as ``<stem>.csv``.
    """
    def save(matrix, stem):
        """Write ``matrix`` under the extension-less path ``stem``."""
        if matrix.shape[1] > max_csv_columns:
            print('*too big, saving sparse matrix...')
            # The values themselves used to be skipped here (only the labels
            # were written), so the wide matrices could not be reloaded.
            # NOTE(review): csr_matrix(matrix.values) needs the dense values
            # in memory once more while converting - confirm this fits for the
            # largest matrices.
            sparse.save_npz(stem + '.npz', sparse.csr_matrix(matrix.values))
            with open(stem + '-cols.json', 'w') as f:
                json.dump(list(matrix.columns), f)
            with open(stem + '-rows.json', 'w') as f:
                json.dump(list(matrix.index), f)
        else:
            matrix.to_csv(stem + '.csv')

    if representation == 'individual':
        df = get_individual_val_repres(df, cols, index_col)
    elif representation == 'combined':
        df = get_combined_val_repres(df, cols, index_col)

    print('\nRepresentacoes criadas, iniciando as matrizes...')

    matrix = create_binary_matrix(df, index_col, feature_col)
    print('Binaria criada (shape %s), salvando...' % str(matrix.shape))
    save(matrix, filename + 'binary')
    print('Salva!')

    matrix = create_tf_matrix(df, index_col, feature_col)
    print('TF criada (shape %s), salvando...' % str(matrix.shape))
    save(matrix, filename + 'tf')
    print('Salva!')

    # The TF-IDF matrix is derived from the TF matrix computed just above.
    matrix = create_tfidf_matrix(matrix)
    print('TDIDF criada (shape %s), salvando...' % str(matrix.shape))
    save(matrix, filename + 'tfidf')
    print('Salva!')
    

def get_combined_val_repres(df, cols, index_col):
    """Collapse every row of ``df`` into a single combined feature string.

    Each column other than ``index_col`` contributes ``<column>-<value>``
    (the value converted with ``str``), and the pieces of a row are joined
    with ``--`` in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; column labels must be strings.
    cols : list of str
        Not used: every column except ``index_col`` is combined. Kept so the
        signature matches ``get_individual_val_repres``.
    index_col : str
        Column that identifies the rows.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``index_col`` and ``'feature'``, one row per input row.
        An empty input yields an empty frame with those two columns.
    """
    indexed = df.set_index(index_col)
    if len(indexed) == 0:
        # DataFrame.apply cannot infer a result shape without rows.
        return pd.DataFrame(columns=[index_col, 'feature'])

    def combine(row):
        # Return a new string instead of writing into `row`: pandas does not
        # support functions that mutate the object passed by apply.
        return '--'.join(row.index + '-' + row.astype(str))

    features = indexed.apply(combine, axis=1)
    return features.rename('feature').reset_index()

def get_individual_val_repres(df, cols, index_col):
    """Turn every (row, column) cell of ``df`` into its own feature string.

    The frame is melted so that each value of each column in ``cols`` becomes
    one output row whose ``feature`` is ``<column>-<value>`` (both converted
    with ``str``; missing values are kept and stringified as well).

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    cols : list of str
        Columns whose values become features.
    index_col : str
        Column that identifies the rows; repeated once per column in ``cols``.

    Returns
    -------
    pandas.DataFrame
        ``index_col`` and ``'feature'``, with ``len(df) * len(cols)`` rows.
    """
    melted = pd.melt(df, id_vars=index_col, value_vars=cols,
                     var_name='variable', value_name='value')
    # Vectorized concatenation replaces a per-row Python join; the resulting
    # strings are the same and an empty frame is handled as well.
    feature = melted['variable'].astype(str) + '-' + melted['value'].astype(str)
    return melted.assign(feature=feature).drop(columns=['variable', 'value'])

# Loading data

In [3]:
# Read the pre-processed incident event log (with durations) into memory
# and preview its first rows.
log_path = "../datasets/incidentLog/incident_evt_log-processed1-withdurations.csv"
log = pd.read_csv(log_path)
log.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,notify,problem_id,rfc,vendor,caused_by,close_code,resolved_by,resolved_at,closed_at,duration
0,INC0000045,New,True,0,0,0,True,Caller 2222,Opener 6,2016-02-29 01:16:00,...,Do Not Notify,,,,,"Service is stable, ok to close ticket",Resolver 137,29/2/16 11:29,2016-03-05 12:00:00,470640.0
1,INC0000045,Resolved,True,0,0,2,True,Caller 2222,Opener 6,2016-02-29 01:16:00,...,Do Not Notify,,,,,"Service is stable, ok to close ticket",Resolver 137,29/2/16 11:29,2016-03-05 12:00:00,470640.0
2,INC0000045,Resolved,True,0,0,3,True,Caller 2222,Opener 6,2016-02-29 01:16:00,...,Do Not Notify,,,,,"Service is stable, ok to close ticket",Resolver 137,29/2/16 11:29,2016-03-05 12:00:00,470640.0
3,INC0000045,Closed,False,0,0,4,True,Caller 2222,Opener 6,2016-02-29 01:16:00,...,Do Not Notify,,,,,"Service is stable, ok to close ticket",Resolver 137,29/2/16 11:29,2016-03-05 12:00:00,470640.0
4,INC0000047,New,True,0,0,0,True,Caller 2222,Opener 152,2016-02-29 04:40:00,...,Do Not Notify,,,,,"Service is stable, ok to close ticket",Resolver 72,1/3/16 9:52,2016-03-06 10:00:00,537600.0


# Build the matrices for every representation/selection combination

In [14]:
# Representations to build; 'individual' is currently switched off.
representations = ['combined']  # ['individual', 'combined']

# Attribute selections to build. Only 'alg1' is active; the disabled ones are:
#   'specialist': ['incident_state', 'category', 'priority']
#   'alg2': ['incident_state', 'location']
selections = {
    'alg1': ['caller_id', 'assigned_to'],
}

# The matrices are written to disk; the call has no return value to keep.
get_and_save_matrices(representations, selections, log)



------
Iniciando processo para: combined alg1

Representacoes criadas, iniciando as matrizes...
Binaria criada (shape (24255, 20356)), salvando...
*too big, saving sparse matrix...
Salva!
TF criada (shape (24255, 20356)), salvando...
*too big, saving sparse matrix...
Salva!
TDIDF criada (shape (24255, 20356)), salvando...
*too big, saving sparse matrix...
Salva!


In [None]:
# Count the distinct row vectors (cases) produced by each attribute selection.
# NOTE(review): `matrices`, `matrices_path` and `read_matrix` are not defined
# in the visible part of this notebook - confirm where they come from before
# relying on this cell after a kernel restart.
sizes = {}
for matrix in matrices:
    log_vector = read_matrix(matrices_path + matrix, index_col='number')
    sizes[matrix] = len(log_vector.drop_duplicates())
sizes

## Creating the transitions representation
Mapping what changes from one entry line of an incident to the next entry line of that same incident.

In [55]:
# Long-format view of the log (one row per event and attribute), grouped by
# incident, attribute name and attribute value.
# NOTE(review): `log1` is not defined in the visible cells, and the 'index'
# column presumably comes from the `log1.reset_index()` cell further down -
# confirm the intended execution order.
log1_long = log1.melt(id_vars=['index', 'number', 'sys_updated_at', 'sys_updated_by'])
grouped = log1_long.groupby(['number', 'variable', 'value'])

In [None]:
# Long-format view of the log: one row per event and attribute.
# NOTE(review): `log1` is not defined in the visible cells and must already
# carry an 'index' column - confirm the intended execution order.
log1_long = log1.melt(id_vars=['index', 'number', 'sys_updated_at', 'sys_updated_by'])

# Same grouping object as before, kept in case later cells still use it.
grouped = log1_long.groupby(['number'])

# Keep the first occurrence of every (incident, attribute, value) triple, i.e.
# the event at which an attribute of an incident first takes a given value.
# The previous loop visited `log1['number']` row by row, so an incident with k
# events had its rows appended k times, and it grew the frame with pd.concat
# on every iteration (quadratic). A single drop_duplicates yields each row
# exactly once; rows stay in melted order instead of being grouped by incident.
log_transitions = log1_long.drop_duplicates(['number', 'variable', 'value'])

log_transitions.sample(50)

## Creating combined representation with all columns

In [None]:
# Materialise the row position as an 'index' column exactly once. Without the
# guard, re-running this cell adds a 'level_0' column that would leak into
# every combined feature string.
if 'index' not in log1.columns:
    log1 = log1.reset_index()

# Timestamp columns are left out of the combined representation.
log1_without_open_close = log1.drop(columns=['opened_at','sys_created_at','sys_updated_at','resolved_at','closed_at'])

# get_combined_val_repres combines every column except the index column and
# does not read its `cols` argument; pass the columns actually present so the
# call stays correct if that ever changes.
df = get_combined_val_repres(log1_without_open_close,
                             list(log1_without_open_close.columns.drop('number')),
                             'number')
df.to_csv('logs/combined_all.csv')
# len(df['number'].unique()) #spec=716 cases, alg1=24117, alg2=1081, all(-ts)=141712

In [None]:
# Binary matrix for the all-columns combined representation.
# `create_binary_matrix` is imported by name in the import cell; the module
# name `log2matrix` itself is never bound, so the qualified call raised a
# NameError. The feature column produced by get_combined_val_repres is
# 'feature' (the old key 0 does not exist in `df`).
# NOTE(review): confirm create_binary_matrix expects the column name here, as
# it does in create_and_save_matrices.
matrix_bin = create_binary_matrix(df, 'number', 'feature')
matrix_bin.to_csv('matrices/combined_all.csv')
print(matrix_bin.reset_index())
print(matrix_bin.shape)

In [None]:
# Shape of the binary matrix once duplicated rows are removed, i.e. how many
# distinct row vectors it contains.
matrix_bin.drop_duplicates().shape

### Feature selection based on the number of traces in which each feature occurs
Excluding features that are too common or too rare.

In [None]:
# Column-wise totals of the binary matrix: one value per feature.
# NOTE(review): `log1_binary` is not defined in the visible cells - presumably
# a binary trace-by-feature matrix; confirm where it is built.
act_traces_qtt = log1_binary.sum(axis=0)
act_traces_qtt.shape

In [None]:
# Cut off the activities that appear in fewer than 1% of the traces
# (the code keeps totals strictly above 100 - TODO confirm that 100 is 1%).
# NOTE(review): `plt` is not imported in the visible import cell.
frequent_enough = act_traces_qtt > 100
boxplot = plt.boxplot(act_traces_qtt[frequent_enough])
plt.show()

In [None]:
# Second y-coordinate of each whisker line of the box plot (presumably the
# whisker tips, used to choose the cut-off values below).
[whisker.get_ydata()[1] for whisker in boxplot['whiskers']]

In [None]:
# Keep only features whose total lies strictly between 100 and 1500, dropping
# the ones that are too rare or too common.
act_traces_qtt = act_traces_qtt[(act_traces_qtt > 100) & (act_traces_qtt < 1500)]

# Build the lookup once: the previous comprehension rebuilt
# list(act_traces_qtt.index) and scanned it linearly for every column.
kept_features = set(act_traces_qtt.index)
useless_features = [col for col in log1_binary.columns if col not in kept_features]

log1_binary_filtered = log1_binary.drop(columns=useless_features)
list(log1_binary_filtered.columns)