# NSL-KDD - Pré-Processamento

## Importando Bibliotecas

In [73]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Carregando os Dados

In [74]:
# Read the four untreated NSL-KDD CSV files, in this order: the complete
# dataset, then the separate train, test1 and test21 files.
nslkdd, train, test1, test21 = (
    pd.read_csv(caminho_csv)
    for caminho_csv in (
        '../datasets-sem-tratamento/nsl_kdd.csv',
        '../datasets-sem-tratamento/nsl_kdd_train.csv',
        '../datasets-sem-tratamento/nsl_kdd_test1.csv',
        '../datasets-sem-tratamento/nsl_kdd_test21.csv',
    )
)

### Definir se vai ser usado o conjunto completo ou a divisão já feita entre treino e teste

In [75]:
# Choose which frames feed the rest of the notebook. Later cells branch on
# len(datasets): 1 selects the "exp1" outputs, 3 selects the "exp2" outputs.
# Alternative (complete dataset as a single frame) - uncomment to use it:
#datasets = [nslkdd]
# Active selection: the separately loaded train / test1 / test21 frames.
datasets = [train, test1, test21]

In [76]:
# Stack the selected frames into a single DataFrame. ignore_index=True
# discards the per-file row labels and renumbers the rows from 0.
df = pd.concat(datasets, ignore_index=True)

In [77]:
# Notebook display of the concatenated DataFrame.
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160362,0,udp,domain_u,SF,43,43,0,0,0,0,...,1.00,0.00,0.01,0.00,0.00,0.00,0.00,0.00,normal,18
160363,0,tcp,http,SF,336,285,0,0,0,0,...,0.92,0.02,0.00,0.00,0.00,0.00,0.05,0.00,normal,18
160364,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.37,0.03,0.01,0.02,0.05,0.08,0.85,0.58,mscan,13
160365,0,tcp,sunrpc,REJ,0,0,0,0,0,0,...,0.19,0.03,0.01,0.04,0.00,0.00,0.88,1.00,mscan,15


## Pré-Processamento

### Mapeamento dos ataques em grupos

In [78]:
def cria_grupo(df):
    """Add a ``grupo`` column mapping each value of ``label`` to a category.

    Every entry of the ``label`` column is looked up in a fixed table with
    five categories (``dos``, ``normal``, ``probe``, ``r2l``, ``u2r``).
    Labels that are not in the table get ``NaN`` in ``grupo``; when that
    happens a ``UserWarning`` listing them is emitted so that unmapped
    classes do not silently flow into the rest of the pipeline.

    Args:
        df: DataFrame with a ``label`` column. It is modified in place.

    Returns:
        The same DataFrame object, now with the ``grupo`` column.

    Raises:
        KeyError: If ``df`` has no ``label`` column.
    """
    import warnings

    # Category -> labels that belong to it.
    regras = {
        'dos': ['apache2', 'back', 'land', 'mailbomb', 'neptune', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm'],
        'normal': ['normal'],
        'probe': ['mscan', 'satan', 'portsweep', 'ipsweep', 'nmap','saint'],
        'r2l': ['guess_passwd', 'warezmaster', 'snmpguess', 'snmpgetattack', 'httptunnel', 'multihop', 'named', 'sendmail', 'xlock', 'xsnoop', 'ftp_write', 'worm', 'phf', 'imap', 'spy', 'warezclient'],
        'u2r': ['buffer_overflow', 'ps', 'rootkit', 'xterm', 'loadmodule', 'perl', 'sqlattack']
    }

    # Invert the table into a flat label -> category lookup.
    rotulo_para_grupo = {
        rotulo: grupo
        for grupo, rotulos in regras.items()
        for rotulo in rotulos
    }

    # Series.map leaves NaN wherever the label is missing from the lookup.
    df['grupo'] = df['label'].map(rotulo_para_grupo)

    # Report labels that were present but could not be mapped (rows whose
    # label itself is missing are not reported).
    sem_grupo = df.loc[df['grupo'].isna(), 'label'].dropna().unique()
    if len(sem_grupo) > 0:
        warnings.warn(
            "Labels without a group (grupo set to NaN): "
            + ", ".join(sorted(map(str, sem_grupo))),
            stacklevel=2,
        )

    return df

In [79]:
# Add the "grupo" category column, then remove the original "label" column
# from the same frame.
df = cria_grupo(df)
df.drop(columns=['label'], inplace=True)

### Tratamento de colunas categóricas

In [80]:
# Categorical columns to be one-hot encoded.
categorical_columns = ['protocol_type', 'service', 'flag']

# Dense one-hot encoder. scikit-learn 1.2 renamed the `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so try the new keyword first
# and fall back to the old one on releases that do not accept it.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit the encoder on the categorical columns and encode them.
encoded_data = encoder.fit_transform(df[categorical_columns])

# Wrap the encoded array in a DataFrame. Reusing df's index keeps the rows
# aligned in the column-wise concat below whatever df's index looks like.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Replace the categorical columns with their one-hot encoded counterparts.
nslkdd_encoded = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)



### Remoção de atributos irrelevantes

In [81]:
# Remove, one at a time and in place, the two attributes this notebook
# treats as irrelevant.
for coluna_irrelevante in ('num_outbound_cmds', 'difficulty'):
    nslkdd_encoded.drop(columns=[coluna_irrelevante], inplace=True)

### Posicionando a coluna referente às classes como a última coluna do conjunto de dados

In [82]:
# Take the class column out of the frame...
coluna_removida = nslkdd_encoded.pop('grupo')

# ...and put it back at the position after the current last column.
nslkdd_encoded.insert(len(nslkdd_encoded.columns), 'grupo', coluna_removida)

In [83]:
# Notebook display of the encoded DataFrame.
nslkdd_encoded

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,grupo
0,0,491,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
1,0,146,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,dos
3,0,232,8153,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160362,0,43,43,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
160363,0,336,285,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,normal
160364,1,0,15,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,probe
160365,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,probe


### Gerando um arquivo para o dataset pré-processado

In [84]:
# Persist the pre-processed dataset. The frame written is `nslkdd_encoded`,
# the result of the grouping, one-hot encoding and column-removal cells,
# rather than `df`, which still holds the raw categorical columns and the
# attributes dropped as irrelevant.
# NOTE(review): confirm that no downstream reader of these files expects the
# un-encoded columns.
# The file name depends on how many source frames were selected.
if len(datasets) == 1:
    nslkdd_encoded.to_csv("../datasets-tratados/nsl_kdd_exp1_processed.csv", index=False)
elif len(datasets) == 3:
    nslkdd_encoded.to_csv("../datasets-tratados/nsl_kdd_exp2_processed.csv", index=False)
else:
    print('Erro')

### Normalização

In [85]:
# Split the frame into feature columns and class labels.
X = nslkdd_encoded.drop('grupo', axis=1)
y = nslkdd_encoded['grupo']

# Min-max scaler for the feature columns.
# NOTE(review): the scaler is fitted on every row of X. When `datasets` holds
# the separate train/test frames, the test rows therefore take part in the
# fit - confirm this is intended before using the test files for evaluation.
scaler = MinMaxScaler()

# Fit the scaler and transform the features in a single step.
dados_normalizados = scaler.fit_transform(X)

# Rebuild a DataFrame, keeping X's column names and row index.
df_normalizado = pd.DataFrame(dados_normalizados, columns=X.columns, index=X.index)

# Re-attach the labels taken from the same frame as the features, so both
# share one index and stay row-aligned.
df_normalizado['grupo'] = y

### Gerando um arquivo para o dataset normalizado

In [86]:
# Write the normalized dataset. The destination depends on how many source
# frames were selected: 1 -> exp1 file, 3 -> exp2 file, anything else only
# prints an error message.
qtd_datasets = len(datasets)
if qtd_datasets not in (1, 3):
    print('Erro')
else:
    destino_normalizado = (
        "../datasets-tratados/nsl_kdd_exp1_normalized.csv"
        if qtd_datasets == 1
        else "../datasets-tratados/nsl_kdd_exp2_normalized.csv"
    )
    df_normalizado.to_csv(destino_normalizado, index=False)

## Treino e Teste

In [87]:
# Build the train/test partitions from the normalized frame.
if len(datasets) == 3:
    # The frames were stacked in the order of `datasets`, so each partition
    # is recovered by its positional row range.
    fim_treino = len(datasets[0])
    fim_teste1 = fim_treino + len(datasets[1])
    train_encoded = df_normalizado.iloc[:fim_treino]
    test1_encoded = df_normalizado.iloc[fim_treino:fim_teste1]
    test2_encoded = df_normalizado.iloc[fim_teste1:]
elif len(datasets) == 1:
    # Single frame: random split holding out 30% of the rows, fixed seed.
    train, test = train_test_split(df_normalizado, test_size=0.3, random_state=42)
else:
    print('Erro')

## Criando os arquivos .csv para os datasets

In [88]:
# Pair each partition with its destination file, according to how many
# source frames were selected, then write them all without the index column.
if len(datasets) == 1:
    saidas = [
        (train, "../datasets-tratados/nsl_kdd_exp1_train.csv"),
        (test, "../datasets-tratados/nsl_kdd_exp1_test.csv"),
    ]
elif len(datasets) == 3:
    saidas = [
        (train_encoded, "../datasets-tratados/nsl_kdd_exp2_train.csv"),
        (test1_encoded, "../datasets-tratados/nsl_kdd_exp2_test1.csv"),
        (test2_encoded, "../datasets-tratados/nsl_kdd_exp2_test21.csv"),
    ]
else:
    # Unsupported selection: nothing is written.
    saidas = []
    print('Erro')

for particao, caminho_saida in saidas:
    particao.to_csv(caminho_saida, index=False)