# **Preprocesado de datos**

## Importación de librerías

In [None]:
!pip install pandas
!pip install numpy
import pandas as pd
import numpy as np

## Importación del dataset

In [None]:
# Load the capture CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv("captura_tcp_curso5g.csv")

## Limpieza de datos

### Eliminación de flujos TCP internos

In [None]:
# Flag flows whose server IP lies in the 10.100.200.x range (the "internal" flows
# this section removes). na=False makes rows with a missing server IP evaluate to
# False instead of NaN, so the boolean negation below cannot fail on them.
filtro = df['s_ip:15'].str.startswith('10.100.200.', na=False)

# Keep only the remaining flows. The explicit copy makes the result an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[~filtro].copy()

### Etiquetado de datos

In [None]:
# A flow is labelled 1 when the client is 10.100.200.6 and the server port is 5555;
# every other flow is labelled 0.
condition = (df['#15#c_ip:1'] == '10.100.200.6') & (df['s_port:16'] == 5555)

# Build the label column on a new frame rather than writing into `df` in place:
# `df` may be a boolean-filtered slice, and in-place assignment on such a slice
# triggers pandas' SettingWithCopyWarning.
df = df.assign(label=condition.astype('int64'))

df

### Selección de métricas relevantes

In [None]:
# Alternative feature sets (disabled):
#metricas = ['c_pkts_all:3', 'c_ack_cnt:5', 'c_bytes_uniq:7', 'c_pkts_data:8', 'c_bytes_all:9', 's_pkts_all:17', 's_ack_cnt:19', 's_bytes_uniq:21', 's_pkts_data:22', 's_bytes_all:23', 'label']
#metricas = ['s_bytes_uniq:21', 's_pkts_data:22', 's_bytes_all:23', 'label']

# Active selection. 'label' must stay last: later cells split features from the
# target by column position.
metricas = [
    'c_pkts_all:3',
    'c_ack_cnt:5',
    's_pkts_all:17',
    's_ack_cnt:19',
    'label',
]

# Restrict the frame to the selected columns, in the order listed above.
df = df.loc[:, metricas]

df.dtypes

# Separación de variables independientes y variable dependiente

In [None]:
# Every column except the last one is a feature; the last column is the target.
n_features = df.shape[1] - 1
X = df.iloc[:, :n_features].to_numpy()
y = df.iloc[:, n_features].to_numpy()

# Separación del dataset en Training Set y Test Set

In [None]:
!pip install scikit-learn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0)

In [None]:
# Print the training feature matrix.
print(X_train)

In [None]:
# Print the training labels.
print(y_train)

In [None]:
# Print the test feature matrix.
print(X_test)

In [None]:
# Print the test labels.
print(y_test)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training data only,
# then apply that same transformation to both sets.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Print the training feature matrix again; in notebook order this follows the StandardScaler cell.
print(X_train)

In [None]:
# Print the test feature matrix again; in notebook order this follows the StandardScaler cell.
print(X_test)

# Reducción dimensional

In [None]:
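# Optional PCA step (currently disabled). Uncomment the lines below to project the
# features onto 2 principal components before the model is trained.
#from sklearn.decomposition import PCA
#pca = PCA(n_components = 2)
#X_train = pca.fit_transform(X_train)
#X_test = pca.transform(X_test)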

# Entrenamiento del modelo con el Training Set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 Gini trees with no depth limit; the seed is fixed for repeatability.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    random_state=0,
)

# Alternative model (disabled):
#from sklearn.tree import DecisionTreeClassifier
#classifier = DecisionTreeClassifier()

classifier.fit(X_train, y_train)

# Predicción de resultados en el Test Set

In [None]:
# Predict a label for every row of the test feature matrix.
y_pred = classifier.predict(X_test)
# Disabled: prints predictions and true labels side by side, one row per sample.
#print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

# Análisis de resultados

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Each row: (caption, metric function, extra keyword arguments).
# Specificity is computed as the recall of class 0.
summary_rows = (
    ('Exactitud: ', accuracy_score, {}),
    ('Sensibilidad (TP Rate o Recall): ', recall_score, {}),
    ('Especificidad (TN Rate): ', recall_score, {'pos_label': 0}),
    ('Puntaje F1: ', f1_score, {}),
)
for caption, metric, extra_kwargs in summary_rows:
    print(caption)
    print(metric(y_test, y_pred, **extra_kwargs))

In [None]:
from sklearn.metrics import classification_report

# Text report built from the true and predicted test labels.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Importance scores exposed by the fitted forest, one per input feature.
feature_importance = classifier.feature_importances_

# Pair each score with its column name (every column of `df` except the last one)
# and rank from most to least important.
importance_df = (
    pd.DataFrame({'Feature': df.columns[:-1], 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

# Visualización de resultados

In [None]:
!pip install matplotlib
from sklearn import tree
import matplotlib.pyplot as plt

first_tree = classifier.estimators_[1]

plt.figure(figsize=(8, 8))
tree.plot_tree(first_tree, feature_names=df.columns.tolist(), class_names=['Normal', 'Cryptomining'], filled=True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Decision regions of the fitted classifier over the first two feature axes,
# with the training samples drawn on top.
X_set, y_set = X_train, y_train
region_cmap = ListedColormap(('red', 'blue'))
GRID_POINTS = 300  # points per axis; bounds the mesh size whatever the feature range is

X1, X2 = np.meshgrid(
    np.linspace(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, GRID_POINTS),
    np.linspace(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, GRID_POINTS),
)

# The classifier must be given as many columns as it was fitted on. Columns 0 and 1
# sweep the mesh; any further column is held at 0 (the training mean when the data
# was standardized as in the Feature Scaling section). The regions are therefore a
# 2-D slice of the model, so samples whose other features differ from 0 may fall on
# a region of the other colour.
grid = np.zeros((X1.size, X_set.shape[1]))
grid[:, 0] = X1.ravel()
grid[:, 1] = X2.ravel()
regions = classifier.predict(grid).reshape(X1.shape)

plt.contourf(X1, X2, regions, alpha=0.75, cmap=region_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` takes a single RGBA tuple; passing it through `c=` is ambiguous
    # with a sequence of per-point values.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=region_cmap(i), label=j)

# Name the axes after the plotted columns. If the column count no longer matches
# the feature columns of `df`, the data was transformed (e.g. PCA enabled), so fall
# back to the principal-component labels.
feature_names = df.columns[:-1]
if X_set.shape[1] == len(feature_names):
    x_label, y_label = (f'{name} (standardized)' for name in feature_names[:2])
else:
    x_label, y_label = 'PC1', 'PC2'

plt.title('Random Forest (Training set)')
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Decision regions of the fitted classifier over the first two feature axes,
# with the test samples drawn on top.
X_set, y_set = X_test, y_test
region_cmap = ListedColormap(('red', 'blue'))
GRID_POINTS = 300  # points per axis; bounds the mesh size whatever the feature range is

X1, X2 = np.meshgrid(
    np.linspace(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, GRID_POINTS),
    np.linspace(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, GRID_POINTS),
)

# The classifier must be given as many columns as it was fitted on. Columns 0 and 1
# sweep the mesh; any further column is held at 0 (the training mean when the data
# was standardized as in the Feature Scaling section). The regions are therefore a
# 2-D slice of the model, so samples whose other features differ from 0 may fall on
# a region of the other colour.
grid = np.zeros((X1.size, X_set.shape[1]))
grid[:, 0] = X1.ravel()
grid[:, 1] = X2.ravel()
regions = classifier.predict(grid).reshape(X1.shape)

plt.contourf(X1, X2, regions, alpha=0.75, cmap=region_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # `color=` takes a single RGBA tuple; passing it through `c=` is ambiguous
    # with a sequence of per-point values.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=region_cmap(i), label=j)

# Name the axes after the plotted columns. If the column count no longer matches
# the feature columns of `df`, the data was transformed (e.g. PCA enabled), so fall
# back to the principal-component labels.
feature_names = df.columns[:-1]
if X_set.shape[1] == len(feature_names):
    x_label, y_label = (f'{name} (standardized)' for name in feature_names[:2])
else:
    x_label, y_label = 'PC1', 'PC2'

plt.title('Random Forest (Test set)')
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.legend()
plt.show()