# KMeans Prática - Segurança de Rede


Um IP malicioso é um ‘bot’ que fica tentando encontrar vulnerabilidades dentro do seu provedor de internet. Esses bots podem invadir roteadores dos seus clientes e distribuir a internet deles, podem invadir câmeras de segurança, usar a sua rede para minerar criptomoedas como o bitcoin e etc. Enfim, esses bots são utilizados para a prática de crimes cybernéticos.

#### IP malicioso: 70.32.104.50
#### Que outros IPs são maliciosos?

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics import silhouette_samples, silhouette_score

### Pré-processando

In [27]:
# caminho dos dados
data_path = "./data"

In [28]:
# 
prevectors = {}

for path in os.listdir(data_path)[:200]:
    full_path = os.path.join(data_path, path)
    with open(full_path) as f:
        for line in f:
            ip = line.split()[0]
            request_type = line.split('"')[1].split()[0]
            response_code = line.split('"')[2].split()[0]
            
            if ip not in prevectors:
                prevectors[ip] = {"requests":{}, "responses": {}}
                
            if request_type not in prevectors[ip]["requests"]:
                prevectors[ip]["requests"][request_type] = 0
            
            prevectors[ip]["requests"][request_type] += 1
        
            if response_code not in prevectors[ip]["responses"]:
                prevectors[ip]["responses"][response_code] = 0
            
            prevectors[ip]["responses"][response_code] += 1

In [29]:
request_types = ["GET", "POST", "HEAD", "OPTIONS", "PUT", "TRACE"]
response_codes = ["200", "404", "403", "304", "301", "206", "418", "416", "403", "405", "503", "500"]

In [30]:
ips = []
data = np.zeros((len(prevectors.keys()), len(request_types) + len(response_codes)))
for index, (keys,values) in enumerate(prevectors.items()):
    ips.append(keys)
    for ri,r in enumerate(request_types):
        if r in values["requests"]:
            data[index, ri] = values["requests"][r]
    
    for ri,r in enumerate(response_codes):
        if r in values["responses"]:
            data[index, len(request_types)+ri] = values["responses"][r]

In [31]:
df = pd.DataFrame(data=data, columns = request_types + response_codes)
df.insert(0, "IPs", ips)
df.head()

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
0,173.173.123.43,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,46.109.67.155,64.0,0.0,0.0,0.0,0.0,0.0,31.0,1.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0
2,192.227.226.2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,104.130.161.38,87.0,0.0,0.0,0.0,0.0,0.0,29.0,14.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.130.161.37,110.0,0.0,0.0,0.0,0.0,0.0,32.0,23.0,0.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
df.tail()

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
5591,157.55.39.50,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5592,88.117.11.245,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5593,117.177.246.144,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5594,70.162.173.18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5595,119.81.190.175,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
data = normalize(data)

In [34]:
df_normalized = pd.DataFrame(data=data, columns = request_types + response_codes)
df_normalized.insert(0, "IPs", ips)
df_normalized.head()

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
0,173.173.123.43,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,46.109.67.155,0.820648,0.0,0.0,0.0,0.0,0.0,0.397501,0.012823,0.0,0.0,0.0,0.410324,0.0,0.0,0.0,0.0,0.0,0.0
2,192.227.226.2,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,104.130.161.38,0.84734,0.0,0.0,0.0,0.0,0.0,0.282447,0.136354,0.0,0.0,0.42854,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.130.161.37,0.851767,0.0,0.0,0.0,0.0,0.0,0.247787,0.178097,0.0,0.0,0.425883,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
df_normalized.tail()

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
5591,157.55.39.50,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5592,88.117.11.245,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5593,117.177.246.144,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5594,70.162.173.18,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5595,119.81.190.175,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Clustering

In [14]:
model = KMeans(n_clusters = 2)
clusters = model.fit_predict(data)

In [15]:
Counter(clusters.tolist())

Counter({0: 4412, 1: 1184})

In [16]:
target_label = clusters[ips.index("70.32.104.50")]
print(f"Malicious Label: {target_label}")

Malicious Label: 1


O código acima mostra em qual Label está o IP malicioso (70.32.104.50)

In [17]:
silhouette_scores = silhouette_samples(data, clusters)

In [19]:
for label in sorted(set(clusters.tolist())):
    score = np.array([])
    for i,cluster in enumerate(clusters):
        if cluster == label:
            score = np.append(score, silhouette_scores[i])
    print("------------------------------------------")
    print(f"Label: {label}")
    print(f"Number of ips in label: {len(score)}") 
    print(f"Average Silhouette Score: {score.mean()}")

------------------------------------------
Label: 0
Number of ips in label: 4412
Average Silhouette Score: 0.7481332908362535
------------------------------------------
Label: 1
Number of ips in label: 1184
Average Silhouette Score: 0.11636649316169519


Assim, deve ser analisados 1184 IP maliciosos. Um número muito grande

In [38]:
best_score = 0
for i in range(2,20):
    model = KMeans(n_clusters = i)
    clusters = model.fit_predict(data)
    cur_score = silhouette_score(data, model.labels_)
    if cur_score > best_score:
        best_score = cur_score
        best_n = i
        
print(f"Best Silhouette Score: {best_score}")
print(f"Best K: {best_n}")

Best Silhouette Score: 0.7954413866271935
Best K: 19


Número K melhores == 19 (definido analisando os comportamentos de todos os IPs)

In [39]:
model = KMeans(n_clusters = best_n)
clusters = model.fit_predict(data)
target_label = clusters[ips.index("70.32.104.50")]
print(f"Malicious Label: {target_label}")

Malicious Label: 5


O código acima mostra em qual Label está o IP malicioso.

In [40]:
silhouette_scores = silhouette_samples(data, clusters)

In [41]:
for label in sorted(set(clusters.tolist())):
    score = np.array([])
    for i,cluster in enumerate(clusters):
        if cluster == label:
            score = np.append(score, silhouette_scores[i])
    print("------------------------------------------")
    print(f"Label: {label}")
    print(f"Number of ips in label: {len(score)}") 
    print(f"Average Silhouette Score: {score.mean()}")

------------------------------------------
Label: 0
Number of ips in label: 772
Average Silhouette Score: 0.5710100519881945
------------------------------------------
Label: 1
Number of ips in label: 283
Average Silhouette Score: 0.7406274744288756
------------------------------------------
Label: 2
Number of ips in label: 3019
Average Silhouette Score: 0.923054022914855
------------------------------------------
Label: 3
Number of ips in label: 183
Average Silhouette Score: 0.9381612508172097
------------------------------------------
Label: 4
Number of ips in label: 8
Average Silhouette Score: 0.2791159365877305
------------------------------------------
Label: 5
Number of ips in label: 85
Average Silhouette Score: 0.7489619153479904
------------------------------------------
Label: 6
Number of ips in label: 78
Average Silhouette Score: 0.4642728260408099
------------------------------------------
Label: 7
Number of ips in label: 78
Average Silhouette Score: 0.9090088292405503
-----

Agora, teremos apenas 85 IP a serem analisados, um número bem menor quando comparado ao número incial

In [42]:
malicious_ip = []
for i,ip in enumerate(ips):
    if(clusters[i] == target_label):
        malicious_ip.append(ip)

In [43]:
malicious_df = pd.DataFrame(data={}, columns = ['IPs']+request_types + response_codes)
for m_ip in malicious_ip:
    malicious_df = malicious_df.append(df.loc[df['IPs'] == m_ip])
malicious_df

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
79,49.150.92.36,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,108.12.85.113,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
220,78.185.83.109,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227,81.217.93.152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,210.121.164.67,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4259,180.179.212.214,66.0,66.0,0.0,0.0,0.0,0.0,0.0,132.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4260,181.65.186.34,87.0,81.0,0.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4483,180.179.212.237,116.0,116.0,0.0,0.0,0.0,0.0,0.0,232.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4484,212.48.75.159,32.0,32.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
