# KMeans Practice - Netwotk Security

#### Malicious ip: 70.32.104.50
#### What other IPs are malicious?

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics import silhouette_samples, silhouette_score

### Preprocessing

In [2]:
data_path = "./data"

In [3]:
prevectors = {}

for path in os.listdir(data_path)[:200]:
    full_path = os.path.join(data_path, path)
    with open(full_path) as f:
        for line in f:
            ip = line.split()[0]
            request_type = line.split('"')[1].split()[0]
            response_code = line.split('"')[2].split()[0]
            
            if ip not in prevectors:
                prevectors[ip] = {"requests":{}, "responses": {}}
                
            if request_type not in prevectors[ip]["requests"]:
                prevectors[ip]["requests"][request_type] = 0
            
            prevectors[ip]["requests"][request_type] += 1
        
            if response_code not in prevectors[ip]["responses"]:
                prevectors[ip]["responses"][response_code] = 0
            
            prevectors[ip]["responses"][response_code] += 1

In [4]:
request_types = ["GET", "POST", "HEAD", "OPTIONS", "PUT", "TRACE"]
response_codes = ["200", "404", "403", "304", "301", "206", "418", "416", "403", "405", "503", "500"]

In [5]:
ips = []
data = np.zeros((len(prevectors.keys()), len(request_types) + len(response_codes)))
for index, (keys,values) in enumerate(prevectors.items()):
    ips.append(keys)
    for ri,r in enumerate(request_types):
        if r in values["requests"]:
            data[index, ri] = values["requests"][r]
    
    for ri,r in enumerate(response_codes):
        if r in values["responses"]:
            data[index, len(request_types)+ri] = values["responses"][r]

In [6]:
df = pd.DataFrame(data=data, columns = request_types + response_codes)
df.insert(0, "IPs", ips)
df.head()

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
0,173.173.123.43,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,46.109.67.155,64.0,0.0,0.0,0.0,0.0,0.0,31.0,1.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0
2,192.227.226.2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,104.130.161.38,87.0,0.0,0.0,0.0,0.0,0.0,29.0,14.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.130.161.37,110.0,0.0,0.0,0.0,0.0,0.0,32.0,23.0,0.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
data = normalize(data)

In [8]:
df_normalized = pd.DataFrame(data=data, columns = request_types + response_codes)
df_normalized.insert(0, "IPs", ips)
df_normalized.head()

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
0,173.173.123.43,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,46.109.67.155,0.820648,0.0,0.0,0.0,0.0,0.0,0.397501,0.012823,0.0,0.0,0.0,0.410324,0.0,0.0,0.0,0.0,0.0,0.0
2,192.227.226.2,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,104.130.161.38,0.84734,0.0,0.0,0.0,0.0,0.0,0.282447,0.136354,0.0,0.0,0.42854,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.130.161.37,0.851767,0.0,0.0,0.0,0.0,0.0,0.247787,0.178097,0.0,0.0,0.425883,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Clustering

In [9]:
model = KMeans(n_clusters = 2)
clusters = model.fit_predict(data)

In [10]:
Counter(clusters.tolist())

Counter({0: 4412, 1: 1184})

In [11]:
target_label = clusters[ips.index("70.32.104.50")]
print(f"Malicious Label: {target_label}")

Malicious Label: 1


In [12]:
silhouette_scores = silhouette_samples(data, clusters)

In [13]:
for label in sorted(set(clusters.tolist())):
    score = np.array([])
    for i,cluster in enumerate(clusters):
        if cluster == label:
            score = np.append(score, silhouette_scores[i])
    print("------------------------------------------")
    print(f"Label: {label}")
    print(f"Number of ips in label: {len(score)}") 
    print(f"Average Silhouette Score: {score.mean()}")

------------------------------------------
Label: 0
Number of ips in label: 4412
Average Silhouette Score: 0.7481332909159913
------------------------------------------
Label: 1
Number of ips in label: 1184
Average Silhouette Score: 0.11636649315632999


In [14]:
best_score = 0
for i in range(2,20):
    model = KMeans(n_clusters = i)
    clusters = model.fit_predict(data)
    cur_score = silhouette_score(data, model.labels_)
    if cur_score > best_score:
        best_score = cur_score
        best_n = i
        
print(f"Best Silhouette Score: {best_score}")
print(f"Best K: {best_n}")

Best Silhouette Score: 0.7963403556784427
Best K: 19


In [15]:
model = KMeans(n_clusters = best_n)
clusters = model.fit_predict(data)
target_label = clusters[ips.index("70.32.104.50")]
print(f"Malicious Label: {target_label}")

Malicious Label: 1


In [16]:
silhouette_scores = silhouette_samples(data, clusters)

In [17]:
for label in sorted(set(clusters.tolist())):
    score = np.array([])
    for i,cluster in enumerate(clusters):
        if cluster == label:
            score = np.append(score, silhouette_scores[i])
    print("------------------------------------------")
    print(f"Label: {label}")
    print(f"Number of ips in label: {len(score)}") 
    print(f"Average Silhouette Score: {score.mean()}")

------------------------------------------
Label: 0
Number of ips in label: 3019
Average Silhouette Score: 0.9231747075452551
------------------------------------------
Label: 1
Number of ips in label: 73
Average Silhouette Score: 0.9218246643524736
------------------------------------------
Label: 2
Number of ips in label: 309
Average Silhouette Score: 0.7094259012513328
------------------------------------------
Label: 3
Number of ips in label: 182
Average Silhouette Score: 0.9548623760556885
------------------------------------------
Label: 4
Number of ips in label: 315
Average Silhouette Score: 0.5695189967080119
------------------------------------------
Label: 5
Number of ips in label: 42
Average Silhouette Score: 0.7467355209966107
------------------------------------------
Label: 6
Number of ips in label: 77
Average Silhouette Score: 0.9440439937040285
------------------------------------------
Label: 7
Number of ips in label: 313
Average Silhouette Score: 0.6843301703304413
--

In [18]:
malicious_ip = []
for i,ip in enumerate(ips):
    if(clusters[i] == target_label):
        malicious_ip.append(ip)

In [19]:
malicious_df = pd.DataFrame(data={}, columns = ['IPs']+request_types + response_codes)
for m_ip in malicious_ip:
    malicious_df = malicious_df.append(df.loc[df['IPs'] == m_ip])
malicious_df

Unnamed: 0,IPs,GET,POST,HEAD,OPTIONS,PUT,TRACE,200,404,403,304,301,206,418,416,403.1,405,503,500
239,148.211.121.62,4.0,4.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,176.31.252.38,1852.0,1864.0,0.0,0.0,0.0,0.0,0.0,3716.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245,50.28.14.54,11.0,11.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247,1.224.163.80,920.0,921.0,0.0,0.0,0.0,0.0,0.0,1839.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
249,87.106.187.164,8.0,8.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4258,37.9.169.26,60.0,60.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4259,180.179.212.214,66.0,66.0,0.0,0.0,0.0,0.0,0.0,132.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4260,181.65.186.34,87.0,81.0,0.0,0.0,0.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4483,180.179.212.237,116.0,116.0,0.0,0.0,0.0,0.0,0.0,232.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
