In [5]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [6]:
# Read the semicolon-separated Android traffic capture into a DataFrame
df = pd.read_csv('android_traffic.csv', sep=";")

# Preview the first seven rows
df.head(7)

Unnamed: 0,name,tcp_packets,dist_port_tcp,external_ips,vulume_bytes,udp_packets,tcp_urg_packet,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,duracion,avg_local_pkt_rate,avg_remote_pkt_rate,source_app_packets.1,dns_query_times,type
0,AntiVirus,36,6,3,3911,0,0,39,33,5100,4140,,,,39,3,benign
1,AntiVirus,117,0,9,23514,0,0,128,107,26248,24358,,,,128,11,benign
2,AntiVirus,196,0,6,24151,0,0,205,214,163887,24867,,,,205,9,benign
3,AntiVirus,6,0,1,889,0,0,7,6,819,975,,,,7,1,benign
4,AntiVirus,6,0,1,882,0,0,7,6,819,968,,,,7,1,benign
5,AntiVirus,54,54,3,5062,0,0,63,54,5457,5719,,,,63,9,benign
6,AntiVirus,6,0,1,889,0,0,7,6,819,975,,,,7,1,benign


In [7]:
# Walk the frame column by column and report how often each distinct value occurs
for col, column_values in df.items():
    # value_counts() lists the distinct values, most frequent first
    item_counts = column_values.value_counts()

    print(f"Column '{col}':\n{item_counts}\n")

Column 'name':
Reading             774
Plankton            483
DroidKungFu         427
AntiVirus           396
NewsAndMagazines    360
                   ... 
Saiva                 1
Gmuse                 1
EICAR-Test-File       1
EWalls                1
SafeKidZone           1
Name: name, Length: 114, dtype: int64

Column 'tcp_packets':
0       1267
6        784
12       340
5        210
7        124
        ... 
3998       1
313        1
2596       1
751        1
1486       1
Name: tcp_packets, Length: 760, dtype: int64

Column 'dist_port_tcp':
0      6243
12      276
6       219
8        61
11       61
       ... 
164       1
223       1
74        1
329       1
139       1
Name: dist_port_tcp, Length: 169, dtype: int64

Column 'external_ips':
1     2391
2     1408
0     1068
3      844
4      576
5      435
6      345
7      254
8      171
9      133
10      66
11      34
13      32
12      21
15      17
16      10
18       9
14       8
17       7
19       3
22       2
21       2
32

In [8]:
# Pre-processing

# Remove additional columns.
# NOTE(review): in the preview above these columns look empty or all-zero --
# confirm that holds for the whole file before relying on it.
columns_to_drop = ['duracion', 'avg_local_pkt_rate','avg_remote_pkt_rate','tcp_urg_packet','udp_packets']
df = df.drop(columns=columns_to_drop)

# Drop rows with null values
df = df.dropna()

# Encode categorical variables.
# Each column gets its own fitted encoder so the integer codes of *both*
# columns can be mapped back to their labels later (a single shared encoder
# would only remember the last column it was fitted on).
categorical_cols = ['name', 'type']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# Kept for compatibility: the previous version left `encoder` fitted on 'type'.
encoder = encoders[categorical_cols[-1]]

# Remove rows with outliers.
# Only the numeric traffic columns take part: the label-encoded columns hold
# arbitrary integer codes (and 'type' is the prediction target), so a z-score
# on them says nothing about how unusual a row is.
numeric_cols = [c for c in df.columns if c not in categorical_cols]

# Absolute z-scores, so values far *below* the column mean count as outliers
# too, not just values far above it.
z_scores = np.abs(stats.zscore(df[numeric_cols].to_numpy(dtype=float), axis=0))

# Reject a row only when some column has a real |z| >= 3. A constant column
# yields NaN z-scores; NaN >= 3 is False, so such a column no longer causes
# every row to be discarded.
df = df[~(z_scores >= 3).any(axis=1)]

# Display the data
print(f'Shape of the data : {df.shape}')
df.head()

Shape of the data : (7567, 12)


Unnamed: 0,name,tcp_packets,dist_port_tcp,external_ips,vulume_bytes,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,source_app_packets.1,dns_query_times,type
0,6,36,6,3,3911,39,33,5100,4140,39,3,0
1,6,117,0,9,23514,128,107,26248,24358,128,11,0
2,6,196,0,6,24151,205,214,163887,24867,205,9,0
3,6,6,0,1,889,7,6,819,975,7,1,0
4,6,6,0,1,882,7,6,819,968,7,1,0


In [9]:
# Separate the predictors from the target column
# NOTE(review): the label-encoded `name` column stays among the predictors --
# confirm it does not leak the `type` label.
features = df.drop(columns="type")
target = df["type"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Report the shapes of the resulting splits
print(f'x_train shape: {x_train.shape}')
print(f'y_train shape: {y_train.shape}')
print('--------------')
print(f'x_test shape: {x_test.shape}')
print(f'y_test shape: {y_test.shape}')

x_train shape: (5296, 11)
y_train shape: (5296,)
--------------
x_test shape: (2271, 11)
y_test shape: (2271,)


In [10]:
# Fit a 4-nearest-neighbours classifier on the training split
knn = KNeighborsClassifier(n_neighbors=4).fit(x_train, y_train)

# Predict the held-out rows and measure the share predicted correctly
y_pred_knn = knn.predict(x_test)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

In [11]:
# Fit a decision tree on the training split (seeded so the tree is repeatable)
clf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Predict the held-out rows and measure the share predicted correctly
y_pred_clf = clf.predict(x_test)
accuracy_clf = accuracy_score(y_true=y_test, y_pred=y_pred_clf)

In [None]:
# Report the test-set accuracy of each model, one line per model
for label, score in (
    ("Accuracy of KNN:", accuracy_knn),
    ('Accuracy of Decision Tree Classifier:', accuracy_clf),
):
    print(label, score)

Accuracy of KNN: 0.9295464553060326
Accuracy of Decision Tree Classifier: 0.9828269484808454
