In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the dataset
# Replace 'veremi.csv' with the path to your dataset
data = pd.read_csv('/Users/suchithkurra/Desktop/capstone /sampled_dataset.csv')

In [3]:
# Assuming that the last column is the target variable and others are features
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

In [4]:
# Apply Label Encoding on the target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)


In [6]:
# Apply SelectKBest for feature extraction (using ANOVA F-value here)
# Adjust 'k' to the number of top features you want to select
k = 10
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get the selected feature indices and names
selected_feature_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]

# Print the selected features
print(f'Selected {k} features: {selected_features.tolist()}')

Selected 10 features: ['type', 'pos_0', 'pos_1', 'pos_noise_0', 'pos_noise_1', 'spd_0', 'spd_1', 'acl_0', 'acl_1', 'hed_noise_1']


In [7]:
# Create and train the Random Forest model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
rf_classifier.fit(X_train_selected, y_train)

In [8]:
# Make predictions
y_pred = rf_classifier.predict(X_test_selected)

In [9]:

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Detailed classification report
print(classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy: 17.65%
                    precision    recall  f1-score   support

          ConstPos       0.12      0.16      0.14       289
    ConstPosOffset       0.18      0.17      0.18       314
        ConstSpeed       0.15      0.15      0.15       277
  ConstSpeedOffset       0.17      0.20      0.19       288
        DataReplay       0.07      0.09      0.08       308
   DataReplaySybil       0.10      0.12      0.11       330
   DelayedMessages       0.24      0.23      0.23       319
        Disruptive       0.10      0.11      0.11       307
               DoS       0.08      0.10      0.09       343
     DoSDisruptive       0.12      0.14      0.13       341
DoSDisruptiveSybil       0.17      0.17      0.17       339
         DoSRandom       0.37      0.34      0.35       340
    DoSRandomSybil       0.17      0.09      0.12       293
      EventualStop       0.30      0.25      0.27       311
         GridSybil       0.45      0.44      0.44       358
         RandomPos    