### Importing the dataset

In [1]:
import pandas as pd
# Read the swarm-behaviour dataset from the CSV file in the working directory.
df = pd.read_csv("Swarm_Behaviour.csv")
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,x1,y1,xVel1,yVel1,xA1,yA1,xS1,yS1,xC1,yC1,...,yVel200,xA200,yA200,xS200,yS200,xC200,yC200,nAC200,nS200,Swarm_Behaviour
0,562.05,-0.62,-10.7,-4.33,0.0,0.0,0.0,0.0,0.0,0.0,...,-15.15,0.0,0.0,0.0,0.0,0.0,0.0,28,0,0.0
1,175.66,-57.09,2.31,-2.67,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.48,0.0,0.0,0.0,0.0,0.0,0.0,4,0,0.0
2,200.16,-320.07,4.01,-6.37,0.0,0.0,0.0,0.0,0.18,-0.26,...,-9.38,0.0,0.0,0.0,0.0,-0.11,-0.3,15,1,0.0
3,316.99,-906.84,0.85,9.17,-0.17,1.03,0.0,0.0,0.0,0.0,...,10.39,-0.26,1.01,0.0,0.0,0.0,0.0,16,0,0.0
4,1277.68,908.54,-2.02,8.23,-1.0,1.0,0.0,0.0,0.0,0.0,...,13.91,-1.0,0.0,3.21,15.67,0.0,0.0,12,0,0.0


In [2]:
# Count the columns that contain at least one missing value.
# isnull() flags every missing cell, any() reduces that to one boolean per
# column, and sum() counts the columns flagged True. Every column is checked
# on its own data (the earlier loop re-tested column 'x1' on each iteration).
count = int(df.isnull().any().sum())
print("Count of columns with missing value(s):", count)

Count of columns with missing value(s): 0


In [3]:
# Number of rows (samples) in the dataset.
df.shape[0]

23309

### Data Preprocessing
No column contains NULL values, so neither `fillna` nor `dropna` is needed. The remaining preparation is feature selection, to keep only the columns relevant for building the model, followed by hyperparameter tuning to reach a good accuracy score.

In [4]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,x1,y1,xVel1,yVel1,xA1,yA1,xS1,yS1,xC1,yC1,...,yVel200,xA200,yA200,xS200,yS200,xC200,yC200,nAC200,nS200,Swarm_Behaviour
count,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,...,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0,23309.0
mean,104.413631,-73.481853,-0.697607,0.06148,-0.147964,0.152034,-1.055732,-0.731546,-0.034915,0.077123,...,0.086824,-0.126276,0.127497,-1.52626,-0.118891,-0.012388,0.085076,26.433995,2.130679,0.341242
std,843.200079,573.329374,6.42784,7.366739,0.376078,0.588582,26.38857,20.684183,0.55969,0.628478,...,7.683539,0.37883,0.570589,73.903764,7.002853,0.555143,0.614603,34.136098,7.431911,0.474136
min,-1414.14,-1012.18,-18.59,-18.41,-1.0,-1.0,-944.07,-847.91,-2.68,-2.68,...,-18.44,-1.04,-1.0,-4079.23,-370.24,-2.68,-2.68,0.0,0.0,0.0
25%,-542.02,-611.22,-5.36,-5.68,-0.23,-0.01,0.0,0.0,-0.04,-0.02,...,-5.6,-0.19,0.0,0.0,0.0,-0.07,0.0,2.0,0.0,0.0
50%,145.3,-157.35,-0.45,-1.54,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.78,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0
75%,896.2,425.7,3.94,7.26,0.0,0.94,0.0,0.0,0.0,0.02,...,7.48,0.0,0.54,0.0,0.0,0.0,0.03,35.0,1.0,1.0
max,1406.08,1015.8,18.47,18.57,1.0,1.04,68.56,234.74,2.68,2.68,...,18.49,1.0,1.04,255.99,57.84,2.68,2.68,142.0,64.0,1.0


### Feature Selection

In [5]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif # ANOVA

# Every column except the last is a feature; the last column
# (Swarm_Behaviour) is the class label. Both are taken as NumPy arrays.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

#### Method I : ANOVA (ANalysis Of VAriance)

In [6]:
# Keep the k features with the highest ANOVA F-score against the label.
# k is a hyperparameter: 50 was settled on after trying values above and below it.
# NOTE(review): the selector is fitted on the full dataset, i.e. before the
# train/test split performed in a later cell, so held-out rows influence
# which features are kept. Consider splitting first and fitting on the
# training rows only.
fs = SelectKBest(f_classif, k=50)

# Fit the selector, then reduce X to the selected columns.
X_new = fs.fit(X, y).transform(X)
print(X_new.shape)

(23309, 50)


In [7]:
# Display the reduced feature matrix produced by the ANOVA selector.
X_new

array([[ -5.08,   7.12,  -0.22, ..., -10.2 , -11.68,   2.97],
       [ -9.99,  10.21,  10.72, ..., -13.87,  -7.43,   3.87],
       [  1.78,   9.99,  -2.56, ...,  -8.38, -10.15,  -4.18],
       ...,
       [-16.55,   2.91,  -0.22, ..., -11.96,  -3.77,  15.2 ],
       [ -0.53,  -7.2 , -13.08, ...,  -4.08,  -5.17, -10.27],
       [ -0.12,   2.57,   0.98, ...,  -9.22,  -7.95,   0.07]])

#### Method II : Mutual Information Statistic

In [8]:
# Disabled alternative: score features with mutual information instead of ANOVA.
# Everything below is commented out, so the ANOVA-selected X_new from the
# previous section is what the rest of the notebook uses. Uncommenting it
# would overwrite both `fs` and `X_new` with a 10-feature selection.

# from sklearn.feature_selection import mutual_info_classif

# fs = SelectKBest(score_func=mutual_info_classif, k=10)
# X_new = fs.fit_transform(X,y)
# X_new
# # X_train_fs = fs.transform(X_train_new)
# # X_test_fs = fs.transform(X_test_new)

### Classification Algorithms

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing (70:30 split). The fixed seed keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.30,
    random_state=1,
)
X_train

array([[ -5.81,   8.92,   8.17, ..., -10.02,  -7.45,   3.88],
       [  7.65,   7.95,   3.73, ...,   8.  ,  13.46,   3.07],
       [  7.46,  -9.89,  -8.21, ...,  -2.73,   7.5 ,  -4.59],
       ...,
       [  9.61,  -0.88,  -3.28, ...,   9.22,   7.09,  -1.06],
       [ -2.63,  -7.11,  -9.7 , ...,  -0.21,  -0.44,  -8.56],
       [ -5.2 ,  -2.15,   3.63, ...,  -3.17,   4.4 ,   6.2 ]])

In [10]:
# Display the training labels.
y_train

array([0., 0., 1., ..., 1., 1., 0.])

### Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
X_test

array([[-0.11470773,  0.16051554,  0.40419523, ...,  0.55484422,
         0.86679245, -0.09612846],
       [ 1.03765852, -0.21143613, -0.04525888, ...,  1.32972004,
         1.14065673, -0.24909266],
       [ 1.22295685, -0.66424685,  0.05421047, ...,  1.11454791,
         0.92029152,  0.13726023],
       ...,
       [-1.04796208,  0.52923284, -0.22025126, ..., -1.10889741,
        -1.68715115,  1.53601538],
       [-0.91676546, -0.8146447 , -0.05078496, ..., -1.39195122,
        -1.22858771,  0.40376487],
       [ 1.1134009 ,  0.36589754,  0.2181507 , ...,  1.20932611,
         1.11772856, -0.12451357]])

### 1. Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression on the scaled training split.
log_reg = LogisticRegression(max_iter=1000) # author's note: the max_iter needed changes with k in SelectKBest, though not linearly
log_reg.fit(X_train,y_train)

In [13]:
# Predict class labels for the scaled test split with the fitted logistic regression.
# y_pred is reassigned by each later model, so it always holds the latest model's predictions.
y_pred = log_reg.predict(X_test)
y_pred

array([0., 1., 1., ..., 0., 0., 1.])

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Confusion matrix for logistic regression: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,y_pred)

array([[4243,  356],
       [ 341, 2053]], dtype=int64)

In [15]:
# Fraction of test rows that logistic regression labelled correctly.
accuracy_score(y_test,y_pred)

0.9003289003289003

In [16]:
# Per-class precision, recall, F1 and support for the logistic regression predictions.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.92      0.92      4599
         1.0       0.85      0.86      0.85      2394

    accuracy                           0.90      6993
   macro avg       0.89      0.89      0.89      6993
weighted avg       0.90      0.90      0.90      6993



### 2. K Nearest Neighbour

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Heuristic starting point: k close to sqrt(total no. of samples), preferably odd
# so that a two-class vote cannot tie.
# sqrt(23309) is about 152.7, whose nearest odd integer is 153.
knn = KNeighborsClassifier(n_neighbors=90) # author's note: 90 (an even value) gave better accuracy than 153 after hyperparameter tuning

In [18]:
# Fit KNN on the training split and label the test split in one chained call.
y_pred = knn.fit(X_train, y_train).predict(X_test)
y_pred

array([0., 1., 1., ..., 0., 0., 1.])

In [19]:
# Compute both KNN metrics first, then report them.
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Confusion_matrix:\n", conf_mat)
print("\nAccuracy_score:", acc)

Confusion_matrix:
 [[4215  384]
 [ 280 2114]]

Accuracy_score: 0.905047905047905


In [20]:
# Per-class precision, recall, F1 and support for the KNN predictions.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93      4599
         1.0       0.85      0.88      0.86      2394

    accuracy                           0.91      6993
   macro avg       0.89      0.90      0.90      6993
weighted avg       0.91      0.91      0.91      6993



### 3. Naive Bayes (Gaussian)

In [21]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB # Gaussian has better accuracy
# Gaussian Naive Bayes with default settings (BernoulliNB is imported above but not used).
nb = GaussianNB()

In [22]:
# Fit Gaussian Naive Bayes on the training split and label the test split.
y_pred = nb.fit(X_train, y_train).predict(X_test)
y_pred

array([0., 1., 1., ..., 0., 0., 1.])

In [23]:
# Compute both Naive Bayes metrics first, then report them.
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Confusion_matrix:\n", conf_mat)
print("\nAccuracy_score:", acc)

Confusion_matrix:
 [[3646  953]
 [ 302 2092]]

Accuracy_score: 0.8205348205348205


In [24]:
# Per-class precision, recall, F1 and support for the Naive Bayes predictions.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.79      0.85      4599
         1.0       0.69      0.87      0.77      2394

    accuracy                           0.82      6993
   macro avg       0.81      0.83      0.81      6993
weighted avg       0.84      0.82      0.82      6993



### 4. Decision Tree Classifier (Gini Impurity Method)

In [25]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree using Gini impurity as the split criterion.
# random_state is fixed (same seed as the train/test split) because tree
# induction is randomised; without it the fitted tree, and therefore the
# scores reported below, change from one run to the next.
dt_reg = DecisionTreeClassifier(criterion='gini', random_state=1)
dt_reg.fit(X_train,y_train)

In [26]:
# Predict class labels for the test split with the Gini decision tree.
y_pred = dt_reg.predict(X_test)
y_pred

array([0., 1., 1., ..., 0., 0., 1.])

In [27]:
# Compute both Gini-tree metrics first, then report them.
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Confusion_matrix:\n", conf_mat)
print("\nAccuracy_score:", acc)

Confusion_matrix:
 [[4182  417]
 [ 493 1901]]

Accuracy_score: 0.8698698698698699


In [28]:
# Per-class precision, recall, F1 and support for the Gini decision tree predictions.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.91      0.90      4599
         1.0       0.82      0.79      0.81      2394

    accuracy                           0.87      6993
   macro avg       0.86      0.85      0.85      6993
weighted avg       0.87      0.87      0.87      6993



### 5. Decision Tree Classifier (Entropy Method)

In [29]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree using entropy (information gain) as the split criterion.
# random_state is fixed (same seed as the train/test split) because tree
# induction is randomised; without it the fitted tree, and therefore the
# scores reported below, change from one run to the next.
dt_reg2 = DecisionTreeClassifier(criterion='entropy', random_state=1)
dt_reg2.fit(X_train,y_train)

In [30]:
# Predict class labels for the test split with the entropy decision tree.
y_pred = dt_reg2.predict(X_test)
y_pred

array([0., 1., 1., ..., 0., 0., 1.])

In [31]:
# Compute both entropy-tree metrics first, then report them.
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print("Confusion_matrix:\n", conf_mat)
print("\nAccuracy_score:", acc)

Confusion_matrix:
 [[4187  412]
 [ 461 1933]]

Accuracy_score: 0.8751608751608752


In [32]:
# Per-class precision, recall, F1 and support for the entropy decision tree predictions.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.91      0.91      4599
         1.0       0.82      0.81      0.82      2394

    accuracy                           0.88      6993
   macro avg       0.86      0.86      0.86      6993
weighted avg       0.87      0.88      0.87      6993



### Conclusion:
Max Accuracy: 90.50%  
Achieved By: KNN Algorithm (n_neighbors=90), marginally ahead of Logistic Regression (90.03%)