In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## <a id = "stepend"> Table of contents </a>
1. [Dataset Description](#1)
    * [Description of features](#2)
2. [A quick look at the data](#3)
3. [Baseline Model](#4)   
4. [Conclusions](#5)

# <a id = "1"> 1. Dataset Description</a>

This dataset is a set of measurements (24 016 units) of [boids](https://en.wikipedia.org/wiki/Boids), by which we can judge whether the birds are **grouped** at the moment, whether they are in a **flock** or **aligned**.

In this work, we solve the *classification problem* for a single target variable: whether the boids are in a **flock** or not. (Class labels are binary: 1 refers to flocking, grouped, and aligned, and 0 refers to not flocking, not grouped, and not aligned.)
> Flocking behaviour refers to the way that groups of birds, insects, fish or other animals, move close to each other. They are able to move as a group with the same velocity, yet without running into each other.

## <a id = "2"> Description of features</a>

The features are:
 1. *xm* and *ym* as the (X,Y) position of each boid;
 2. *xVeln* and *yVeln* as the velocity vector;
 3. *xAm* and *yAm* as the alignment vector;
 4. *xSm* and *ySm* as the separation vector;
 5. *xCm* and *yCm* as the cohesion vector;
 6. *nACm* as the number of boids in the radius of Alignment/Cohesion;
 7. *nSm* as the number of boids in the radius of Separation.

These attributes are repeated for all m boids, where m = 1,...,200. (12*200 = 2 400 features)

So, as we can see, the dataset is massive! Therefore, the main goal will be to create a high-quality predictive model using as few rows and features as possible.

# <a id = "3"> 2. A quick look at the data</a>

In [None]:
%%time
# Location of the flocking measurements inside the Kaggle input mount.
flocking_csv = "/kaggle/input/swarm-behavior/Swarm Behavior Data/Flocking.csv"

# low_memory=False makes pandas process the file in one piece instead of in
# internal chunks, so each column's dtype is inferred from all of its values.
data = pd.read_csv(flocking_csv, low_memory=False)
data

In [None]:
# Print pandas' concise summary of the loaded frame (index, dtypes, memory usage).
data.info()

In [None]:
# Let's immediately fix two defects in the loaded frame:
#   1. the label column is called 'Class ' (with a trailing space);
#   2. the x1 value in row 24015 is overwritten with 0.
#
# rename() leaves the column in place and silently ignores a missing key, so this
# cell can be re-run safely (copying to 'Class' and dropping 'Class ' raised a
# KeyError on the second run).
data = data.rename(columns={'Class ': 'Class'})

# Assign through a single .loc call. The chained form `data.x1[24015] = 0` writes
# into an intermediate Series: it triggers SettingWithCopyWarning and does not
# modify `data` at all when pandas Copy-on-Write is enabled.
data.loc[24015, 'x1'] = 0

# <a id = "4"> 3. Baseline Model </a>

A baseline is a model that is both simple to set up and has a reasonable chance of providing decent results.
As such a model, we choose the *RandomForestClassifier* in its basic configuration (without parameter tuning) on all the presented data. Yes, it is resource-intensive, but as long as we can afford it, we will do it. We need this result as a reference point for a comparative analysis: it lets us show that on a modified dataset we do not lose accuracy and, we hope, even improve it.

In [None]:
# Separate the label from the features. Both results are independent of `data`,
# which is left unmodified.
y = data['Class'].copy()
X_train = data.drop(columns=['Class'])

In [None]:
# Run pd.to_numeric (with its default arguments) over each feature column in turn
# and rebind X_train to the converted frame.
X_train = X_train.apply(pd.to_numeric)

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Train on a shuffled 20% of the rows and hold out the remaining 80% for evaluation.
# random_state pins the shuffle, so the same rows land in each part on every run.
s_X_train, s_X_test, s_y_train, s_y_test = train_test_split(
    X_train, y, train_size=0.2, test_size=0.8, random_state=0
)

# Default hyper-parameters, with two additions that do not change the model family:
#   * random_state=0 - the forest's bootstrap/feature sampling is random; without
#     a seed the reported scores change on every "Restart & Run All";
#   * n_jobs=-1      - build the trees on all available cores.
model_baseline = RandomForestClassifier(random_state=0, n_jobs=-1)
model_baseline.fit(s_X_train, s_y_train)

# Hard class labels for the held-out rows (used for the accuracy figure below).
baseline_predictions = model_baseline.predict(s_X_test)

In [None]:
# Let's evaluate the model. As the metric of the classification problem, we choose ROC_AUC
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt

# ROC AUC measures how well the model *ranks* the samples, so it must be given a
# continuous score. Passing the hard 0/1 output of predict() collapses the ROC
# curve to a single operating point.
# predict_proba columns follow model_baseline.classes_ (sorted), so column 1 is
# the larger label, which roc_auc_score treats as the positive class.
baseline_scores = model_baseline.predict_proba(s_X_test)[:, 1]
roc_auc_score(s_y_test, baseline_scores)

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score is declared as (y_true, y_pred). The value is the same either
# way, but this order matches the roc_auc_score call above.
accuracy_score(s_y_test, baseline_predictions)

The model has extremely high prediction accuracy. Let's determine how many rows we need to achieve almost 100% prediction accuracy.

In [None]:
# Fractions of the rows used for training; all remaining rows form the test set.
TRAIN_FRACTIONS = [0.0003, 0.0005, 0.0007, 0.001, 0.0012, 0.004, 0.05]


def auc_for_train_fraction(features, target, train_fraction, seed=0):
    """Fit a default random forest on a fraction of the rows and score it on the rest.

    Parameters
    ----------
    features, target
        Feature matrix and label vector, passed straight to train_test_split.
    train_fraction : float
        Share of the rows used for fitting. test_size is left unset, so
        scikit-learn uses the complement as the test set.
    seed : int
        Seeds both the shuffle of the split and the forest itself, which makes
        the returned score reproducible.

    Returns
    -------
    float
        ROC AUC of the fitted forest on the held-out rows.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, target, train_size=train_fraction, random_state=seed
    )

    # A fresh estimator per call: the shared `model_baseline` is no longer refit
    # (and thereby overwritten) by this experiment.
    forest = RandomForestClassifier(random_state=seed)
    forest.fit(X_fit, y_fit)

    if len(forest.classes_) < 2:
        # A handful of rows may contain only one class; predict_proba then has a
        # single column, so fall back to the (constant) predicted label.
        scores = forest.predict(X_eval)
    else:
        # Probability of the larger label, which roc_auc_score treats as positive.
        scores = forest.predict_proba(X_eval)[:, 1]
    return roc_auc_score(y_eval, scores)


auc_by_fraction = {
    fraction: auc_for_train_fraction(X_train, y, fraction)
    for fraction in TRAIN_FRACTIONS
}

# Keep the result names this cell used to define.
a_1, a_2, a_3, a_4, a_5, a_6, a_7 = auc_by_fraction.values()

for fraction, score in auc_by_fraction.items():
    # ':g' renders 0.0003 * 100 as '0.03', 0.05 * 100 as '5', and so on.
    print(f"Using to train {fraction * 100:g}% of data:", score)

5% of the data is just 0.05 × 24 016 ≈ 1 200 rows, which are used to predict the other 22 816 rows!

# <a id = "5"> 4. Conclusions </a>

As far as can be judged from this analysis, predicting whether the birds are in a flock or not from so many features is fairly straightforward for the baseline model.