## 1. Import libraries

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

## 2. Load the *Research* data set

In [2]:
loaded_data = pd.read_csv('events_bs2_2.csv', sep=',', header=0)

In [3]:
loaded_data.head(10)

Unnamed: 0,tactic0_id,longitude,latitude,NEAR_CCTV_,NEAR_MOOBA,MOOBAN_EST,MOOBAN_LEV,NEAR_UNITS,UNIT_TYPE,NEAR_VEHIC,VEHICLES_T,NEAR_NAIS_,NAIS_TYPE,NEAR_DIST
0,1,101.19182,6.13009,3467.485002,85.604275,0,0,927.963005,5,1308.145156,2,284.929088,3,27.677967
1,0,101.26577,6.41964,972.895329,1515.131446,0,0,1855.001821,4,2805.680397,2,1321.192664,1,1077.837387
2,1,101.14605,6.68195,4535.321847,372.279866,1,3,294.153727,7,212.795421,2,323.187802,10,97.315259
3,1,101.22266,6.85446,240.14691,333.135614,0,0,1591.508482,3,407.629181,1,155.356802,3,6.634046
4,0,101.34695,6.45786,242.448908,1172.773291,0,0,558.423389,5,701.641769,2,620.660475,5,31.915022
5,0,100.97757,6.65426,4096.016638,1315.821482,0,0,1716.60359,5,6956.354136,1,14218.0823,5,1124.033888
6,0,101.74995,6.27591,2686.242719,932.762119,1,3,1707.460629,4,3448.168296,2,69.781556,6,71.977883
7,0,101.6247,6.71488,580.185597,403.194092,0,0,567.060214,4,1545.663021,2,80.889155,4,35.425075
8,1,101.2866,6.72085,302.995168,143.336508,1,3,1104.123671,4,363.863675,2,1144.968962,3,25.827262
9,0,101.45094,6.49835,2562.292453,250.031476,0,0,3504.811321,4,7015.471466,2,2175.258547,6,14.428984


In [4]:
X = loaded_data.iloc[:,1:]
y = loaded_data.iloc[:,0]

In [5]:
X.head(5)

Unnamed: 0,longitude,latitude,NEAR_CCTV_,NEAR_MOOBA,MOOBAN_EST,MOOBAN_LEV,NEAR_UNITS,UNIT_TYPE,NEAR_VEHIC,VEHICLES_T,NEAR_NAIS_,NAIS_TYPE,NEAR_DIST
0,101.19182,6.13009,3467.485002,85.604275,0,0,927.963005,5,1308.145156,2,284.929088,3,27.677967
1,101.26577,6.41964,972.895329,1515.131446,0,0,1855.001821,4,2805.680397,2,1321.192664,1,1077.837387
2,101.14605,6.68195,4535.321847,372.279866,1,3,294.153727,7,212.795421,2,323.187802,10,97.315259
3,101.22266,6.85446,240.14691,333.135614,0,0,1591.508482,3,407.629181,1,155.356802,3,6.634046
4,101.34695,6.45786,242.448908,1172.773291,0,0,558.423389,5,701.641769,2,620.660475,5,31.915022


## 3. Input features
The ***iris*** data set contains 4 input features and 1 output variable (the class label).

### 3.1. Input features

In [6]:
print(X.head(5))

   longitude  latitude   NEAR_CCTV_   NEAR_MOOBA  MOOBAN_EST  MOOBAN_LEV  \
0  101.19182   6.13009  3467.485002    85.604275           0           0   
1  101.26577   6.41964   972.895329  1515.131446           0           0   
2  101.14605   6.68195  4535.321847   372.279866           1           3   
3  101.22266   6.85446   240.146910   333.135614           0           0   
4  101.34695   6.45786   242.448908  1172.773291           0           0   

    NEAR_UNITS  UNIT_TYPE   NEAR_VEHIC  VEHICLES_T   NEAR_NAIS_  NAIS_TYPE  \
0   927.963005          5  1308.145156           2   284.929088          3   
1  1855.001821          4  2805.680397           2  1321.192664          1   
2   294.153727          7   212.795421           2   323.187802         10   
3  1591.508482          3   407.629181           1   155.356802          3   
4   558.423389          5   701.641769           2   620.660475          5   

     NEAR_DIST  
0    27.677967  
1  1077.837387  
2    97.315259  
3     

### 3.2. Output features

In [7]:
print(y.head(5))

0    1
1    0
2    1
3    1
4    0
Name: tactic0_id, dtype: int64


## 4. Glimpse of the data

### 4.1. Input features

### 4.2. Output variable (the Class label)

### 4.3. Assigning *input* and *output* variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

### 4.3. Let's examine the data dimension

In [8]:
X.shape

(8000, 13)

In [9]:
y.shape

(8000,)

## 5. Build Classification Model using Random Forest

In [10]:
clf = RandomForestClassifier()

In [11]:
clf.fit(X, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## 6. Feature Importance

In [12]:
print(clf.feature_importances_)

[0.12650267 0.1262397  0.1290176  0.10961479 0.00713869 0.01069162
 0.1107894  0.02177044 0.12344528 0.00797247 0.09829692 0.03576723
 0.09275319]


## 7. Make Prediction

## 8. Data split (80/20 ratio)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [14]:
X_train.shape, y_train.shape

((6400, 13), (6400,))

In [15]:
X_test.shape, y_test.shape

((1600, 13), (1600,))

## 9. Rebuild the Random Forest Model

In [16]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 9.1. Performs prediction on single sample from the data set

### 9.2. Performs prediction on the test set

#### *Predicted class labels*

In [17]:
print(clf.predict(X_test))

[1 0 1 ... 1 1 1]


#### *Actual class labels*

print(y_test)

## 10. Model Performance

In [18]:
print(clf.score(X_test, y_test))

0.618125
