## Import Necessary Libraries

In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Data Collection
• Taken data from gobal fishing watch  

In [2]:
ais_data = pd.read_csv("drifting_longlines.csv")

### Step 2: Data Pre-processing
• Clean and preprocess the AIS data

• Filter out irrelevant data, correct inconsistencies, and convert data into suitable format

•pre-processing steps

In [3]:

ais_data = ais_data.dropna() 


In [4]:
ais_data['timestamp'] = pd.to_datetime(ais_data['timestamp'])  # Convert timestamp to datetime objec
ais_data.head(5)

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
0,12639560000000.0,1970-01-01 00:00:01.327136504,232994.28125,311748.65625,8.2,230.5,14.865583,-26.853662,-1.0,dalhousie_longliner
1,12639560000000.0,1970-01-01 00:00:01.327136605,233994.265625,312410.34375,7.3,238.399994,14.86387,-26.8568,-1.0,dalhousie_longliner
2,12639560000000.0,1970-01-01 00:00:01.327136734,233994.265625,312410.34375,6.8,238.899994,14.861551,-26.860649,-1.0,dalhousie_longliner
3,12639560000000.0,1970-01-01 00:00:01.327143281,233994.265625,315417.375,6.9,251.800003,14.822686,-26.865898,-1.0,dalhousie_longliner
4,12639560000000.0,1970-01-01 00:00:01.327143341,233996.390625,316172.5625,6.1,231.100006,14.821825,-26.867579,-1.0,dalhousie_longliner


In [5]:
ais_data['speed_over_ground'] = ais_data['speed']  
ais_data['distance_from_shore'] = ais_data['distance_from_shore']  
ais_data['distance_from_port'] = ais_data['distance_from_port']  
ais_data['timestamp'] = pd.to_datetime(ais_data['timestamp'])  # Convert timestamp to datetime object
ais_data['time_elapsed'] = ais_data['timestamp'].diff().dt.total_seconds() 
ais_data['is_fishing'] = ais_data['is_fishing']  
ais_data['lat'] = ais_data['lat']  
ais_data['lon'] = ais_data['lon']  
ais_data['source'] = ais_data['source'] 

In [6]:
ais_data = ais_data.dropna()

In [7]:
ais_data['activity'] = ''  # Create an empty column for activity labels

for i, row in ais_data.iterrows():
    if row['is_fishing'] == 1:
        ais_data.at[i, 'activity'] = 'fishing'
    elif row['is_fishing'] == 0:
        ais_data.at[i, 'activity'] = 'no_fishing'
    else:
        ais_data.at[i, 'activity'] = 'other'

In [8]:
ais_data.to_csv("Final_data.csv")

In [9]:
ais_data.head(5)

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,speed_over_ground,time_elapsed,activity
1,12639560000000.0,1970-01-01 00:00:01.327136605,233994.265625,312410.34375,7.3,238.399994,14.86387,-26.8568,-1.0,dalhousie_longliner,7.3,1.01e-07,other
2,12639560000000.0,1970-01-01 00:00:01.327136734,233994.265625,312410.34375,6.8,238.899994,14.861551,-26.860649,-1.0,dalhousie_longliner,6.8,1.29e-07,other
3,12639560000000.0,1970-01-01 00:00:01.327143281,233994.265625,315417.375,6.9,251.800003,14.822686,-26.865898,-1.0,dalhousie_longliner,6.9,6.547e-06,other
4,12639560000000.0,1970-01-01 00:00:01.327143341,233996.390625,316172.5625,6.1,231.100006,14.821825,-26.867579,-1.0,dalhousie_longliner,6.1,6e-08,other
5,12639560000000.0,1970-01-01 00:00:01.327143411,233996.390625,316172.5625,6.9,242.699997,14.820652,-26.869459,-1.0,dalhousie_longliner,6.9,7e-08,other



### Split the data into training and testing datasets

In [10]:

from sklearn.model_selection import train_test_split

X = ais_data[['speed_over_ground', 'distance_from_shore', 'distance_from_port', 'time_elapsed', 'lat', 'lon']]
y = ais_data['activity']
# Split the data into training and testing datasets with modified parameters
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)

### Model Training


In [11]:
scaler = StandardScaler()  # Initialize a scaler for feature scaling
X_train_scaled = scaler.fit_transform(X_train)  # Scale the training features
X_test_scaled = scaler.transform(X_test)  # Scale the testing features using the same scaler


RANDOM FOREST

In [12]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)  # Initialize a Random Forest classifier
clf.fit(X_train_scaled, y_train)  # Train the classifier on the training dataset

In [43]:
import pickle

filename = 'randomforest_model(9978).sav'
pickle.dump(clf, open(filename, 'wb'))

 Step 6: Model Evaluation
 
 Evaluate the trained model on the testing dataset

In [13]:

y_pred = clf.predict(X_test_scaled)  
accuracy = accuracy_score(y_test, y_pred)  
print("Accuracy:", accuracy) 
 


Accuracy: 0.9912400142318896


In [14]:
train_accuracy = accuracy_score(y_train, clf.predict(X_train_scaled))
print("Training Accuracy:", train_accuracy)


Training Accuracy: 0.9978440079385036


XGBOOST

In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)


In [16]:
from xgboost import XGBClassifier
xg = XGBClassifier(n_estimators=100,random_state=42)
xg.fit(X_train_scaled, y_train_encoded)

In [40]:
filename = 'XGBforest_model(9859).sav'
pickle.dump(model, open(filename, 'wb'))

In [17]:
label_map = {'fishing': 0, 'no_fishing': 1, 'other': 2}
y_true_int = [label_map[label] for label in y_test]

In [18]:
y_pred_int = xg.predict(X_test_scaled)

In [19]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true_int, y_pred_int)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9859513781952847


In [20]:

y_pred_int = xg.predict(X_test_scaled)

# Convert true labels to integer format
y_true_int = [label_map[label] for label in y_test]

accuracy = accuracy_score(y_true_int, y_pred_int)

print(f"Testing Accuracy: {accuracy}")


Testing Accuracy: 0.9859513781952847


NAIVE BAYES

In [21]:
from sklearn.naive_bayes import GaussianNB


In [22]:
model = GaussianNB()
model.fit(X_train, y_train)



In [41]:
filename = 'GaussianNBforest_model(98.44).sav'
pickle.dump(model, open(filename, 'wb'))

In [23]:
# Make predictions on the test data
y_pred = model.predict(X_test)


In [25]:
# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Testing Accuracy: {:.2f}%".format(accuracy*100))

Testing Accuracy: 98.44%


KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)


In [42]:
filename = 'KNN_model(98.44).sav'
pickle.dump(model, open(filename, 'wb'))

In [27]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 98.44%


Model Testing

In [7]:
import joblib
import pandas as pd

In [8]:
new_data = pd.read_csv("trollers.csv")

Done same data pre-processing as done for training data


In [9]:
new_data['timestamp'] = pd.to_datetime(new_data['timestamp'])  


In [10]:
new_data['speed_over_ground'] = new_data['speed']  
new_data['distance_from_shore'] = new_data['distance_from_shore']  
new_data['distance_from_port'] = new_data['distance_from_port']  
new_data['timestamp'] = pd.to_datetime(new_data['timestamp'])  
new_data['time_elapsed'] = new_data['timestamp'].diff().dt.total_seconds() 
new_data['is_fishing'] = new_data['is_fishing']  
new_data['lat'] = new_data['lat'] 
new_data['lon'] = new_data['lon'] 
new_data['source'] = new_data['source']  

In [11]:
for i, row in new_data.iterrows():
    if row['is_fishing'] == 1:
        new_data.at[i, 'activity'] = 'fishing'
    elif row['is_fishing'] == 0:
        new_data.at[i, 'activity'] = 'no_fishing'
    else:
        new_data.at[i, 'activity'] = 'other'

In [12]:
X_new = new_data[['speed_over_ground', 'distance_from_shore', 'distance_from_port', 'time_elapsed', 'lat', 'lon']]
y_new = new_data['activity']

In [13]:
X_new = new_data.dropna()

In [14]:
new_data.to_csv('test_data.csv', index=False)

In [17]:
loaded_model = joblib.load('randomforest_model(9978).sav')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [31]:
vessel_data = pd.DataFrame({
    'speed_over_ground': [3],
    'distance_from_shore': [999.9754638672],
    'distance_from_port': [11661.6171875],
    'time_elapsed': [0.000000839],
    'lat': [38.2435913086],
    'lon': [15.6551799774]
})

In [34]:
prediction = loaded_model.predict(vessel_data)



In [35]:
if prediction == 0:
    print("The vessel is not engaged in illegal fishing.")
else:
    print("The vessel is engaged in illegal fishing.")

The vessel is engaged in illegal fishing.
