In [1]:
# Autoreload
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from src import load_data as ld
from src import utils
from src import feature_exploring as fexp
from src import map_plot as mp
from src import ml_utils as mlu

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer

In [3]:
def min_max_scaler(train_col,test_col):
    
    minmax = MinMaxScaler()
    minmax.fit(train_col) 
    train_col_normalized = minmax.transform(train_col)
    
    minmax.fit(test_col) 
    test_col_normalized = minmax.transform(test_col)
    
    return train_col_normalized, test_col_normalized

Ahora necesito seleccionar todas las variables (o la mayoria) para posteriormente hacer reduccion de dimensionalidad con PCA

## Load data

In [4]:
gear_type_list = ['Drifting_longlines','Purse_seines','Trawlers']

In [5]:
# loading all dataframes
drifting_df, purse_df, trawlers_df = ld.load_multiple(gear_type_list)




Concatenating data...



  0%|          | 0/3 [00:00<?, ?it/s]


Loading Drifting_longlines...




  0%|          | 0/21 [00:00<?, ?it/s][A
  5%|▍         | 1/21 [00:02<00:42,  2.12s/it][A
 19%|█▉        | 4/21 [00:02<00:07,  2.29it/s][A
 43%|████▎     | 9/21 [00:02<00:02,  4.07it/s][A
100%|██████████| 21/21 [00:04<00:00,  5.22it/s][A
 33%|███▎      | 1/3 [00:04<00:08,  4.04s/it]


Loading Purse_seines...




  0%|          | 0/21 [00:00<?, ?it/s][A
 57%|█████▋    | 12/21 [00:00<00:00, 17.05it/s][A
100%|██████████| 21/21 [00:00<00:00, 22.94it/s][A
 67%|██████▋   | 2/3 [00:04<00:02,  2.21s/it]


Loading Trawlers...




  0%|          | 0/21 [00:00<?, ?it/s][A
 67%|██████▋   | 14/21 [00:01<00:00,  8.93it/s][A
100%|██████████| 21/21 [00:02<00:00,  7.66it/s][A
100%|██████████| 3/3 [00:07<00:00,  2.58s/it]


In [6]:
dfs = [drifting_df, purse_df, trawlers_df]

In [7]:
# lets label the data creating a new column called 'gear_type'
for df,name in zip(dfs,gear_type_list):
    df['gear_type'] = name

In [8]:
# concatenating them all
df_all_gears = pd.concat(dfs, 
                         ignore_index=True)

In [9]:
# selecting the columns
cols = fexp.column_select(df_all_gears,drop_always=True,drop_mmsi=True,col_groups=['1800','10800','21600','43200','86400'])

In [10]:
# droping nulls (I think should be filled with a regression)
df_all_gears_drop = df_all_gears[cols].dropna()
df_all_gears_drop.shape

(1531296, 64)

In [11]:
# sampling
df_all_gears_s = df_all_gears_drop.sample(400000,random_state=45)

In [25]:
#### creando un dataframe que el modelo NUNCA HA VISTO 
df_remains = df_all_gears_drop[~df_all_gears_drop.index.isin(df_all_gears_s.index)]
df_remains.drop('is_fishing', inplace=True, axis=1) # no lo necesito
df_remains = df_remains.sample(400000)

# tengo que escalar las columnas
df_remains['distance_from_port'] = utils.min_max(df_remains,'distance_from_port')
df_remains['distance_from_shore'] = utils.min_max(df_remains,'distance_from_shore')

# le quito a mi test la variable a predecir
X_df_remains = df_remains.drop('gear_type',axis=1)
y_df_remains = df_remains['gear_type']



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [12]:
# drops
df_all_gears_s.drop('is_fishing', inplace=True, axis=1)

In [13]:
# train test split
X = df_all_gears_s.drop('gear_type',axis=1)
y = df_all_gears_s['gear_type']
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [25]:
# saving mmsi numbers
# mmsi_train, mmsi_test = X_train['mmsi'], X_test['mmsi']
# X_train.drop('mmsi', inplace=True, axis=1)
# X_test.drop('mmsi', inplace=True, axis=1)

In [26]:
# saving gear types
# gear_train, gear_test = X_train['gear_type'], X_test['gear_type']
# X_train.drop('gear_type', inplace=True, axis=1)
# X_test.drop('gear_type', inplace=True, axis=1)

In [14]:
# scaling
X_train['distance_from_port'], X_test['distance_from_port'] = min_max_scaler(X_train[['distance_from_port']], 
                                                                             X_test[['distance_from_port']])

In [15]:
X_train['distance_from_shore'], X_test['distance_from_shore'] = min_max_scaler(X_train[['distance_from_shore']], 
                                                                             X_test[['distance_from_shore']])

## KNN

In [16]:
params = {
    'n_neighbors': [1,3,5,7,9],
    'weights': ['uniform','distance']
}

In [17]:
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    cv=5
)

In [52]:
# knn = KNeighborsClassifier(n_neighbors=7,n_jobs=-1)

In [53]:
# lo entreno con el train

In [18]:
gs.fit(X_train,y_train)

In [35]:
X_train.columns

Index(['measure_cos_course', 'measure_sin_course', 'distance_from_port',
       'measure_course', 'distance_from_shore', 'measure_daylight',
       'measure_speed', 'measure_coursestddev_1800_log',
       'measure_daylightavg_1800', 'measure_speedstddev_1800',
       'measure_count_1800', 'measure_latavg_1800', 'measure_pos_1800',
       'measure_lonavg_1800', 'measure_courseavg_1800',
       'measure_coursestddev_1800', 'measure_speedavg_1800',
       'measure_speedstddev_1800_log', 'measure_coursestddev_10800_log',
       'measure_speedstddev_10800', 'measure_latavg_10800',
       'measure_speedavg_10800', 'measure_daylightavg_10800',
       'measure_courseavg_10800', 'measure_lonavg_10800', 'measure_pos_10800',
       'measure_count_10800', 'measure_speedstddev_10800_log',
       'measure_coursestddev_10800', 'measure_coursestddev_21600',
       'measure_speedavg_21600', 'measure_latavg_21600', 'measure_count_21600',
       'measure_coursestddev_21600_log', 'measure_lonavg_21600',
 

In [19]:
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(gs.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(gs.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 1.00
Accuracy of K-NN classifier on test set: 0.99


In [20]:
# prediccion con el test

In [22]:
knn = gs.best_estimator_
knn

In [23]:
pred = knn.predict(X_test)
print(confusion_matrix(y_test, pred))

[[50591   217   126]
 [  231 11937    92]
 [  120    97 36589]]


In [61]:
# ahora probemos con produccion

In [26]:
production_pred = knn.predict(X_df_remains) # tengo que predecir con la X de produccion

In [27]:
production_pred_prob = knn.predict_proba(X_df_remains)

In [28]:
print(confusion_matrix(y_df_remains, production_pred)) # en produccion, comparo lo real con lo predecido

[[201932    894    526]
 [   981  47697    412]
 [   464    353 146741]]


In [29]:
accuracy_score(y_df_remains,production_pred)

0.990925

In [30]:
roc_auc_score(y_df_remains, production_pred_prob, average="weighted", multi_class="ovr")

0.9926912610952551

In [31]:
f1_score(y_df_remains,production_pred,average='weighted')

0.9909212045906177

### Vamos a mirar uno a uno

In [32]:
y_df_remains.value_counts()

Drifting_longlines    203352
Trawlers              147558
Purse_seines           49090
Name: gear_type, dtype: int64

In [33]:
pd.Series(production_pred).value_counts()

Drifting_longlines    203377
Trawlers              147679
Purse_seines           48944
dtype: int64