In [None]:
# install full version
!pip install pycaret[full]



In [1]:
!pip install git+https://github.com/pycaret/pycaret.git@master --upgrade

Collecting git+https://github.com/pycaret/pycaret.git@master
  Cloning https://github.com/pycaret/pycaret.git (to revision master) to /tmp/pip-req-build-3qtitf2q
  Running command git clone --filter=blob:none --quiet https://github.com/pycaret/pycaret.git /tmp/pip-req-build-3qtitf2q
  Resolved https://github.com/pycaret/pycaret.git to commit 885ebb81055ac017e50080db7731973d1a222e19
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting joblib<1.4,>=1.2.0 (from pycaret==3.3.2)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret==3.3.2)
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret==3.3.2)
  D

Explanation of this project :

Based on the problem description of predicting the plant species present at a given location, this would typically involve a classification machine learning method rather than linear regression.


Classification is a supervised learning technique where the goal is to assign a categorical class label (e.g., plant species name) to new data points based on learning from a training dataset containing examples with known class labels.

Some common classification algorithms that could be applied to this plant species prediction problem include:


1) Decision Trees

2) Random Forests

3) Support Vector Machines (SVMs)

4) Naive Bayes

5) Logistic Regression

6) Neural Networks



Linear regression, on the other hand, is used for predicting a continuous numerical value output rather than a categorical class. It models the relationship between independent variables (e.g. environmental factors) and a dependent variable (e.g. plant abundance) as a linear equation.

Since the goal here is to predict the specific plant species names/labels at a location based on environmental conditions, this fits better as a classification task. Each plant species would be treated as a distinct class label.

However, in some cases, researchers may first use regression methods to model species abundance/distribution as a function of environmental variables, and then classify presence/absence based on the predicted abundance values. But predominantly, species distribution modeling relies on classification algorithms to directly predict the categorical species labels.

In [2]:
import pandas as pd

In [5]:
data = pd.read_csv('/content/GLC24_PA_metadata_train.csv')

In [6]:
data.head(10)

Unnamed: 0,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId,surveyId
0,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,6874.0,212
1,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,476.0,212
2,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,11157.0,212
3,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,8784.0,212
4,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,4530.0,212
5,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,10520.0,212
6,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,9458.0,212
7,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,982.0,212
8,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,51.0,212
9,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,3935.0,212


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32698 entries, 0 to 32697
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   lon                32698 non-null  float64
 1   lat                32698 non-null  float64
 2   year               32698 non-null  int64  
 3   geoUncertaintyInM  32379 non-null  float64
 4   areaInM2           28636 non-null  float64
 5   region             32698 non-null  object 
 6   country            32698 non-null  object 
 7   speciesId          32698 non-null  float64
 8   surveyId           32698 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 2.2+ MB


In [8]:
import pandas as pd

# Remove rows with missing values
data.dropna(inplace=True)

In [9]:
data

Unnamed: 0,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId,surveyId
0,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,6874.0,212
1,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,476.0,212
2,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,11157.0,212
3,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,8784.0,212
4,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,4530.0,212
...,...,...,...,...,...,...,...,...,...
32693,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,8512.0,90805
32694,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,5114.0,90805
32695,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,963.0,90805
32696,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,1888.0,90805


In [10]:
# Convert string columns to numeric
numeric_columns = ['speciesId', 'surveyId']  # Specify the columns that should be numeric

In [11]:
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [12]:
data

Unnamed: 0,lon,lat,year,geoUncertaintyInM,areaInM2,region,country,speciesId,surveyId
0,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,6874.0,212
1,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,476.0,212
2,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,11157.0,212
3,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,8784.0,212
4,3.099038,43.134956,2021,5.0,100.0,MEDITERRANEAN,France,4530.0,212
...,...,...,...,...,...,...,...,...,...
32693,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,8512.0,90805
32694,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,5114.0,90805
32695,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,963.0,90805
32696,9.572950,54.923780,2019,10.0,707.0,CONTINENTAL,Denmark,1888.0,90805


In [13]:
# using pandas to identify and remove rows with infinite values

import pandas as pd

# Assuming df is your DataFrame containing the dataset
# Check for infinite values in the DataFrame
has_infinite = data.isin([float('inf'), float('-inf')]).any(axis=None)

# If any infinite values are found, remove the corresponding rows
if has_infinite:
    data = data[~data.isin([float('inf'), float('-inf')]).any(axis=1)]

# Now you can proceed with your analysis or further processing

In [14]:
import pandas as pd

# Assuming 'data' is your DataFrame
data['speciesId'] = data['speciesId'].astype('int64')

# Display the DataFrame to confirm the change
print(data.dtypes)

lon                  float64
lat                  float64
year                   int64
geoUncertaintyInM    float64
areaInM2             float64
region                object
country               object
speciesId              int64
surveyId               int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['speciesId'] = data['speciesId'].astype('int64')


In [15]:
# Check the number of unique values in the target variable
unique_values = data['speciesId'].nunique()

# Print the result
print(f"Number of unique values in 'speciesId': {unique_values}")

Number of unique values in 'speciesId': 1971


Classification using PyCaret

In [16]:
from pycaret.classification import setup
import pandas as pd
from sklearn.datasets import make_classification

# Create a sample dataset with imbalanced classes
data = make_classification(n_samples=100, n_features=4, n_classes=3, n_clusters_per_class=1, weights=[0.6, 0.3, 0.1], random_state=42)
df = pd.DataFrame(data[0], columns=[f'feature_{i}' for i in range(1, 5)])
df['speciesId'] = data[1]

# Check class distribution
class_distribution = df['speciesId'].value_counts()
print("Class distribution:")
print(class_distribution)

# Check if any class has only one instance
if class_distribution.min() < 2:
    # Handle the situation where a class has only one instance
    print("One or more classes have only one instance. Handle this situation appropriately.")
else:
    # Setup the Pycaret classification
    s = setup(data=df, target='speciesId', session_id=123)
    # Continue with model training


Class distribution:
speciesId
0    60
1    30
2    10
Name: count, dtype: int64


Unnamed: 0,Description,Value
0,Session id,123
1,Target,speciesId
2,Target type,Multiclass
3,Original data shape,"(100, 5)"
4,Transformed data shape,"(100, 5)"
5,Transformed train set shape,"(70, 5)"
6,Transformed test set shape,"(30, 5)"
7,Numeric features,4
8,Preprocess,True
9,Imputation type,simple


In [21]:
# OOP API
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9857,0.0,0.9,0.9,0.9,0.9682,0.9715,0.62
knn,K Neighbors Classifier,0.9857,0.7,0.9,0.9,0.9,0.9682,0.9715,0.045
qda,Quadratic Discriminant Analysis,0.9857,0.0,0.9,0.9,0.9,0.9682,0.9715,0.034
nb,Naive Bayes,0.9714,0.7,0.8857,0.8762,0.88,0.9423,0.9487,0.04
svm,SVM - Linear Kernel,0.9714,0.0,0.8857,0.8762,0.88,0.9423,0.9487,0.037
ridge,Ridge Classifier,0.9714,0.0,0.8857,0.8762,0.88,0.9423,0.9487,0.028
xgboost,Extreme Gradient Boosting,0.9714,0.7,0.8857,0.8743,0.8794,0.9402,0.9476,0.068
lightgbm,Light Gradient Boosting Machine,0.9714,0.7,0.8857,0.8762,0.88,0.9423,0.9487,0.087
lda,Linear Discriminant Analysis,0.9571,0.0,0.8714,0.869,0.8657,0.9181,0.9272,0.031
et,Extra Trees Classifier,0.9571,0.6976,0.8714,0.8524,0.86,0.9163,0.9258,0.164


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [22]:
# Analyze Model

s.evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [26]:
# predictions

s.predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,speciesId,prediction_label,prediction_score
80,-1.559106,-0.334315,-1.045076,0.37879,0,0,0.9726
49,-1.099776,0.371486,-0.989455,1.045783,0,0,0.9863
27,2.100366,2.581129,0.522789,2.221406,1,1,0.9937
22,-1.380295,-0.012781,-1.042854,0.698409,0,0,0.9845
5,-0.95668,0.403053,-0.893905,1.012151,0,0,0.9762
66,-1.177097,0.12349,-0.945156,0.767886,0,0,0.9776
71,2.901147,3.572376,0.719128,3.077527,1,1,0.9981
25,-1.160536,0.153369,-0.944992,0.797616,0,0,0.9783
57,0.974777,1.209626,0.237754,1.045987,1,1,0.9621
43,-0.573012,1.028653,-0.862425,1.615506,0,0,0.9768


In [49]:
#create dummy model

dummy = create_model('dummy')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7143,0.0,0.0,0.0,0.0,0.0,0.0
1,0.7143,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5714,0.0,0.0,0.0,0.0,0.0,0.0
3,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
4,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
5,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
6,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
7,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
8,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
9,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [50]:
from pycaret.classification import create_model
import pickle

# Create Decision Tree model
dummy = create_model('dummy')

# Save the model to a file
output_file = '/content/sample_data/clef.pkl'
with open(output_file, 'wb') as f:
    pickle.dump(dt, f)
    print('Saved: %s' % output_file)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7143,0.0,0.0,0.0,0.0,0.0,0.0
1,0.7143,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5714,0.0,0.0,0.0,0.0,0.0,0.0
3,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
4,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
5,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
6,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
7,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
8,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0
9,0.5714,0.5,0.5714,0.3265,0.4156,0.0,0.0


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Saved: /content/sample_data/clef.pkl
