<h1>Phase 1<br>Data Preprocessing
</h1>(4 steps)

<h3>Step 1 - Loading Data
</h3>

In [48]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance

In [49]:
df = pd.read_csv('deforestation_dataset.csv')
print(df.head())

     Country  Year  Forest_Loss_Area_km2  Tree_Cover_Loss_percent  \
0  Indonesia  1971                   560                 8.929641   
1     Brazil  1927                  3303                 4.638441   
2     Russia  1961                  4466                 4.679313   
3  Australia  1967                  3658                 1.535528   
4  Australia  1987                  2682                 8.035841   

   CO2_Emission_mt  Rainfall_mm  Population  GDP_Billion_USD  \
0              304  1635.715350    86759840      2551.805035   
1              341  1454.430241    83798502      2637.895996   
2              298  1744.809660    41477592      2880.724721   
3              285  1541.645853    71475964      2525.516988   
4              450  1752.997736    16256333       608.916586   

   Agriculture_Land_Percent  Deforestation_Policy_Strictness  \
0                 59.316366                                3   
1                 14.211099                                4   
2       

In [50]:
print(df.isnull().sum())

Country                            0
Year                               0
Forest_Loss_Area_km2               0
Tree_Cover_Loss_percent            0
CO2_Emission_mt                    0
Rainfall_mm                        0
Population                         0
GDP_Billion_USD                    0
Agriculture_Land_Percent           0
Deforestation_Policy_Strictness    0
Corruption_Index                   0
International_Aid_Million_USD      0
Illegal_Lumbering_Incidents        0
Protected_Areas_Percent            0
dtype: int64


In [51]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          100 non-null    object 
 1   Year                             100 non-null    int64  
 2   Forest_Loss_Area_km2             100 non-null    int64  
 3   Tree_Cover_Loss_percent          100 non-null    float64
 4   CO2_Emission_mt                  100 non-null    int64  
 5   Rainfall_mm                      100 non-null    float64
 6   Population                       100 non-null    int64  
 7   GDP_Billion_USD                  100 non-null    float64
 8   Agriculture_Land_Percent         100 non-null    float64
 9   Deforestation_Policy_Strictness  100 non-null    int64  
 10  Corruption_Index                 100 non-null    float64
 11  International_Aid_Million_USD    100 non-null    int64  
 12  Illegal_Lumbering_Incid

In [52]:
print(df.describe(include='all'))

       Country         Year  Forest_Loss_Area_km2  Tree_Cover_Loss_percent  \
count      100   100.000000            100.000000               100.000000   
unique       5          NaN                   NaN                      NaN   
top     Russia          NaN                   NaN                      NaN   
freq        24          NaN                   NaN                      NaN   
mean       NaN  1973.900000           2402.040000                 5.581324   
std        NaN    30.521561           1289.357713                 2.486552   
min        NaN  1925.000000            503.000000                 1.535528   
25%        NaN  1946.750000           1288.500000                 3.409892   
50%        NaN  1972.500000           2159.000000                 5.540553   
75%        NaN  1997.250000           3495.500000                 7.642558   
max        NaN  2023.000000           4949.000000                 9.791851   

        CO2_Emission_mt  Rainfall_mm    Population  GDP_Billion

<h3>Step 2 - Data Cleaning
</h3>

In [53]:
df

Unnamed: 0,Country,Year,Forest_Loss_Area_km2,Tree_Cover_Loss_percent,CO2_Emission_mt,Rainfall_mm,Population,GDP_Billion_USD,Agriculture_Land_Percent,Deforestation_Policy_Strictness,Corruption_Index,International_Aid_Million_USD,Illegal_Lumbering_Incidents,Protected_Areas_Percent
0,Indonesia,1971,560,8.929641,304,1635.715350,86759840,2551.805035,59.316366,3,9.426264,238,184,7.005531
1,Brazil,1927,3303,4.638441,341,1454.430241,83798502,2637.895996,14.211099,4,2.602618,418,78,20.044415
2,Russia,1961,4466,4.679313,298,1744.809660,41477592,2880.724721,44.869699,2,51.917315,186,49,22.747603
3,Australia,1967,3658,1.535528,285,1541.645853,71475964,2525.516988,10.824516,4,23.716328,190,2,22.701362
4,Australia,1987,2682,8.035841,450,1752.997736,16256333,608.916586,14.577190,4,21.424037,159,41,18.085869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Australia,2005,1809,1.544935,93,1893.986221,29915949,3312.288237,34.401813,2,22.860613,291,95,24.221198
96,Australia,2001,2017,4.264310,440,1561.190914,81849918,4673.534282,49.675052,2,7.573423,428,7,14.284021
97,Australia,1981,3960,2.316448,288,1442.880729,25174137,377.178732,22.751577,4,85.822326,265,142,5.259525
98,Australia,1973,2466,6.562127,174,1671.741142,12523167,517.529578,17.310985,2,37.465179,231,195,6.211250


In [54]:
# Handle missing values: for simplicity, fill numeric columns with their median and categorical with mode
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

# Convert categorical columns to numeric
# 'Deforestation_Policy_Strictness' and 'Corruption_Index' are already numeric based on info
# If there are other object-type columns (e.g., 'Country'), use one-hot encoding
if df['Country'].dtype == 'object':
    df = pd.get_dummies(df, columns=['Country'], drop_first=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [55]:
df

Unnamed: 0,Year,Forest_Loss_Area_km2,Tree_Cover_Loss_percent,CO2_Emission_mt,Rainfall_mm,Population,GDP_Billion_USD,Agriculture_Land_Percent,Deforestation_Policy_Strictness,Corruption_Index,International_Aid_Million_USD,Illegal_Lumbering_Incidents,Protected_Areas_Percent,Country_Brazil,Country_India,Country_Indonesia,Country_Russia
0,1971,560,8.929641,304,1635.715350,86759840,2551.805035,59.316366,3,9.426264,238,184,7.005531,False,False,True,False
1,1927,3303,4.638441,341,1454.430241,83798502,2637.895996,14.211099,4,2.602618,418,78,20.044415,True,False,False,False
2,1961,4466,4.679313,298,1744.809660,41477592,2880.724721,44.869699,2,51.917315,186,49,22.747603,False,False,False,True
3,1967,3658,1.535528,285,1541.645853,71475964,2525.516988,10.824516,4,23.716328,190,2,22.701362,False,False,False,False
4,1987,2682,8.035841,450,1752.997736,16256333,608.916586,14.577190,4,21.424037,159,41,18.085869,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2005,1809,1.544935,93,1893.986221,29915949,3312.288237,34.401813,2,22.860613,291,95,24.221198,False,False,False,False
96,2001,2017,4.264310,440,1561.190914,81849918,4673.534282,49.675052,2,7.573423,428,7,14.284021,False,False,False,False
97,1981,3960,2.316448,288,1442.880729,25174137,377.178732,22.751577,4,85.822326,265,142,5.259525,False,False,False,False
98,1973,2466,6.562127,174,1671.741142,12523167,517.529578,17.310985,2,37.465179,231,195,6.211250,False,False,False,False


In [56]:
print(df.isnull().sum())

Year                               0
Forest_Loss_Area_km2               0
Tree_Cover_Loss_percent            0
CO2_Emission_mt                    0
Rainfall_mm                        0
Population                         0
GDP_Billion_USD                    0
Agriculture_Land_Percent           0
Deforestation_Policy_Strictness    0
Corruption_Index                   0
International_Aid_Million_USD      0
Illegal_Lumbering_Incidents        0
Protected_Areas_Percent            0
Country_Brazil                     0
Country_India                      0
Country_Indonesia                  0
Country_Russia                     0
dtype: int64


<h3>Step 3 - Feature Scaling
</h3>

In [57]:
# Select numerical features for scaling (excluding target and one-hot columns)
num_features = [
    'Year', 'Forest_Loss_Area_km2', 'Tree_Cover_Loss_percent', 'CO2_Emission_mt',
    'Rainfall_mm', 'Population', 'GDP_Billion_USD', 'Agriculture_Land_Percent',
    'Deforestation_Policy_Strictness', 'Corruption_Index', 'International_Aid_Million_USD',
    'Illegal_Lumbering_Incidents', 'Protected_Areas_Percent'
]

scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[num_features] = scaler.fit_transform(df[num_features])

# Example: Assume 'Country_Brazil' is the target for demonstration
X = df_scaled[num_features + ['Country_India', 'Country_Indonesia', 'Country_Russia']]
y = df['Country_Brazil']

# Train a simple SVM model
svm = SVC(kernel='linear')
svm.fit(X, y)

# Feature importance using permutation importance
result = permutation_importance(svm, X, y, n_repeats=10, random_state=42)
importances = pd.Series(result.importances_mean, index=X.columns)
importances = importances.sort_values(ascending=False)
print("Feature importances (permutation importance):")
print(importances)

# Optionally, select top features if needed
top_features = importances[importances > 0].index.tolist()
print("Top features selected:", top_features)

Feature importances (permutation importance):
Country_Indonesia                  0.162
Country_Russia                     0.121
Country_India                      0.100
International_Aid_Million_USD      0.030
GDP_Billion_USD                    0.027
Tree_Cover_Loss_percent            0.006
Forest_Loss_Area_km2               0.000
CO2_Emission_mt                   -0.001
Illegal_Lumbering_Incidents       -0.002
Corruption_Index                  -0.005
Protected_Areas_Percent           -0.007
Population                        -0.007
Deforestation_Policy_Strictness   -0.008
Agriculture_Land_Percent          -0.009
Year                              -0.012
Rainfall_mm                       -0.013
dtype: float64
Top features selected: ['Country_Indonesia', 'Country_Russia', 'Country_India', 'International_Aid_Million_USD', 'GDP_Billion_USD', 'Tree_Cover_Loss_percent']


<h3>Step 4 - Split Data into Training and Testing Sets
</h3>

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (80, 16)
Testing set size: (20, 16)


<h1>Phase 2<br>Model Building and Evaluation
</h1>(3 steps)