In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV (path is relative to the notebook's working directory) into `data`.
csv_path = './DATA/forestcleaned.csv'
data = pd.read_csv(csv_path)

In [6]:
# Remove the 'Unnamed: 0' column that read_csv produced from the file.
# errors='ignore' keeps this cell safe to re-run: `data` is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Preview the first five rows (5 is head()'s default, spelled out here).
data.head(n=5)

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,month,day
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,3,5
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,10,2
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,10,6
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,3,5
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,3,7


In [8]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_flags = data.duplicated()
duplicates = duplicate_flags.sum()

In [9]:
# Display the number of fully duplicated rows counted above.
duplicates

4

In [10]:
# Count missing values per column.
# The original referenced `data_cleaned`, which no visible cell defines, so this
# cell raised NameError on a fresh kernel; the frame being inspected is `data`.
missing_values = data.isnull().sum()

In [11]:
# Display the per-column missing-value counts.
missing_values

X        0
Y        0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
month    0
day      0
dtype: int64

In [12]:
# Capture each column's dtype so it can be inspected in the next cell.
data_types = data.dtypes

In [13]:
# Display the column dtypes.
data_types

X          int64
Y          int64
FFMC     float64
DMC      float64
DC       float64
ISI      float64
temp     float64
RH         int64
wind     float64
rain     float64
area     float64
month      int64
day        int64
dtype: object

In [14]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence
# (keep='first' is the default, spelled out here). The index is not reset, so the
# surviving rows keep their original labels.
data = data.drop_duplicates(keep='first')

In [16]:
# Columns to standardize and screen for outliers.
# `month` and `day` are stored as int64, so a plain numeric-dtype selection would
# pick them up and scale them. The recorded feature list and the downstream tables
# show them left unscaled, so they are excluded explicitly to make a fresh run
# reproduce those results.
# NOTE(review): presumably these are integer-encoded categoricals; confirm.
encoded_categoricals = {'month', 'day'}
numerical_features = [
    column
    for column in data.select_dtypes(include=[np.number]).columns
    if column not in encoded_categoricals
]

In [17]:
# Display the list of columns selected for scaling / outlier screening.
numerical_features

['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area']

In [20]:
# Standardize the selected columns in place: fit the scaler on them, then write
# the transformed values back into the same columns of `data`.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

In [21]:
# Absolute z-score of every value in the selected columns.
# These columns were already standardized by the scaler cell, so the recorded
# values match the magnitudes of the scaled data.
signed_z = stats.zscore(data[numerical_features])
z_scores = np.abs(signed_z)

In [22]:
# Display the absolute z-scores.
z_scores

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,1.001657,0.569075,0.801628,1.322179,1.837925,0.858350,1.835573,0.406971,1.498430,0.073555,0.201967
1,1.001657,0.244116,0.006761,1.178717,0.484659,0.508323,0.152432,0.694428,1.733018,0.073555,0.201967
2,1.001657,0.244116,0.006761,1.049290,0.556583,0.508323,0.736379,0.694428,1.510159,0.073555,0.201967
3,1.433101,1.382265,0.191956,1.211464,1.905808,0.005160,1.818399,3.221658,0.005865,0.600261,0.201967
4,1.433101,1.382265,0.241608,0.930777,1.806003,0.126100,1.285976,3.344035,1.231586,0.073555,0.201967
...,...,...,...,...,...,...,...,...,...,...,...
512,0.292676,1.057306,1.632626,0.846571,0.470516,1.558403,1.530710,0.755617,0.730155,0.073555,0.101074
513,1.155564,0.244116,1.632626,0.846571,0.470516,1.558403,0.517390,1.630748,0.996998,0.073555,0.648570
514,1.001657,0.244116,1.632626,0.846571,0.470516,1.558403,0.397166,1.569559,1.498430,0.073555,0.027128
515,1.587008,0.244116,0.679715,0.545947,0.264846,0.498003,1.152862,0.143728,0.005865,0.073555,0.201967


In [23]:
# Count the entries whose absolute z-score exceeds 3 (summed per column when
# `z_scores` is a DataFrame, as in the recorded output).
# Note: the following cell rebinds `outliers` to the flagged rows themselves.
above_three = z_scores > 3
outliers = above_three.sum()

In [25]:
# A row is flagged when at least one of its features has |z| above the threshold.
threshold = 3
exceeds_threshold = z_scores > threshold
outlier_mask = exceeds_threshold.any(axis=1)
# Select the flagged rows of `data`.
outliers = data[outlier_mask]

In [28]:
# For each flagged row, count how many features exceed the threshold.
# `outliers` is a boolean-filtered selection of `data`; writing a column into it
# with `.loc[:, col] = ...` can raise SettingWithCopyWarning. `assign` builds a
# new frame instead, so nothing is written through to a possible view.
outliers = outliers.assign(
    num_outliers=(z_scores[outlier_mask] > threshold).sum(axis=1)
)

In [29]:
# Display the flagged rows together with their per-row outlier count.
outliers

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,month,day,num_outliers
3,1.433101,1.382265,0.191956,-1.211464,-1.905808,-0.00516,-1.818399,3.221658,-0.005865,0.600261,-0.201967,3,5,1
4,1.433101,1.382265,-0.241608,-0.930777,-1.806003,0.1261,-1.285976,3.344035,-1.231586,-0.073555,-0.201967,3,7,1
12,0.570213,0.569075,-4.902419,-0.6267,0.469304,-1.799046,-0.324181,1.691937,1.49843,-0.073555,-0.201967,8,5,1
22,1.001657,-0.244116,0.66165,-0.22906,-1.410825,10.298748,0.362816,-0.02135,0.272708,-0.073555,-0.201967,6,7,1
75,1.864545,3.821836,-1.162931,-1.624697,-2.111479,-0.289557,-2.093197,2.120259,-0.507296,-0.073555,-0.201967,2,4,1
76,1.864545,3.821836,-0.729368,-1.524898,-2.045212,-0.814596,-0.547455,-0.082539,-0.507296,-0.073555,-0.201967,2,5,1
97,-0.72412,-0.244116,-3.908836,-1.69331,-2.15633,-1.820923,-0.255481,-1.245127,0.77414,-0.073555,-0.201967,3,6,1
130,-0.292676,1.382265,-4.053357,-1.39547,-1.866614,-1.799046,-0.598979,-0.266106,-0.730155,-0.073555,-0.201967,2,6,1
138,1.864545,3.821836,-0.873889,-0.977558,-0.952611,-1.12087,-0.152432,-0.143728,-0.730155,-0.073555,-0.196327,7,2,1
151,1.864545,3.821836,-0.097087,-0.661006,-0.783711,-0.39894,1.015463,-0.939183,-1.008728,-0.073555,-0.18066,7,7,1


In [35]:
# Remove the row labelled 379 from the outlier listing.
# NOTE(review): the reason for singling out label 379 is not recorded in the
# notebook; document it here. The label refers to the original row index, since
# the index is never reset after de-duplication.
# errors='ignore' keeps the cell re-runnable: once 379 is gone, a plain drop
# would raise KeyError on a second execution.
outliers = outliers.drop(index=379, errors='ignore')

In [36]:
# Number of rows remaining in the outlier listing.
outliers.shape[0]

29

In [42]:
# Keep only the rows where no feature exceeded the z-score threshold; .copy()
# makes the result an independent frame.
# NOTE(review): this filters with `outlier_mask`, not with the `outliers` frame,
# so the row that was dropped from `outliers` (label 379) is still excluded
# here. Confirm that is intended.
df_cleaned = data[~outlier_mask].copy()

  df_cleaned = data[~outlier_mask].copy()


In [43]:
# Preview the first five rows of the outlier-free frame (5 is head()'s default).
df_cleaned.head(n=5)

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,month,day
0,1.001657,0.569075,-0.801628,-1.322179,-1.837925,-0.85835,-1.835573,0.406971,1.49843,-0.073555,-0.201967,3,5
1,1.001657,-0.244116,-0.006761,-1.178717,0.484659,-0.508323,-0.152432,-0.694428,-1.733018,-0.073555,-0.201967,10,2
2,1.001657,-0.244116,-0.006761,-1.04929,0.556583,-0.508323,-0.736379,-0.694428,-1.510159,-0.073555,-0.201967,10,6
5,1.433101,1.382265,0.300346,-0.400591,-0.247109,1.24181,0.568915,-0.939183,0.77414,-0.073555,-0.201967,8,7
6,1.433101,1.382265,0.300346,-0.344454,-0.216399,-0.114543,0.895238,-1.061561,-0.507296,-0.073555,-0.201967,8,1


483

0.9342359767891683