In [75]:
from pathlib import Path

import pandas as pd

In [76]:
# Read the raw land-mines CSV into a DataFrame; the path is relative to the notebook.
RAW_CSV = '../data/raw/land_mines.csv'
data = pd.read_csv(RAW_CSV)

# Basic checks and train/test split

In [77]:
# Inspect the raw frame: row count, column dtypes and non-null counts
# (a quick check for missing values before any transformation).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V       338 non-null    float64
 1   H       338 non-null    float64
 2   S       338 non-null    float64
 3   M       338 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 10.7 KB


In [78]:
# Recode the target to binary and give the single-letter columns readable names.
# M == 1 becomes 0 and every other M value becomes 1.
# NOTE(review): presumably M == 1 is the "no mine" class - confirm against the
# dataset description.
COLUMN_NAMES = {"M": "mine", "V": "voltage", "H": "height", "S": "soil"}

# Guard on the original column name so that re-running this cell is a no-op
# instead of raising KeyError on "M" (the column is already renamed after the
# first run). The chain builds a new frame rather than mutating in place.
if "M" in data.columns:
    data = (
        data
        .assign(M=lambda frame: (frame["M"] != 1).astype(int))
        .rename(columns=COLUMN_NAMES)
    )

In [79]:
from sklearn.model_selection import train_test_split

# Split parameters, named so they are easy to find and tune.
TEST_FRACTION = 0.2  # share of rows held out for the test split
SPLIT_SEED = 42      # fixed seed so the split is reproducible across kernel restarts

# Stratify on the binary target so both splits keep approximately the same
# mine / no-mine ratio as the full data set.
train, test = train_test_split(
    data,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=data["mine"],
)

# Sanity checks: every row lands in exactly one of the two splits.
assert len(train) + len(test) == len(data)
assert train.index.intersection(test.index).empty

In [80]:
# Check the training split: row count, dtypes and non-null counts after the split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 270 entries, 130 to 162
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   voltage  270 non-null    float64
 1   height   270 non-null    float64
 2   soil     270 non-null    float64
 3   mine     270 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 10.5 KB


In [81]:
# Same check for the held-out test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68 entries, 327 to 271
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   voltage  68 non-null     float64
 1   height   68 non-null     float64
 2   soil     68 non-null     float64
 3   mine     68 non-null     int64  
dtypes: float64(3), int64(1)
memory usage: 2.7 KB


In [82]:
# Write both splits to the interim data folder.
INTERIM_DIR = Path('../data/interim')

# to_csv does not create missing folders and fails if the target directory is
# absent (e.g. on a fresh checkout), so make sure it exists first.
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

# index=False leaves the row index out of the written files.
train.to_csv(INTERIM_DIR / 'train.csv', index=False)
test.to_csv(INTERIM_DIR / 'test.csv', index=False)

# Exploration

In [83]:
# Summary statistics for the training split only; the test split is left
# untouched during exploration.
train.describe()

Unnamed: 0,voltage,height,soil,mine
count,270.0,270.0,270.0,270.0
mean,0.428744,0.511448,0.486667,0.788889
std,0.195421,0.303958,0.347973,0.408855
min,0.197734,0.0,0.0,0.0
25%,0.308157,0.272727,0.2,1.0
50%,0.356495,0.545455,0.4,1.0
75%,0.478851,0.818182,0.8,1.0
max,0.999999,1.0,1.0,1.0


In [84]:
# Class balance of the binary target in the training split.
# In the output below the positive class outnumbers the negative one roughly 4:1.
train["mine"].value_counts()

mine
1    213
0     57
Name: count, dtype: int64

In [85]:
# soil is stored as a float but takes only six distinct values
# (0.0 to 1.0 in steps of 0.2 in the output below), so it behaves like an
# encoded category rather than a continuous measurement.
train["soil"].value_counts()

soil
0.0    51
0.4    46
1.0    46
0.6    44
0.2    43
0.8    40
Name: count, dtype: int64

In [None]:
# voltage is a continuous numeric feature; peek at the first few values.
train["voltage"].head()

130    0.352960
228    0.253776
101    0.303262
114    0.314199
86     0.999999
Name: voltage, dtype: float64

In [None]:
# height is numeric but effectively discrete: the output below shows only
# 12 distinct values, evenly spaced at multiples of 1/11 between 0 and 1.
train["height"].value_counts()

height
0.545455    26
0.818182    26
0.181818    25
0.636364    25
0.454545    24
0.272727    24
1.000000    23
0.363636    22
0.909091    20
0.727273    19
0.090909    19
0.000000    17
Name: count, dtype: int64