-
Notifications
You must be signed in to change notification settings - Fork 9
/
data.py
103 lines (87 loc) · 3.07 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""Group a dataframe by formula (preserving indices) and extract data from module."""
# retrieve static file from package: https://stackoverflow.com/a/20885799/13697228
from importlib.resources import open_text
import pandas as pd
from sklearn.model_selection import train_test_split
def groupby_formula(df, how="max"):
    """Group identical compositions together and preserve original indices.

    See https://stackoverflow.com/a/49216427/13697228

    Parameters
    ----------
    df : DataFrame
        At minimum should contain "formula" and "target" columns. The index is
        expected to be unnamed, so that ``reset_index`` exposes it as a column
        called "index".
    how : str, optional
        Aggregation applied to the "target" values of each formula group,
        e.g. "mean" or "max", by default "max". The value is handed to
        pandas' ``agg`` unchanged.

    Returns
    -------
    DataFrame
        One row per unique formula with the columns "formula", "index" (tuple
        of the original index labels that share the formula), "target" (the
        aggregated target) and "count" (number of original rows in the group).
    """
    grp_df = (
        df.reset_index()
        .groupby(by="formula")
        # Use the requested aggregation; "max" used to be hard-coded here, which
        # silently ignored the `how` argument.
        .agg({"index": lambda x: tuple(x), "target": how})
        .reset_index()
    )
    # Number of original rows that were merged into each formula.
    grp_df["count"] = grp_df["index"].apply(len)
    return grp_df
def data(
    module,
    fname="train.csv",
    groupby=True,
    dummy=False,
    split=True,
    val_size=0.2,
    test_size=0.0,
    random_state=42,
):
    """Grab data from within the subdirectories (modules) of mat_discover.

    Parameters
    ----------
    module : Module
        The module within mat_discover that contains e.g. "train.csv". For example,
        from crabnet.data.materials_data import elasticity
    fname : str, optional
        Filename of text file to open, by default "train.csv"
    groupby : bool, optional
        Whether to use groupby_formula to filter identical compositions, by
        default True
    dummy : bool, optional
        Whether to pare down the data to a small test set (at most the first
        100 rows), by default False
    split : bool, optional
        Whether to split the data into train, val, and (optionally) test sets,
        by default True
    val_size : float, optional
        Validation dataset fraction, by default 0.2
    test_size : float, optional
        Test dataset fraction, by default 0.0
    random_state : int, optional
        seed to use for the train/val/test split, by default 42

    Returns
    -------
    DataFrame
        If split==False, then the full DataFrame is returned directly
    DataFrame, DataFrame
        If test_size == 0 and split==True, then training and validation
        DataFrames are returned.
    DataFrame, DataFrame, DataFrame
        If test_size > 0 and split==True, then training, validation, and test
        DataFrames are returned.
    """
    # Read inside a context manager so the packaged resource is closed as soon
    # as parsing is done instead of leaking the open file handle.
    with open_text(module, fname) as train_csv:
        df = pd.read_csv(train_csv)

    if groupby:
        df = groupby_formula(df, how="max")

    if dummy:
        ntot = min(100, len(df))
        df = df.head(ntot)

    if not split:
        return df

    has_test = test_size > 0
    if has_test:
        # Carve off the test set first; the remainder is split into train/val.
        df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

    # val_size is a fraction of the full dataset, so rescale it relative to
    # what is left after the test rows were removed.
    train_df, val_df = train_test_split(
        df, test_size=val_size / (1 - test_size), random_state=random_state
    )

    if has_test:
        return train_df, val_df, test_df
    return train_df, val_df