-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
datasets.py
167 lines (124 loc) · 4.57 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Module to get datasets in pycaret
"""
from typing import Optional
import requests
from pycaret.utils._dependencies import _check_soft_dependencies
def get_data(
    dataset: str = "index",
    folder: Optional[str] = None,
    save_copy: bool = False,
    profile: bool = False,
    verbose: bool = True,
    address: Optional[str] = None,
):
    """
    Function to load sample datasets.

    Order of read:
    (1) Tries to read dataset from local folder first.
    (2) Then tries to read dataset from folder in GitHub "address" (see below)
    (3) Then tries to read from sktime (if installed)
    (4) Raises error if none exist

    List of available datasets on GitHub can be checked using
    (1) ``get_data('index')`` or
    (2) ``get_data('index', folder='time_series/seasonal')``
    (see available "folder" options below)


    Example
    -------
    >>> from pycaret.datasets import get_data
    >>> all_datasets = get_data('index')
    >>> juice = get_data('juice')


    dataset: str, default = 'index'
        Index value of dataset.


    folder: Optional[str], default = None
        The folder from which to get the data.
        If 'None', gets it from the "common" folder. Other options are:
            - time_series/seasonal
            - time_series/random_walk
            - time_series/white_noise
        Only used when ``address`` is None.


    save_copy: bool, default = False
        When set to true, it saves a copy in current working directory
        (as ``<dataset>.csv``).


    profile: bool, default = False
        When set to true, an interactive EDA report is displayed.
        Ignored when ``dataset`` is 'index'.


    verbose: bool, default = True
        When set to False, head of data is not displayed. The 'index'
        listing is passed to the display regardless of this flag.


    address: string, default = None
        Download url of dataset. Defaults to None which fetches the dataset from
        "https://raw.githubusercontent.com/pycaret/datasets/main/". For people
        having difficulty linking to github, they can change the default address
        to their own
        (e.g. "https://gitee.com/IncubatorShokuhou/pycaret/raw/master/datasets/")
        A trailing "/" on the address is optional.


    Returns:
        pandas.DataFrame


    Warnings
    --------
    - Use of ``get_data`` requires internet connection, unless the dataset
      exists as a csv file in the current working directory or can be
      served by sktime.


    Raises
    ------
    ImportError
        When trying to import time series datasets that require sktime,
        but sktime has not been installed.

    ValueError
        If the data does not exist in any of the locations tried.

    requests.RequestException
        If the download failed because of a network problem and the dataset
        is not one of the sktime fallbacks.
    """

    import io
    import os.path

    import pandas as pd

    from pycaret.internal.display import CommonDisplay

    # Seconds to wait for the server to connect / send data. Without a
    # timeout ``requests`` waits forever on a stalled connection.
    request_timeout = 30

    extension = ".csv"
    filename = str(dataset) + extension

    if address is None:
        root = "https://raw.githubusercontent.com/pycaret/datasets/main/"
        data_dir, meta_dir = "data/", "meta/"
        folder = "common" if folder is None else folder
        # The index listing lives under "meta/", actual datasets under "data/".
        sub_dir = meta_dir if dataset == "index" else data_dir
        complete_address = root + sub_dir + folder + "/" + filename
    else:
        # Strip trailing slashes so that an address given with a trailing "/"
        # (as in the docstring example) does not produce "//" in the URL.
        complete_address = address.rstrip("/") + "/" + filename

    sktime_datasets = ["airline", "lynx", "uschange"]

    # Read the file name from local folder first
    # If it does not exist, then read the file from GitHub
    # If that does not exist then read sktime datasets
    if os.path.isfile(filename):
        data = pd.read_csv(filename)
    else:
        response = None
        network_error = None
        try:
            response = requests.get(complete_address, timeout=request_timeout)
        except requests.RequestException as err:
            # Remember the failure but keep going: the dataset may still be
            # available offline through sktime.
            network_error = err

        if response is not None and response.status_code == 200:
            # Parse the payload that was already downloaded instead of
            # fetching the same URL a second time through pandas.
            data = pd.read_csv(io.BytesIO(response.content))
        elif dataset in sktime_datasets:
            from sktime.datasets import load_airline, load_lynx, load_uschange

            ts_dataset_mapping = {
                "airline": load_airline,
                "lynx": load_lynx,
                "uschange": load_uschange,
            }
            data = ts_dataset_mapping[dataset]()
            if isinstance(data, tuple):
                # Loader returned a (y, X) pair; join them column-wise.
                y = data[0]
                X = data[1]
                data = pd.concat([y, X], axis=1)
        elif network_error is not None:
            # No fallback applies, so surface the original network failure.
            raise network_error
        else:
            raise ValueError("Data could not be read. Please check your inputs...")

    if save_copy:
        save_name = filename
        data.to_csv(save_name, index=False)

    display = CommonDisplay(
        verbose=True,
        html_param=True,
    )

    if dataset == "index":
        display.display(data)
    elif profile:
        _check_soft_dependencies(
            "pandas_profiling",
            extra="analysis",
            severity="error",
            install_name="pandas-profiling",
        )
        import pandas_profiling

        # Profile a copy so the returned frame is independent of the report.
        data_for_profiling = data.copy()
        pf = pandas_profiling.ProfileReport(data_for_profiling)
        display.display(pf)
    elif verbose:
        display.display(data.head())

    return data