# Automatic Data Insights
The fastest way to improve your data


**What is DQ auto?**

`dq.auto` is a helper function that trains a cutting-edge transformer (or any model of your choosing from HuggingFace) on your dataset so that it can be processed by Galileo. You provide the data; Galileo trains the model and provides you with data quality insights.

Requirements:
```bash
pip install dataquality
```

In [None]:
%%capture
# Install the packages this notebook needs into the running kernel's environment
%pip install dataquality
%pip install evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/222.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.3/222.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

# Get Started
First, we import dataquality and create our train, test, validation, and inference DataFrames.

In [None]:
# 🔭🌕 Galileo logging
import dataquality as dq
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
# Load the newsgroups dataset from sklearn (subset="all" requests every post)
newsgroups_ds = fetch_20newsgroups(subset="all")

# Convert to a pandas dataframe: one row per post, with its text and label
df = pd.DataFrame(
    {
        "text": newsgroups_ds.data,
        "label": newsgroups_ds.target,
    }
)

# Chain of seeded splits (random_state=1 keeps them reproducible).
# Resulting share of the full dataset, roughly:
#   df_test 20%, df_train 64%, df_val 12.8%, df_inf 3.2%
df_train, df_test = train_test_split(df, random_state=1, test_size=0.2)
df_train, df_val = train_test_split(df_train, random_state=1, test_size=0.2)
df_val, df_inf = train_test_split(df_val, random_state=1, test_size=0.2)

# Halve the inference frame into two equally sized parts
df_inf_1, df_inf_2 = train_test_split(df_inf, random_state=1, test_size=0.5)

# Run `dq.auto` for insights on text classification
Simply call `dq.auto` with your pandas DataFrames and you are ready to go.

Training takes around 14 minutes in Google Colab with early stopping.

In [None]:
# 🔭🌕 Galileo logging
dq.auto(
    # The three labelled splits built in the previous cell
    train_data=df_train,
    val_data=df_val,
    test_data=df_test,
    # Inference data is passed as a dict keyed by an inference name
    inference_data={"inference_test": df_inf},
    # Class names of the newsgroups dataset
    labels=newsgroups_ds.target_names,
    # Project and run under which the results are stored
    project_name="newsgroups_work",
    run_name="run_1_raw_data",
)

# Running inference on an existing model with Trainer
First, we load the fine-tuned model and its tokenizer from a local checkpoint using HuggingFace Transformers.
Then we run the Trainer with that model for inference.
To find the last checkpoint, we list the output folder and pick the checkpoint with the highest step number.

In [None]:
# List the contents of the finetuned/ folder to see which checkpoints exist
!ls finetuned

checkpoint-189	checkpoint-378	checkpoint-567	runs


In [None]:
# 🔭🌕 Galileo logging
import dataquality as dq
from dataquality.integrations.transformers_trainer import watch
from datasets import Dataset
from transformers import Trainer, AutoModelForSequenceClassification, AutoTokenizer
# Convert to pandas dataframes and split
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
# Load every post of the newsgroups dataset into a dataframe
newsgroups_ds = fetch_20newsgroups(subset="all")
df = pd.DataFrame(
    {
        "text": newsgroups_ds.data,
        "label": newsgroups_ds.target,
    }
)
# Keep a random 2000-row sample; no random_state is given, so the rows
# drawn here differ from run to run
df = df.sample(2000)

# Chain of seeded 80/20 splits: test, then validation, then inference
df_train, df_test = train_test_split(df, random_state=1, test_size=0.2)
df_train, df_val = train_test_split(df_train, random_state=1, test_size=0.2)
df_val, df_inf = train_test_split(df_val, random_state=1, test_size=0.2)

# Local model
def _latest_checkpoint(output_dir):
    """Return the path of the newest ``checkpoint-<step>`` folder in *output_dir*.

    The newest checkpoint is the one with the highest step number in its
    folder name. Step numbers depend on the training run (for example
    checkpoint-189, checkpoint-378, checkpoint-567), so the folder is looked
    up at run time instead of being hard-coded.

    Raises:
        FileNotFoundError: if *output_dir* does not exist or contains no
            ``checkpoint-<step>`` folder.
    """
    import re
    from pathlib import Path

    found = []
    for entry in Path(output_dir).iterdir():
        matched = re.fullmatch(r"checkpoint-(\d+)", entry.name)
        if matched and entry.is_dir():
            # Compare steps numerically: as strings, "567" would sort after "1000"
            found.append((int(matched.group(1)), str(entry)))
    if not found:
        raise FileNotFoundError(
            f"No checkpoint-<step> folders found in {output_dir!r}"
        )
    return max(found)[1]


# Load the model and tokenizer from the last checkpoint in the output folder
model_name = _latest_checkpoint("./finetuned")
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Build the inference dataset from a random 200-row subset of df
ds = Dataset.from_pandas(df.sample(200))
# 🔭🌕 Galileo logging: Add id column to dataset
ds = ds.map(lambda x, idx: {"id": idx}, with_indices=True)

def tokenize_function(examples):
    """Tokenize the "text" field of *examples* with the cell-level tokenizer.

    Truncation is enabled and padding is set to "max_length"; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize the dataset in batches
ds = ds.map(tokenize_function, batched=True)

# Name of the inference split; used both when logging the dataset and when
# selecting the split before prediction
inference_name = "inference_foo_bar"

# 🔭🌕 Galileo logging: Init project
dq.init(
    project_name="newsgroups_work",
    run_name="run_1_raw_data",
    task_type="text_classification",
)

# 🔭🌕 Galileo logging: Log inference dataset
dq.log_dataset(ds, split="inference", inference_name=inference_name)

# 🔭🌕 Galileo logging: Set labels
dq.set_labels_for_run(labels=newsgroups_ds.target_names)

# Wrap the loaded model in a Trainer; no training arguments are passed
trainer = Trainer(model)

# 🔭🌕 Galileo logging: Watch trainer
watch(trainer)

# 🔭🌕 Galileo logging: Set split and start predicting
dq.set_split("inference", inference_name=inference_name)
preds = trainer.predict(test_dataset=ds)

# 🔭🌕 Galileo logging: Finish run
dq.finish()