# AutoML with MLjar

[Official website](https://mljar.com/automl)

In [None]:
# import the relevant packages
import pandas as pd

from supervised.automl import AutoML

## Preprocessing the data

### Training dataset

In [None]:
# load the dataset
train_data = pd.read_csv("../data/Census_income_train.csv")

In [None]:
# inspect the dataset
# all columns are features, except the 'income' column, which will be the target
train_data.head()

In [None]:
train_data.shape

In [None]:
# No null or NaN values
train_data.isnull().sum()

#### Removing rows with unknown values ('?')

In [None]:
# All missing or unknown values, however, are marked with a question mark (?)
# There are 3 columns which contain '?' - Workclass, Occupation, Native-country

In [None]:
# Let's start with the Workclass column
# We can obtain a list of boolean values indicating whether there is a '?' on the current row
train_data["Workclass"].str.contains("\?")

In [None]:
# Let's reverse all the boolean values
train_data["Workclass"].str.contains("\?") == False

In [None]:
# Take the subset of the dataframe rows which don't contain '?'
clean_train_data = train_data[train_data["Workclass"].str.contains("\?") == False]

In [None]:
len(clean_train_data)

In [None]:
# Let's do the same for 'Occupation'
clean_train_data = clean_train_data[clean_train_data["Occupation"].str.contains("\?") == False]

In [None]:
len(clean_train_data)

In [None]:
# And for 'Native-country'
clean_train_data = clean_train_data[clean_train_data["Native-country"].str.contains("\?") == False]

In [None]:
len(clean_train_data)

In [None]:
# Finally, let's reset the index
clean_train_data = clean_train_data.reset_index(drop=True)

#### Creating dummy variables and separating inputs and targets

In [None]:
# In the original data, there are both categorical and numerical data
# Decision trees and random forests can work with categorical data in general
# However, this is not implemented in sklearn
# So, we need to convert the categorical data to numerical
# We will do that with one-hot encoding

In [None]:
# Pandas can automatically do that for us with '.get_dummies'
train_dummies = pd.get_dummies(clean_train_data)

In [None]:
train_dummies.head()

In [None]:
# The last 2 columns are whether the income <= 50k and whether it is >50k
# Both of these carry the same information, so we will remove one of them
train_dummies = train_dummies.drop(['Income_ <=50K'],axis=1)

In [None]:
train_dummies.head()

In [None]:
# The input features are everything besides the last column
train_input = train_dummies.iloc[:,:-1]

# The target/output is just the last column
train_target = train_dummies.iloc[:,-1]

In [None]:
train_input.head()

In [None]:
train_target.head()

### Test dataset

In [None]:
# Let's do the same preprocessing on the test dataset

In [None]:
# Load test data
test_data = pd.read_csv("../data/Census_income_test.csv")

In [None]:
test_data.head()

In [None]:
len(test_data)

#### Cleaning unknown ('?') values

In [None]:
clean_test_data = test_data[test_data["Workclass"].str.contains("\?") == False]

In [None]:
len(clean_test_data)

In [None]:
clean_test_data = clean_test_data[clean_test_data["Occupation"].str.contains("\?") == False]

In [None]:
len(clean_test_data)

In [None]:
clean_test_data = clean_test_data[clean_test_data["Native-country"].str.contains("\?") == False]

In [None]:
len(clean_test_data)

In [None]:
clean_test_data = clean_test_data.reset_index(drop=True)

#### Creating dummy variables and separating inputs and targets

In [None]:
test_dummies = pd.get_dummies(clean_test_data)

In [None]:
test_dummies.head()

In [None]:
test_dummies = test_dummies.drop(['Income_ <=50K.'],axis=1)

In [None]:
test_dummies.head()

In [None]:
test_input = test_dummies.iloc[:,:-1]
test_target = test_dummies.iloc[:,-1]

In [None]:
test_target.head()

## Running MLjar

In [None]:
%%time
# The %%time cell magic above reports how long the whole cell takes to run.
# Create an AutoML object in 'Explain' mode (see the MLJAR docs for what each mode does),
# fit it on the training inputs/targets, then predict on the test inputs.
automl = AutoML(mode='Explain')
automl.fit(train_input, train_target)
# NOTE(review): the predictions are not compared with test_target in the visible part
# of the notebook - an evaluation step may follow or may be missing.
predictions = automl.predict(test_input)