# Importing necessary modules.

In [2]:
import ast
import json
import os

import pandas

import cleaning_api

# Installing the data cleaning service (first use only).
If this is the first time you are running the data cleaning service on your system, follow these instructions:
1. Make sure that you have installed
    - Linux (Ubuntu/Debian recommended)
    - Python 2.7
    - Oracle Java 1.8
    - Apache Ant 1.8.2+
    - PostgreSQL 9.2+
2. Call the following method to install the data cleaning tools.

In [3]:
# First-use setup: installs the data cleaning tools. The recorded output shows that
# configuring NADEEF prompts for a PostgreSQL username and password.
cleaning_api.install_tools()

dBoost is installed.
KATARA is installed.
To configure NADEEF, please follow the following steps:
1. Create a database entitled 'nadeef' in the postgres.
2. Inter your postgres username: postgres
3. Inter your postgres password: 1234
NADEEF is installed.


# Entering path of input and output JSON files.
For example,
- As input: sources.json
- As output: destination.json

In [5]:
# Prompt on stdin for the paths of the two JSON descriptor files
# (raw_input is the Python 2 builtin; this notebook targets Python 2.7).
input_file_path = raw_input("Please enter path of input JSON file: ")
output_file_path = raw_input("Please enter path of output JSON file: ")

Please enter path of input JSON file: sources.json
Please enter path of output JSON file: destination.json


# Reading the input and output JSON files.

In [6]:
# Load the input descriptor: the folder holding the CSV tables and, optionally,
# a ";"-separated list naming the tables to clean. The file is opened in a
# `with` block so the handle is closed as soon as the JSON has been parsed.
with open(input_file_path, "r") as input_file:
    input_dictionary = json.load(input_file)
input_folder = input_dictionary["CSV"]["dir"]
if input_dictionary["CSV"]["table"]:
    # Strip surrounding spaces and drop blank entries so that a value such as
    # "a.csv; b.csv;" does not produce " b.csv" or an empty table name.
    input_tables = [
        table_name.strip()
        for table_name in input_dictionary["CSV"]["table"].split(";")
        if table_name.strip()
    ]
else:
    # No explicit table list: use every entry of the input folder.
    input_tables = os.listdir(input_folder)

# Load the output descriptor and make sure the destination folder exists.
with open(output_file_path, "r") as output_file:
    output_dictionary = json.load(output_file)
output_folder = output_dictionary["CSV"]["dir"]
if not os.path.exists(output_folder):
    # makedirs also creates missing parent folders, which os.mkdir would not.
    os.makedirs(output_folder)

# Iterating over the input dataset tables and running data cleaning tools.
Available data cleaning tools are:
- **dboost**
    - Recommended parameters for dboost are:
        - ["gaussian", "2"]
        - ["histogram", "0.8", "0.2"]
        - ["mixture", "2", "0.01"]
        - ["partitionedhistogram", "10", "0.8", "0.1"]    
- **nadeef**
    - The proper parameter for nadeef is the list of functional dependencies that hold in the dataset. For example,
    
    [["A", "B"], ["X, Y", "Z"]]
    
    means that the functional dependencies A -> B and X, Y -> Z exist among the attributes A, B, X, Y, and Z in the dataset.
- **openrefine**
    - The proper parameter for openrefine is the list of expected patterns for the attributes of the dataset. For example,
    
    [["A", "`^[\d]{3}$`"], ["B", "`^[A-Za-z]+$`"]]
    
    means that the attribute A must have only 3-digit values and the attribute B must have only alphabetical values.

In [14]:
for table in input_tables:
    dataset_path = os.path.join(input_folder, table)
    table_data = cleaning_api.read_csv_dataset(dataset_path)
    print "The dataset's atributes are:"
    print table_data[0]
    # df = pandas.DataFrame(data=table_data[1:], columns=table_data[0])
    # print df.head()[["price", "^[\d]+$"], ["brand_name", "^[\w]+$"]]
    tool_name = raw_input("Please enter the name of data cleaning tool: ")
    exec("tool_parameters = " + raw_input("Please enter the proper parameter list: "))
    run_input = {
        "dataset": {
            "type": "csv",
            "param": [dataset_path]
        },
        "tool": {
            "name": tool_name,
            "param": tool_parameters
            }
    }
    results_list = cleaning_api.run_data_cleaning_job(run_input)
    result_path = os.path.join(output_folder, table)
    cleaning_api.write_csv_dataset(result_path, [["row", "column", "value"]] + results_list)
    print "The results have been written into the {}\n--------------------------------------".format(result_path)

The dataset's atributes are:
[u'title', u'url', u'model', u'mileage', u'price', u'contact_number', u'vtype', u'brand_name']
Please enter the name of data cleaning tool: nadeef
Please enter the proper parameter list: [["title", "brand_name"]]
The results have been written into the outputs/sample.csv
--------------------------------------
