## Running the program

In [1]:
!python weather.py


The program has ended correctly.



### Data processing logic

```python
if __name__ == '__main__':
    # Entry point of weather.py: runs the preprocessing steps below in order
    # (Step 0 .. Step 7) and then saves a report of the run.
    # `sys` and `we` are names defined outside this block.

    # Extracting and saving settings from a report file
    # python weather.py report/report_[...].json
    # NOTE(review): presumably does nothing when no report path is given on
    # the command line -- confirm in save_old_settings.
    save_old_settings(sys.argv, save_to='settings_OLD.json')

    # Settings for the current run are read from 'settings.json'.
    settings = get_settings('settings.json')

    # Updating we.COLUMN_DATE / we.COLUMN_CITY
    set_column(settings, we)

    # Report skeleton: the 'preprocessing' section starts out empty.
    # NOTE(review): nothing in this block writes to it directly; presumably
    # the step functions below record their results there -- confirm.
    report = {'preprocessing': {}}
       
    # === Working with a dataset ===
    # Steps 0-2 are driven by the whole `settings` object.
    data = load_data(settings)  # Step 0
    data = convert_dt(settings, data) # Step 1
    data = to_dropna_rows(settings, data)  # Step 2
    # Optional snapshot after Step 2 (currently disabled):
    # data.to_pickle("data/data-2.pkl")
   
    # === Working with features ===
    # Steps 3-7 are driven by `features`, which is derived from `settings`.
    features = get_features(settings)

    data = to_drop(features, data)  # Step 3
    data = to_dropna_cols(features, data)  # Step 4
    data = to_fillna(features, data)  # Step 5
    # Optional snapshot after Step 5 (currently disabled):
    # data.to_pickle("data/data-5.pkl")
    data = to_upd_outlier(features, data)  # Step 6
    # Snapshot of the data after Step 6.
    data.to_pickle("data/data-6.pkl")
    data = to_encode(features, data)  # Step 7
    # Snapshot of the data after Step 7 (the last step in this block).
    data.to_pickle("data/data-7.pkl")

    # Store the settings used for this run in the report, then save the
    # report with dir_name='report'.
    report['settings'] = settings
    save_report(report, dir_name='report')

    print("\nThe program has ended correctly.\n")
```

### Information about the program

In [2]:
import weather

In [3]:
help(weather)

Help on module weather:

NAME
    weather

DESCRIPTION
    Predict next-day rain (on the target variable RainTomorrow).
    The dataset contains about 10 years of daily weather
    observations from many locations across Australia.
    
    Link: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package
    
    RainTomorrow is the target variable to predict.
    It means - did it rain the next day, Yes or No? This column is Yes if
    the rain for that day was 1mm or more.

FUNCTIONS
    convert_dt(settings, data)
        \o/
    
    get_features(settings)
        \o/
    
    get_settings(file_name)
        \o/
    
    load_data(settings)
        \o/
    
    save_old_settings(arguments, save_to)
        \o/
    
    save_preprocessing(who_save, shape=None, info='')
        \o/
    
    save_report(report, dir_name=None)
        \o/
    
    set_column(settings, utils)
        \o/
    
    to_drop(features, data)
        \o/
    
    to_dropna_cols(features, data)
        \o/
   

### Information about the module

In [4]:
import utils_weather

In [5]:
help(utils_weather)

Help on module utils_weather:

NAME
    utils_weather - Custom utilities for working with weather data.

FUNCTIONS
    isna_stats(df, columns=None, n=None, more_than=None)
        The percentage of missing values in the dataset.
        There is a choice of columns, filter by top values / more than.
    
    notna_column(df, column, period='month', total=False)
        Table of values by columns (grouped by location and period).
        Period - year or month or month-year. And the total amount.
    
    outlier_stats(df)
        The percentage of outlier values in the dataset.
        There is a choice of columns, filter by top values.
    
    test_compare_values(df, df_test, column_name, info=False, diff=False)
        Get old data (column) and new from the dataframes.
        Additional information / difference between columns.
    
    test_fillna_column(df, column_name)
        Trying all the methods for filling NaN values in a column.
    
    test_get_group_indx(df, location, x

### Information about settings

In [7]:
import json

In [25]:
# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# parsing does not depend on the platform's default locale encoding.
with open("settings.json", "r", encoding="utf-8") as read_file:
    settings = json.load(read_file)

In [26]:
print(json.dumps(settings, indent=4))

{
    "path_to_csv": "input/weatherAUS.csv",
    "targets": {
        "1": {
            "column": "RainTomorrow",
            "models": []
        }
    },
    "dropna": {
        "status": true,
        "thresh": 5
    },
    "columns": {
        "date": "Date",
        "city": "Location"
    },
    "features": {
        "1": {
            "name": "MinTemp",
            "drop": false,
            "dropna": false,
            "fillna": "num-smart",
            "outlier": {
                "update": true,
                "method": "border"
            }
        },
        "2": {
            "name": "MaxTemp",
            "drop": false,
            "dropna": false,
            "fillna": "num-smart",
            "outlier": {
                "update": true,
                "method": "border"
            }
        },
        "3": {
            "name": "Rainfall",
            "drop": false,
            "dropna": false,
            "fillna": "num-smart",
            "outlier": {
            

### Information about reports

In [18]:
# Read one saved run report. The encoding is passed explicitly so that
# parsing does not depend on the platform's default locale encoding.
with open("report/report_2021-06-22___01-01-35.json", "r", encoding="utf-8") as read_file:
    report = json.load(read_file)

In [27]:
print(json.dumps(report, indent=4))

{
    "preprocessing": {
        "load_data": {
            "time": "01:01:32 / 633486",
            "rows": 145460,
            "cols": 23,
            "info": "Size 3345580 / Missing values 343248"
        },
        "convert_dt": {
            "time": "01:01:32 / 692351",
            "rows": 145460,
            "cols": 23,
            "info": "Dtype datetime64[ns]"
        },
        "to_dropna_rows": {
            "time": "01:01:32 / 865391",
            "rows": 145167,
            "cols": 23,
            "info": false
        },
        "to drop": {
            "time": "01:01:32 / 913926",
            "rows": 145167,
            "cols": 19,
            "info": "Drop: Evaporation, Sunshine, Cloud9am, Cloud3pm"
        },
        "to_dropna_cols": {
            "time": "01:01:32 / 913988",
            "rows": 145167,
            "cols": 19,
            "info": ""
        },
        "to_fillna": {
            "time": "01:01:33 / 221002",
            "rows": 145167,
            "cols"