In [None]:
import os
import pandas as pd
import joblib as jb
from tqdm.auto import tqdm, trange
import gc

# Dataset

In [None]:
def get_N(folder="../input/us-weather-daily-summaries-1991-2016/data", filename="weather", ext="csv"):
    """
    Count the batch files available in ``folder``.

    Batch files are expected to be named ``<filename>_<i>.<ext>`` with ``i``
    starting at 0. Counting stops at the first index whose file is missing,
    so files located after a gap in the numbering are not counted.
    """
    batch_count = 0
    # Probe index after index until the next expected file does not exist.
    while os.path.isfile("{}/{}_{}.{}".format(folder, filename, batch_count, ext)):
        batch_count += 1
    return batch_count


def read_df(i_min, i_max, folder="../input/us-weather-daily-summaries-1991-2016/data", filename="weather", ext="csv", n_jobs=-1, progress=True):
    """
    Read a range of batch files in parallel and stack them into one dataframe.

    Batches ``i_min`` (inclusive) to ``i_max`` (exclusive) are read with
    ``pd.read_csv`` through joblib using ``n_jobs`` workers, then concatenated
    row-wise with a fresh integer index. ``progress=False`` disables the
    progress bar.
    """
    batch_ids = tqdm(range(i_min, i_max), desc="Loading dataset", disable=not progress)
    load_batch = jb.delayed(pd.read_csv)
    # Lazy task stream: the progress bar advances as joblib pulls tasks from it.
    tasks = (
        load_batch("{}/{}_{}.{}".format(folder, filename, batch_id, ext))
        for batch_id in batch_ids
    )
    frames = jb.Parallel(n_jobs=n_jobs)(tasks)
    return pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# Count the consecutively numbered batch files found in the default data folder.
N = get_N()
batch_count_message = "Number of batches files: {}".format(N)
print(batch_count_message)

Avoid reading the entire dataset at once, because the notebook would try to allocate more memory than is available. Only the first 50 batch files are loaded here.

In [None]:
# Load only the first 50 batch files so the dataframe fits in memory.
df = read_df(0,50)
display(df.head())
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in `display` would render an extra "None".
df.info()

# Preprocessing

In [None]:
def get_null_matrix(path):
    """
    Report, for one batch file, which columns have missing values per station.

    Parameters
    ----------
    path : str
        Path of a CSV batch file that contains a ``STATION`` column.

    Returns
    -------
    pandas.DataFrame
        Boolean frame indexed by ``STATION`` with one column per column of the
        file. A cell is True when that station has at least one missing value
        in that column within this batch.

    Raises
    ------
    KeyError
        If the file has no ``STATION`` column.
    """
    df = pd.read_csv(path)
    # Vectorised form of "for each station, does any row have a null in this
    # column?". Grouping the boolean mask by the STATION values avoids calling
    # a Python function once per group and keeps the STATION column in the
    # result regardless of how `groupby.apply` treats grouping columns.
    null_matrix = df.isnull().groupby(df["STATION"]).any()
    # Drop the raw batch right away; only the small boolean summary is returned.
    del df
    gc.collect()
    return null_matrix


def get_batches_null_matrix(folder="../input/us-weather-daily-summaries-1991-2016/data", filename="weather", ext="csv", n_jobs=-1, progress=True):
    """
    Compute the per-station null matrix of every batch file and stack them.

    The number of batches is discovered with ``get_N``; each file is processed
    by ``get_null_matrix`` through joblib using ``n_jobs`` workers. The
    per-batch results are concatenated row-wise without merging, so a station
    present in several batches contributes one row per batch.
    ``progress=False`` disables the progress bar.
    """
    n_batches = get_N(folder=folder, filename=filename, ext=ext)

    batch_ids = trange(n_batches, desc="Null matrix", disable=not progress)
    summarize = jb.delayed(get_null_matrix)
    # Lazy task stream: the progress bar advances as joblib pulls tasks from it.
    tasks = (
        summarize("{}/{}_{}.{}".format(folder, filename, batch_id, ext))
        for batch_id in batch_ids
    )
    per_batch = jb.Parallel(n_jobs=n_jobs)(tasks)
    stacked = pd.concat(per_batch, axis=0)

    gc.collect()
    return stacked


# Per-station, per-batch flags telling which columns contain missing values.
null_matrix = get_batches_null_matrix()
display(null_matrix.head())
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in `display` would render an extra "None".
null_matrix.info()

In [None]:
# Each row of `null_matrix` says whether a station group has at least one
# missing value per column, so a column is selected here when that holds for
# every row. The selected columns have gaps in every station, which is weaker
# than being entirely empty, and the printed label states that.
# NOTE(review): if the goal is to drop columns that are entirely empty, the
# per-station reduction that builds `null_matrix` must use `.all()` instead of
# `.any()` - confirm the intent.
has_gap_in_every_row = null_matrix.all()
drop_cols = has_gap_in_every_row[has_gap_in_every_row].index.tolist()
n_cols = null_matrix.shape[1]
print("Columns with missing values in every station: {}/{} ({:.0%})".format(len(drop_cols), n_cols, len(drop_cols)/n_cols))
print(drop_cols)