# resumetable()
## 0.objective
データフレームの要約。
データフレームの各列(要素)について、以下がわかる  
データ形状(shape),列名,各列のdtype,各列の欠損数,各列の固有値の数,各列の先頭3行の値,各列のエントロピー
## 1.args  
  df : dataframe
## 2.return
  summary : 上記を出力したデータフレーム

In [16]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

def resumetable(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Args:
        df: DataFrame to summarize. Any index is accepted; rows are read by
            position, not by label.

    Returns:
        DataFrame with one row per column of ``df`` and these columns:
        ``Name``, ``dtypes``, ``Missing`` (null count), ``Uniques``
        (``nunique`` per column), ``First Value`` / ``Second Value`` /
        ``Third Value`` (values of the first three rows, NaN when ``df``
        has fewer rows) and ``Entropy`` (base-2 entropy of the normalized
        value counts, rounded to two decimals).
    """
    print(f"Dataset Shape: {df.shape}")

    # One row per column of df: (Name, dtypes).
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
    })
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Use positional access: label-based df.loc[0] fails (or picks the wrong
    # row) whenever the index is not the default 0..n-1 range.
    n_rows = len(df)
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for pos, label in enumerate(sample_labels):
        summary[label] = df.iloc[pos].values if pos < n_rows else np.nan

    # Entropy per column, addressed by position so that duplicate column
    # names cannot select more than one column at a time.
    summary['Entropy'] = [
        round(
            stats.entropy(df.iloc[:, i].value_counts(normalize=True), base=2),
            2,
        )
        for i in range(df.shape[1])
    ]

    return summary

In [17]:
# Demo: a 5x3 frame of distinct strings; column X holds single letters,
# Y the letter doubled and Z the letter tripled.
df = pd.DataFrame({
    col: [letter * reps for letter in 'ABCDE']
    for reps, col in enumerate('XYZ', start=1)
})
summary = resumetable(df)
summary

Dataset Shape: (5, 3)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,X,object,0,5,A,B,C,2.32
1,Y,object,0,5,AA,BB,CC,2.32
2,Z,object,0,5,AAA,BBB,CCC,2.32


# reduce_mem_usage()  
## 0.Objective  
データフレームの占有メモリ領域削減
## 1.args  
dataframe
## 2.return  
dataframe(引数の df 自体を書き換え、同じオブジェクトを返す)

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64
    are touched. Integer columns become the smallest of int8/int16/int32/int64
    whose range contains the column's min and max; float columns become
    float16 or float32 when their min and max fit, otherwise float64.

    NOTE: the float check looks at the value range only, so casting to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object, with the converted columns.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to a type's limit still
            # fits in that type.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns: comparisons
                # above are false, so fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df