In [1]:
from basics import missing_rate, zero_rate, dtypes_class
from basics import feature_variance, freq_items_df

import pandas as pd

# Small toy frame used to exercise the helpers: a float column with one
# missing value (A), a mostly-zero integer column (B) and a string column
# with one missing value (C).
d = {
    'A': [1, 0, None, 3],
    'B': [1, 0, 0, 0],
    'C': ['a', None, 'c', 'd'],
}

df = pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,1,a
1,0.0,0,
2,,0,c
3,3.0,0,d


In [2]:
# Record the pandas version this notebook was executed with.
pd.__version__

'0.24.1'

In [3]:
# Most frequent value(s) of column B. Series.mode has no "top n" argument
# (its only parameter is the boolean ``dropna``), so it is called without one.
df.B.mode()

0    0
dtype: int64

In [4]:
# Print the per-feature table returned by missing_rate (imported from basics).
print(missing_rate(df))

  feature  missing_rate
0       A          0.25
1       B          0.00
2       C          0.25


In [5]:
# Print the per-feature table returned by zero_rate (imported from basics).
# NOTE(review): A reports 0.333 with one zero among four rows (one of them
# missing), so the helper appears to divide by the non-null count - confirm
# against the implementation in basics.
print(zero_rate(df))

  feature  zero_rate
0       A   0.333333
1       B   0.750000
2       C   0.000000


In [6]:
# dtypes_class returns five objects: three lists of column names (numeric,
# categorical, boolean) followed by two lookup frames (dtype per feature, and
# dtype plus class per feature).
num_fields, cat_fields, bool_fields, data_types, data_class = dtypes_class(df)

# A single print call; sep='\n' places every object on its own line.
print(num_fields, cat_fields, bool_fields, data_types, data_class, sep='\n')

['A', 'B']
['C']
[]
  feature   dtypes
0       A  float64
1       B    int64
2       C   object
  feature   dtypes     class
0       A  float64   numeric
1       B    int64   numeric
2       C   object  category


In [7]:
# Count the rows of data_class per 'class' label; reset_index turns the
# group label back into an ordinary column.
data_class.groupby('class').count().reset_index()

Unnamed: 0,class,feature,dtypes
0,category,1,1
1,numeric,2,2


In [8]:
# Per-feature table returned by feature_variance (imported from basics).
# NOTE(review): the values shown (A=1.0, B=0.5, C=1.0) look like a
# distinct-value ratio rather than a statistical variance - confirm the
# definition in basics.
feature_variance(df)

Unnamed: 0,feature,feature_variance
0,A,1.0
1,B,0.5
2,C,1.0


In [9]:
# Ten-row frame whose values repeat 1, 2, 3 and 4 times respectively.
# 'num' is built from digit characters and cast to int; 'cat' stays object.
d = {'num': list('1223334444'), 'cat': list('wxxyyyzzzz')}
df = pd.DataFrame(d).astype({"num": int, "cat": object})

In [10]:
# Display the ten-row frame built in the previous cell.
df

Unnamed: 0,num,cat
0,1,w
1,2,x
2,2,x
3,3,y
4,3,y
5,3,y
6,4,z
7,4,z
8,4,z
9,4,z


In [11]:
# Check the column dtypes that resulted from the astype call.
df.dtypes

num     int64
cat    object
dtype: object

In [12]:
# Load Heart.csv from the working directory and preview the first rows
# (head() shows five rows by default).
df = pd.read_csv('Heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [13]:
# Call freq_items_df (imported from basics) with top_n=4; the result lists,
# per feature, its most frequent items and their counts in the top_items and
# top_freqs columns.
test = freq_items_df(df, top_n=4)
test

Unnamed: 0,feature,top_items,top_freqs
0,Age,"[58, 57, 54, 59]","[19, 17, 16, 14]"
1,Sex,"[1, 0]","[206, 97]"
2,ChestPain,"[asymptomatic, nonanginal, nontypical, typical]","[144, 86, 50, 23]"
3,RestBP,"[120, 130, 140, 110]","[37, 36, 32, 19]"
4,Chol,"[197, 234, 204, 212]","[6, 6, 6, 5]"
5,Fbs,"[0, 1]","[258, 45]"
6,RestECG,"[0, 2, 1]","[151, 148, 4]"
7,MaxHR,"[162, 160, 163, 152]","[11, 9, 9, 8]"
8,ExAng,"[0, 1]","[204, 99]"
9,Oldpeak,"[0.0, 1.2, 1.0, 0.6]","[99, 17, 14, 14]"


In [14]:
#test.to_csv('test.csv')

In [15]:
def feature_len(df_in, dropna=False):
    """Report the shortest and longest printed length of each column's values.

    Every value is converted with ``str`` and measured with ``len``, so
    numbers are measured by their printed form (``1.0`` has length 3).

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame whose columns are measured.
    dropna : bool, default False
        When False, missing values are measured like any other value, i.e.
        as the text ``'nan'`` or ``'None'``. When True, missing values are
        removed from each column before measuring.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df_in`` with the columns ``feature``,
        ``min_length`` and ``max_length``.
    """
    features, min_lengths, max_lengths = [], [], []

    # Walk the columns explicitly instead of iterating the result of
    # DataFrame.apply: for a tuple-returning function apply may hand back
    # either a Series of tuples or a DataFrame, and iterating a DataFrame
    # yields column labels rather than the (min, max) pairs.
    for name, column in df_in.items():
        if dropna:
            column = column.dropna()
        lengths = column.map(lambda x: len(str(x)))
        features.append(name)
        min_lengths.append(lengths.min())
        max_lengths.append(lengths.max())

    return pd.DataFrame({'feature': features,
                         'min_length': min_lengths,
                         'max_length': max_lengths})

In [16]:
# Rebuild the small toy frame (float column with a gap, integer column,
# string column with a gap) so feature_len can be tried on it.
d = dict(
    A=[1, 0, None, 3],
    B=[1, 0, 0, 0],
    C=['a', None, 'c', 'd'],
)

df = pd.DataFrame(d)

In [17]:
# Try feature_len on the toy frame. Values are measured via len(str(x)), so
# C's max_length of 4 comes from its missing entry being measured as the
# text 'None'.
feature_len(df)

Unnamed: 0,feature,min_length,max_length
0,A,3,3
1,B,1,1
2,C,1,4


In [18]:
# Display the toy frame to compare against the lengths reported above.
df

Unnamed: 0,A,B,C
0,1.0,1,a
1,0.0,0,
2,,0,c
3,3.0,0,d


In [19]:
# Per-row printed length of column A. Every row is 3: the floats print as
# '1.0', '0.0', '3.0' and the missing value prints as 'nan'.
df.A.map(lambda x: len(str(x)))

0    3
1    3
2    3
3    3
Name: A, dtype: int64

In [40]:
import numpy as np

def percentiles_df(df_in, deciles=False):
    """Compute percentiles of the non-missing values of every column.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame whose columns are summarised. The columns must hold values
        that ``numpy.percentile`` can process.
    deciles : bool, default False
        When False the 25th, 50th and 75th percentiles are computed.
        When True the percentiles 0, 10, ..., 100 are computed.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df_in`` with the columns ``feature`` and
        ``percentiles``; each ``percentiles`` cell holds the full array of
        requested percentiles for that feature.
    """
    if deciles:
        percentiles = np.arange(0, 110, 10)
    else:
        percentiles = [25, 50, 75]

    def _column_percentiles(column):
        # Percentiles are taken over the non-missing values only.
        values = column.dropna().values
        if len(values) == 0:
            # Nothing to rank: report NaN for every requested percentile.
            return np.full(len(percentiles), np.nan)
        return np.percentile(values, percentiles)

    features, results = [], []
    # Iterate the columns directly so every feature keeps its complete
    # percentile array, independent of how DataFrame.apply would shape an
    # array-valued result.
    for name, column in df_in.items():
        features.append(name)
        results.append(_column_percentiles(column))

    return pd.DataFrame({'feature': features,
                         'percentiles': results})
    

In [43]:
# Keep only columns A and B of the toy frame and show the result.
df = df.loc[:, ['A', 'B']]
df

Unnamed: 0,A,B
0,1.0,1
1,0.0,0
2,,0
3,3.0,0


In [86]:
def numeric_summary(df_in, deciles=False):
    """Build a one-row-per-column table of descriptive statistics.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame whose columns are summarised. The columns must be numeric.
    deciles : bool, default False
        When False the 25th, 50th and 75th percentiles are reported in a
        column named ``percentiles``. When True the percentiles
        0, 10, ..., 100 are reported in a column named ``deciles``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``feature``, ``min``, ``percentiles`` or
        ``deciles`` (an array per feature), ``max``, ``mean``, ``std``,
        ``lower_95_ci``, ``upper_95_ci`` and ``sum``. The interval bounds are
        ``mean -/+ 1.96 * std / sqrt(count)``, with ``count`` the number of
        non-missing values.
    """
    if deciles:
        var_name = 'deciles'
        percentiles = np.arange(0, 110, 10)
    else:
        var_name = 'percentiles'
        percentiles = [25, 50, 75]

    # Multiplier used for the 95% interval bounds.
    z_95 = 1.96

    def _summarize(column):
        # Percentiles are taken over the non-missing values only.
        values = column.dropna().values
        if len(values) == 0:
            # Nothing to rank: report NaN for every requested percentile.
            pct = np.full(len(percentiles), np.nan)
        else:
            pct = np.percentile(values, percentiles)

        mean = column.mean()
        std = column.std()
        half_width = z_95 * std / np.sqrt(column.count())
        return {'min': column.min(),
                var_name: pct,
                'max': column.max(),
                'mean': mean,
                'std': std,
                'lower_95_ci': mean - half_width,
                'upper_95_ci': mean + half_width,
                'sum': column.sum()}

    stat_names = ['min', var_name, 'max', 'mean', 'std',
                  'lower_95_ci', 'upper_95_ci', 'sum']
    d = {'feature': []}
    for stat in stat_names:
        d[stat] = []

    # Iterate the columns directly rather than the result of DataFrame.apply:
    # for a tuple-returning function apply may return a Series of tuples or
    # a DataFrame, and iterating a DataFrame yields labels, not the tuples.
    for name, column in df_in.items():
        stats = _summarize(column)
        d['feature'].append(name)
        for stat in stat_names:
            d[stat].append(stats[stat])

    return pd.DataFrame(d)

In [88]:
# Summarise columns A and B; deciles=True reports the 0, 10, ..., 100
# percentiles instead of the default 25/50/75.
numeric_summary(df,deciles=True)

Unnamed: 0,feature,min,deciles,max,mean,std,lower_95_ci,upper_95_ci,sum
0,A,0.0,"[0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.4, 1.79999999...",3.0,1.333333,1.527525,-0.395224,3.061891,4.0
1,B,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09999999...",1.0,0.25,0.5,-0.24,0.74,1.0
