In [1]:
from basics import missing_rate, zero_rate, dtypes_class
from basics import feature_variance, freq_items_df

import pandas as pd

d = {'A': [1, 0, None, 3],
     'B': [1, 0, 0, 0],
     'C': ['a', None, 'c', 'd']}

# create DataFrame
df = pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,1,a
1,0.0,0,
2,,0,c
3,3.0,0,d


In [2]:
pd.__version__

'0.24.1'

In [3]:
df.B.mode(2)

0    0
dtype: int64

In [4]:
print(missing_rate(df))

  feature  missing_rate
0       A          0.25
1       B          0.00
2       C          0.25


In [5]:
print(zero_rate(df))

  feature  zero_rate
0       A   0.333333
1       B   0.750000
2       C   0.000000


In [6]:
(num_fields, cat_fields, bool_fields, data_types, data_class) = dtypes_class(df)

print(num_fields)
print(cat_fields)
print(bool_fields)
print(data_types)
print(data_class)

['A', 'B']
['C']
[]
  feature   dtypes
0       A  float64
1       B    int64
2       C   object
  feature   dtypes     class
0       A  float64   numeric
1       B    int64   numeric
2       C   object  category


In [7]:
data_class.groupby('class').count().reset_index()

Unnamed: 0,class,feature,dtypes
0,category,1,1
1,numeric,2,2


In [8]:
feature_variance(df)

Unnamed: 0,feature,feature_variance
0,A,1.0
1,B,0.5
2,C,1.0


In [9]:
d ={
    'num': list('1223334444'),
    'cat': list('wxxyyyzzzz')
}
df = pd.DataFrame(d)
df = df.astype({"num": int, "cat": object})

In [10]:
df

Unnamed: 0,num,cat
0,1,w
1,2,x
2,2,x
3,3,y
4,3,y
5,3,y
6,4,z
7,4,z
8,4,z
9,4,z


In [11]:
df.dtypes

num     int64
cat    object
dtype: object

In [12]:
df = pd.read_csv('Heart.csv')
df.head(5)

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [13]:
test = freq_items_df(df, top_n=4)
test

Unnamed: 0,feature,top_items,top_freqs
0,Age,"[58, 57, 54, 59]","[19, 17, 16, 14]"
1,Sex,"[1, 0]","[206, 97]"
2,ChestPain,"[asymptomatic, nonanginal, nontypical, typical]","[144, 86, 50, 23]"
3,RestBP,"[120, 130, 140, 110]","[37, 36, 32, 19]"
4,Chol,"[197, 234, 204, 212]","[6, 6, 6, 5]"
5,Fbs,"[0, 1]","[258, 45]"
6,RestECG,"[0, 2, 1]","[151, 148, 4]"
7,MaxHR,"[162, 160, 163, 152]","[11, 9, 9, 8]"
8,ExAng,"[0, 1]","[204, 99]"
9,Oldpeak,"[0.0, 1.2, 1.0, 0.6]","[99, 17, 14, 14]"


In [14]:
#test.to_csv('test.csv')

In [15]:
def feature_len(df_in):
    def fea_len(f):
        temp = f.map(lambda x: len(str(x)))
        return temp.min(),temp.max()
    
    temp = df_in.apply(fea_len)
    d = {'feature':  df_in.columns,
         'min_length': [i[0] for i in temp],
         'max_length': [i[1] for i in temp]}
    return pd.DataFrame(d)

In [76]:
d = {'A': [1, 0, None, 3],
     'B': [1, 0, 0, 0],
     'C': ['a', None, 'c', 'd']}

# create DataFrame
df = pd.DataFrame(d)

In [86]:
def pos_rate(f):
    return ((f > 0).sum(axis=0)) / f.notnull().sum()

In [87]:
df = df[['A','B']]
df

Unnamed: 0,A,B
0,1.0,1
1,0.0,0
2,,0
3,3.0,0


In [88]:
df.apply(pos_rate)

A    0.666667
B    0.250000
dtype: float64

In [77]:
feature_len(df)

Unnamed: 0,feature,min_length,max_length
0,A,3,3
1,B,1,1
2,C,1,4


In [68]:
df

Unnamed: 0,A,B,C
0,1.0,1,a
1,0.0,0,
2,,0,c
3,3.0,0,d


In [69]:
df.A.map(lambda x: len(str(x)))

0    3
1    3
2    3
3    3
Name: A, dtype: int64

In [70]:
import numpy as np

def percentiles_df(df_in, deciles = False):

    if deciles:
        percentiles = np.array(range(0, 110, 10))
    else:
        percentiles = [25, 50, 75]

    def deciles(f):
        return  np.percentile(f[f.notnull()], percentiles)

    temp = df_in.apply(deciles)
    d = {'feature':  df_in.columns,
         'percentiles': [i[0] for i in temp]}
    return pd.DataFrame(d)
    

In [71]:
df = df[['A','B']]
df

Unnamed: 0,A,B
0,1.0,1
1,0.0,0
2,,0
3,3.0,0


In [140]:
def numeric_summary(df_in, deciles=False):
    """
    generate statistical summary for numerical DateFrame

    :param df_in: input pandas DataFrame
    :param deciles: flag for percentiles style
    :return: statistical summary for numerical data

    >>> d = {'A': [1, 0, None, 3],
    >>>      'B': [1, 0, 0, 0],
    >>>      'C': ['a', None, 'c', 'd']}
    >>> # create DataFrame
    >>> df = pd.DataFrame(d)
    >>> (num_fields, cat_fields, bool_fields, data_types, type_class) = dtypes_class(df)
    >>> print(numeric_summary(df[num_fields]))
      feature  min       percentiles  max  ...       std  lower_95_ci  upper_95_ci  sum
    0       A  0.0   [0.5, 1.0, 2.0]  3.0  ...  1.527525    -0.395224     3.061891  4.0
    1       B  0.0  [0.0, 0.0, 0.25]  1.0  ...  0.500000    -0.240000     0.740000  1.0
    """
    
    (num_fields, cat_fields, bool_fields, data_types, data_class) = dtypes_class(df_in)
    df_in = df_in[num_fields]
    
    if deciles:
        var_name = 'deciles'
        percentiles = np.array(range(0, 110, 10))
    else:
        var_name = 'percentiles'
        percentiles = [25, 50, 75]

    def col_wise(f):
        fea_len = f.map(lambda x: len(str(x)))
        fea_mean = f.mean()
        fea_std = f.std()
        fea_count = np.sqrt(f.count())
        fea_notnull = f.notnull().sum()
        return fea_len.min(),\
               fea_len.max(),\
               f.shape[0],\
               f.count(),\
               len(f.unique()),\
               f.min(), \
               np.percentile(f[f.notnull()], percentiles), \
               f.max(), \
               fea_mean, \
               f.std(), \
               fea_mean - 1.96 * fea_std  / fea_count , \
               fea_mean + 1.96 * fea_std  / fea_count,  \
               f.sum(),\
               f.isnull().sum() /f.shape[0],\
               ((f == 0).sum(axis=0)) / fea_notnull,\
               ((f > 0).sum(axis=0)) / fea_notnull,\
               ((f < 0).sum(axis=0)) / fea_notnull

    temp = np.transpose(df_in.apply(col_wise))
    
    col_names = ['feature','data_type','min_digits','max_digits','row_count',
                 'notnull_count','distinct_count', 'min',var_name,'max','mean',
                 'std','lower_95_ci','upper_95_ci','sum','missing_rate','zero_rate',
                 'pos_rate','neg_rate']
    col_value = [df_in.columns] + [df_in.dtypes] \
              + [[col[i] for col in temp] for i in range(len(col_names)-2)]
    
    d = {key: value for key, value in zip(col_names, col_value)}
    return pd.DataFrame(d)

In [141]:
numeric_summary(df,deciles=False)

Unnamed: 0,feature,data_type,min_digits,max_digits,row_count,notnull_count,distinct_count,min,percentiles,max,mean,std,lower_95_ci,upper_95_ci,sum,missing_rate,zero_rate,pos_rate,neg_rate
Age,Age,int64,4,4,303,303,41,29.0,"[48.0, 56.0, 61.0]",77.0,54.438944,9.038662,53.421199,55.456689,16495.0,0.0,0.0,1.0,0.0
Sex,Sex,int64,3,3,303,303,2,0.0,"[0.0, 1.0, 1.0]",1.0,0.679868,0.467299,0.627251,0.732485,206.0,0.0,0.320132,0.679868,0.0
RestBP,RestBP,int64,4,5,303,303,50,94.0,"[120.0, 130.0, 140.0]",200.0,131.689769,17.599748,129.708054,133.671484,39902.0,0.0,0.0,1.0,0.0
Chol,Chol,int64,5,5,303,303,152,126.0,"[211.0, 241.0, 275.0]",564.0,246.693069,51.776918,240.863037,252.523101,74748.0,0.0,0.0,1.0,0.0
Fbs,Fbs,int64,3,3,303,303,2,0.0,"[0.0, 0.0, 0.0]",1.0,0.148515,0.356198,0.108407,0.188622,45.0,0.0,0.851485,0.148515,0.0
RestECG,RestECG,int64,3,3,303,303,3,0.0,"[0.0, 1.0, 2.0]",2.0,0.990099,0.994971,0.878066,1.102132,300.0,0.0,0.49835,0.50165,0.0
MaxHR,MaxHR,int64,4,5,303,303,91,71.0,"[133.5, 153.0, 166.0]",202.0,149.607261,22.875003,147.031557,152.182965,45331.0,0.0,0.0,1.0,0.0
ExAng,ExAng,int64,3,3,303,303,2,0.0,"[0.0, 0.0, 1.0]",1.0,0.326733,0.469794,0.273834,0.379631,99.0,0.0,0.673267,0.326733,0.0
Oldpeak,Oldpeak,float64,3,3,303,303,40,0.0,"[0.0, 0.8, 1.6]",6.2,1.039604,1.161075,0.908868,1.17034,315.0,0.0,0.326733,0.673267,0.0
Slope,Slope,int64,3,3,303,303,3,1.0,"[1.0, 2.0, 2.0]",3.0,1.60066,0.616226,1.531274,1.670047,485.0,0.0,0.0,1.0,0.0


In [93]:
df = pd.read_csv('Heart.csv')
df.head(5)

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [94]:
df.dtypes

Age            int64
Sex            int64
ChestPain     object
RestBP         int64
Chol           int64
Fbs            int64
RestECG        int64
MaxHR          int64
ExAng          int64
Oldpeak      float64
Slope          int64
Ca           float64
Thal          object
AHD           object
dtype: object

In [95]:
(num_fields, cat_fields, bool_fields, data_types, type_class) = dtypes_class(df)
numeric_summary(df[num_fields],deciles=False)

Unnamed: 0,feature,data_type,min_digits,max_digits,row_count,notnull_count,num_distinct,min,percentiles,max,mean,std,lower_95_ci,upper_95_ci,sum,missing_rate,zero_rate,pos_rate,neg_rate
Age,Age,int64,4,4,303,303,41,29.0,"[48.0, 56.0, 61.0]",77.0,54.438944,9.038662,53.421199,55.456689,16495.0,0.0,0.0,1.0,0.0
Sex,Sex,int64,3,3,303,303,2,0.0,"[0.0, 1.0, 1.0]",1.0,0.679868,0.467299,0.627251,0.732485,206.0,0.0,0.320132,0.679868,0.0
RestBP,RestBP,int64,4,5,303,303,50,94.0,"[120.0, 130.0, 140.0]",200.0,131.689769,17.599748,129.708054,133.671484,39902.0,0.0,0.0,1.0,0.0
Chol,Chol,int64,5,5,303,303,152,126.0,"[211.0, 241.0, 275.0]",564.0,246.693069,51.776918,240.863037,252.523101,74748.0,0.0,0.0,1.0,0.0
Fbs,Fbs,int64,3,3,303,303,2,0.0,"[0.0, 0.0, 0.0]",1.0,0.148515,0.356198,0.108407,0.188622,45.0,0.0,0.851485,0.148515,0.0
RestECG,RestECG,int64,3,3,303,303,3,0.0,"[0.0, 1.0, 2.0]",2.0,0.990099,0.994971,0.878066,1.102132,300.0,0.0,0.49835,0.50165,0.0
MaxHR,MaxHR,int64,4,5,303,303,91,71.0,"[133.5, 153.0, 166.0]",202.0,149.607261,22.875003,147.031557,152.182965,45331.0,0.0,0.0,1.0,0.0
ExAng,ExAng,int64,3,3,303,303,2,0.0,"[0.0, 0.0, 1.0]",1.0,0.326733,0.469794,0.273834,0.379631,99.0,0.0,0.673267,0.326733,0.0
Oldpeak,Oldpeak,float64,3,3,303,303,40,0.0,"[0.0, 0.8, 1.6]",6.2,1.039604,1.161075,0.908868,1.17034,315.0,0.0,0.326733,0.673267,0.0
Slope,Slope,int64,3,3,303,303,3,1.0,"[1.0, 2.0, 2.0]",3.0,1.60066,0.616226,1.531274,1.670047,485.0,0.0,0.0,1.0,0.0
