In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.4f}'.format

# Load the Dataset

In [2]:
df = pd.read_csv('data/badDataSet.csv', sep='\t', header=None, na_values='?')

In [3]:
df.columns = [f'A{i+1}' for i in range(df.shape[1] - 1)] + ['T']

# Task 1
## Number of Instances

In [4]:
print(df.shape)

(581012, 63)


## Number of Descriptive Features

In [5]:
print(df.shape[1] - 1)

62


## Data Quality Reports for Descriptive Features

In [6]:
print(df.nunique())

A1       1978
A2        361
A3     576099
A4         67
A5        569
A6     581012
A7     577988
A8       5811
A9        207
A10       186
A11       255
A12      5826
A13         2
A14         2
A15         2
A16         1
A17         1
A18         3
A19         2
A20         2
A21         2
A22         2
A23         2
A24         2
A25         2
A26         2
A27         2
A28         2
A29         2
A30         2
A31         2
A32         2
A33         2
A34         2
A35         2
A36         2
A37         2
A38         2
A39         2
A40         2
A41         2
A42         2
A43         2
A44         2
A45         2
A46         2
A47         2
A48         2
A49         2
A50         2
A51         2
A52         2
A53         2
A54         2
A55         2
A56         2
A57         2
A58         2
A59         2
A60         2
A61    581012
A62         1
T           7
dtype: int64


In [None]:
df_des_feat = df[[f'A{i}' for i in range(1, 63)]]

report = df_des_feat.describe().T

report['% Miss.'] = (df_des_feat.isnull().sum() / len(df_des_feat)) * 100
report['Card.'] = df_des_feat.nunique()

# Rename cols
report = report.rename(columns={
    'count': 'Count',
    'mean': 'Mean',
    'std': 'Std. Dev.',
    'min': 'Min.',
    '25%': '1st Qrt.',
    '50%': 'Median',
    '75%': '3rd Qrt.',
    'max': 'Max.'
})

final_column_order = ['Count', '% Miss.', 'Card.', 'Min.', '1st Qrt.', 'Mean', 'Median', '3rd Qrt.', 'Max.', 'Std. Dev.']

report_des_feat = report[final_column_order].reset_index().rename(columns={'index': 'Feature'})

print(report_des_feat)

   Feature       Count  % Miss.   Card.         Min.     1st Qrt.  \
0       A1 581012.0000   0.0000    1978 2054845.6500 3104928.1500   
1       A2 578069.0000   0.5065     361       0.0000      58.0000   
2       A3 581012.0000   0.0000  576099       0.0000     145.4945   
3       A4 580708.0000   0.0523      67       0.0000       9.0000   
4       A5 578069.0000   0.5065     569    -691.0000     108.0000   
5       A6 581012.0000   0.0000  581012    -173.0651       6.9858   
6       A7 578069.0000   0.5065  577988      -1.0000      -0.4997   
7       A8 581012.0000   0.0000    5811       0.0000    1106.0000   
8       A9 578069.0000   0.5065     207       0.0000     198.0000   
9      A11 581012.0000   0.0000     255       0.0000     119.0000   
10     A12 578069.0000   0.5065    5826       0.0000    1024.0000   
11     A13 578069.0000   0.5065       2       0.0000       0.0000   
12     A14 581012.0000   0.0000       2       0.0000       0.0000   
13     A15 581012.0000   0.0000   

## Data Quality Report for Response Feature

In [14]:
report = df[['T']].describe().T

report['% Miss.'] = (df['T'].isnull().sum() / len(df)) * 100
report['Card.'] = df['T'].nunique()

# Rename cols
report = report.rename(columns={
    'count': 'Count',
    'mean': 'Mean',
    'std': 'Std. Dev.',
    'min': 'Min.',
    '25%': '1st Qrt.',
    '50%': 'Median',
    '75%': '3rd Qrt.',
    'max': 'Max.'
})

final_column_order = ['Count', '% Miss.', 'Card.', 'Min.', '1st Qrt.', 'Mean', 'Median', '3rd Qrt.', 'Max.', 'Std. Dev.']

report_res_feat = report[final_column_order].reset_index().rename(columns={'index': 'Feature'})

print(report_res_feat)

  Feature       Count  % Miss.  Card.   Min.  1st Qrt.   Mean  Median  \
0       T 580952.0000   0.0103      7 1.0000    1.0000 2.0515  2.0000   

   3rd Qrt.   Max.  Std. Dev.  
0    2.0000 7.0000     1.3965  
