### 6 Data quality dimensions are

* Completeness
* Accuracy
* Consistency
* Validity
* Uniqueness
* Integrity

In [351]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [352]:
df = pd.read_csv('./financials.csv')
df.head()

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Month NameZ,Year
0,Government,Canada,Carretera,,"$1,618.50",$3.00,$20.00,"$32,370.00",$-,"$32,370.00","$16,185.00","$16,185.00",01/01/2014,1,January,2014
1,Government,Germany,Carretera,,"$1,321.00",$3.00,$20.00,"$26,420.00",$-,"$26,420.00","$13,210.00","$13,210.00",01/01/2014,1,January,2014
2,Midmarket,France,Carretera,,"$2,178.00",$3.00,$15.00,"$32,670.00",$-,"$32,670.00","$21,780.00","$10,890.00",01/06/2014,6,June,2014
3,Midmarket,Germany,Carretera,,$888.00,$3.00,$15.00,"$13,320.00",$-,"$13,320.00","$8,880.00","$4,440.00",01/06/2014,6,June,2014
4,Midmarket,Mexico,Carretera,,"$2,470.00",$3.00,$15.00,"$37,050.00",$-,"$37,050.00","$24,700.00","$12,350.00",01/06/2014,6,June,2014


In [353]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Segment              700 non-null    object
 1   Country              700 non-null    object
 2   Product              700 non-null    object
 3   Discount Band        700 non-null    object
 4   Units Sold           700 non-null    object
 5   Manufacturing Price  700 non-null    object
 6   Sale Price           700 non-null    object
 7   Gross Sales          700 non-null    object
 8   Discounts            700 non-null    object
 9   Sales                700 non-null    object
 10  COGS                 700 non-null    object
 11  Profit               700 non-null    object
 12  Date                 700 non-null    object
 13  Month Number         700 non-null    int64 
 14  Month NameZ          700 non-null    object
 15  Year                 700 non-null    int64 
dtypes: int64

### Cleaning data

##### Removing $ and , from monetary numerical values and changing data type to Float

In [354]:
df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']] = df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']].map(lambda x: x.strip())
df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']] = df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']].map(lambda x: re.sub('[^0-9.]','',x), na_action='ignore')
df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']] = df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']].map(lambda x: x.strip())
df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']] = df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']].map(lambda x: re.sub('[\s+]',' ',x),na_action='ignore')
df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']] = df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']].replace('',0.0)
df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']] = df[['Units Sold','Manufacturing Price','Sale Price','Gross Sales','Sales','COGS','Profit','Discounts']].astype('float')
df.head()

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Month NameZ,Year
0,Government,Canada,Carretera,,1618.5,3.0,20.0,32370.0,0.0,32370.0,16185.0,16185.0,01/01/2014,1,January,2014
1,Government,Germany,Carretera,,1321.0,3.0,20.0,26420.0,0.0,26420.0,13210.0,13210.0,01/01/2014,1,January,2014
2,Midmarket,France,Carretera,,2178.0,3.0,15.0,32670.0,0.0,32670.0,21780.0,10890.0,01/06/2014,6,June,2014
3,Midmarket,Germany,Carretera,,888.0,3.0,15.0,13320.0,0.0,13320.0,8880.0,4440.0,01/06/2014,6,June,2014
4,Midmarket,Mexico,Carretera,,2470.0,3.0,15.0,37050.0,0.0,37050.0,24700.0,12350.0,01/06/2014,6,June,2014


##### Replacing discount band with appropriate ordinal values

In [355]:
print(df['Discount Band'].unique())
df['Discount Band'].replace({' None ':0,' Low ':1,' Medium ':2,' High ':3},inplace=True)
df.head()

[' None ' ' Low ' ' Medium ' ' High ']


Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Month NameZ,Year
0,Government,Canada,Carretera,0,1618.5,3.0,20.0,32370.0,0.0,32370.0,16185.0,16185.0,01/01/2014,1,January,2014
1,Government,Germany,Carretera,0,1321.0,3.0,20.0,26420.0,0.0,26420.0,13210.0,13210.0,01/01/2014,1,January,2014
2,Midmarket,France,Carretera,0,2178.0,3.0,15.0,32670.0,0.0,32670.0,21780.0,10890.0,01/06/2014,6,June,2014
3,Midmarket,Germany,Carretera,0,888.0,3.0,15.0,13320.0,0.0,13320.0,8880.0,4440.0,01/06/2014,6,June,2014
4,Midmarket,Mexico,Carretera,0,2470.0,3.0,15.0,37050.0,0.0,37050.0,24700.0,12350.0,01/06/2014,6,June,2014


##### Dropping month name as month number already given

In [356]:
df.drop(columns=['Month NameZ'],inplace=True)
df.head()

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Year
0,Government,Canada,Carretera,0,1618.5,3.0,20.0,32370.0,0.0,32370.0,16185.0,16185.0,01/01/2014,1,2014
1,Government,Germany,Carretera,0,1321.0,3.0,20.0,26420.0,0.0,26420.0,13210.0,13210.0,01/01/2014,1,2014
2,Midmarket,France,Carretera,0,2178.0,3.0,15.0,32670.0,0.0,32670.0,21780.0,10890.0,01/06/2014,6,2014
3,Midmarket,Germany,Carretera,0,888.0,3.0,15.0,13320.0,0.0,13320.0,8880.0,4440.0,01/06/2014,6,2014
4,Midmarket,Mexico,Carretera,0,2470.0,3.0,15.0,37050.0,0.0,37050.0,24700.0,12350.0,01/06/2014,6,2014


##### Changing date column data type

In [357]:
df['Date'] = pd.to_datetime(df['Date'])
df.head()

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Year
0,Government,Canada,Carretera,0,1618.5,3.0,20.0,32370.0,0.0,32370.0,16185.0,16185.0,2014-01-01,1,2014
1,Government,Germany,Carretera,0,1321.0,3.0,20.0,26420.0,0.0,26420.0,13210.0,13210.0,2014-01-01,1,2014
2,Midmarket,France,Carretera,0,2178.0,3.0,15.0,32670.0,0.0,32670.0,21780.0,10890.0,2014-01-06,6,2014
3,Midmarket,Germany,Carretera,0,888.0,3.0,15.0,13320.0,0.0,13320.0,8880.0,4440.0,2014-01-06,6,2014
4,Midmarket,Mexico,Carretera,0,2470.0,3.0,15.0,37050.0,0.0,37050.0,24700.0,12350.0,2014-01-06,6,2014


## Data quality assurance

### 1. Completeness

#### It can be said that the data is complete as all needed parameters for product sales such as 'Discount Band', 'Units Sold','Manufacturing Price', 'Sale Price', 'Gross Sales', 'Discounts','Sales', 'COGS', 'Profit' and 'Date' are given

### 2. Accuracy

#### There is currently no way for verifying accuracy of this dataset

### 3. Consistency

#### There is currently no way for verifying consistency of this dataset

### 4. Validity

#### Considering profit as a validity parameter

In [358]:
inconsistent = df[round(df['Profit'],2) != abs(round(df['Sales'] - df['COGS'], 2))]
print('{} entries are inconsistent according to profit-sales tally'.format(len(inconsistent)))
inconsistent

0 entries are inconsistent according to profit-sales tally


Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Year


### 5. Uniqueness

In [359]:
df[df.duplicated()]

Unnamed: 0,Segment,Country,Product,Discount Band,Units Sold,Manufacturing Price,Sale Price,Gross Sales,Discounts,Sales,COGS,Profit,Date,Month Number,Year


#### Dataset only contains unique datapoints

### 6. Integrity

#### There is currently no way for verifying integrity of this dataset