# Ejercicio Data Quality - Perfilado
## Evaluar la calidad de datos de las ventas de productos

Se quiere hacer una evaluación de calidad de datos sobre las ventas (sales) y pagos (payments). Para ello se requiere hacer un análisis de los siguientes puntos:
- Calidad de los datos
- Selección de clave principal
- Identificación de cardinalidad
- Obtener media, varianza y desviacion Estandar, covarianza, correlacion
- Mejorar la calidad.

**Referencia**: “Estadística Descriptiva con Python y Pandas”: https://coderhook.github.io/Descriptive%20Statistics

- Columnas sales:, orderNumber, orderLineNumber, orderDate, shippedDate, requiredDate, customerNumber, employeeNumber, productCode, status, comments, quantityOrdered, priceEach, sales_amount, origin

- Columnas payments:, customerNumber, checkNumber, paymentDate, amount

## Carga

In [352]:
import pandas as pd
import numpy as np
from tabulate import tabulate

In [353]:
sales_df = pd.read_csv(
    'https://github.com/ricardoahumada/DataScienceBasics/raw/refs/heads/main/data/company_sales/sales.csv')

In [354]:
payments_df = pd.read_csv(
    'https://github.com/ricardoahumada/DataScienceBasics/raw/refs/heads/main/data/company_sales/payments.csv')

## Calidad

### Sales

In [None]:
# columnas
sales_df.columns = ['orderNumber', 'orderLineNumber', 'orderDate', 'shippedDate', 'requiredDate', 'customerNumber',
                    'employeeNumber', 'productCode', 'status', 'comments', 'quantityOrdered', 'priceEach', 'sales_amount', 'origin']
sales_df.info()

In [None]:
sales_df.head(5)

In [None]:
sales_df.tail(5)

In [None]:
sales_df.sample(20)

In [None]:
sales_df.shape

In [360]:
sales_df_clean = sales_df.drop(columns=['comments', 'orderDate',
                                        'shippedDate', 'requiredDate'])

In [None]:
sales_df_clean.info()

In [None]:
# nulos
sales_df_clean.isna().sum()
# sales_df_clean.dropna(inplace=True)
# sales_df_clean.isna().sum()

In [None]:
# extremos
z_scores = (sales_df_clean-sales_df_clean.mean(numeric_only=True)) / \
    sales_df_clean.std(numeric_only=True)
z_scores_abs = z_scores.apply(np.abs)
print(tabulate(z_scores_abs, headers='keys'))

In [None]:
umbral = 3

out_mask = ~z_scores[z_scores_abs > umbral].isna()
print('\nOutliers per column:\n')
print(out_mask.sum())

In [None]:
outliers = sales_df_clean['quantityOrdered'][out_mask['quantityOrdered']]
print('Outliers:\n', outliers)

In [None]:
sales_df_clean['quantityOrdered'].describe()

In [None]:
sales_df_clean.drop(outliers, inplace=True)
sales_df_clean.shape

In [None]:
# duplicados
sales_df_clean[sales_df_clean.duplicated()]

In [410]:
sales_df_clean['complete_order_number'] = sales_df_clean['orderNumber'].astype(
    'str')+'-'+sales_df_clean['orderLineNumber'].astype('str')

In [None]:
sales_df_clean.head()

In [None]:
sales_df_clean.info()

In [None]:
dup_ordnums = sales_df_clean[sales_df_clean.duplicated(
)]['complete_order_number']


dup_ordnums.values

In [None]:
sales_df_clean[sales_df_clean['complete_order_number'].isin(
    dup_ordnums.values)]

In [None]:
sales_df_clean.drop_duplicates(inplace=True)
sales_df_clean[sales_df_clean.duplicated()]

In [None]:
# incoherencias
sales_df_clean.info()

In [None]:
sales_df_clean['status'].unique()

In [None]:
sales_df_clean['productCode'].unique()

In [None]:
# cardinalidad
def calc_cardinalidad(adf):
    result = {}
    for col in adf.columns:
        print('\n- Valores únicos para "{0}"'.format(col), '\n')
        print(adf[col].unique())
        card = len(adf[col].unique())
        print('Num valores únicos: ', len(adf[col].unique()))
        result[col] = card

    return result


sales_card = calc_cardinalidad(sales_df_clean)
print(sales_card)

In [None]:
sales_df_clean.columns

In [47]:
sales_df_clean[['productCode', 'status', 'origin']] = sales_df_clean[[
    'productCode', 'status', 'origin']].astype('category')

In [None]:
sales_df_clean.info()

In [None]:
sales_df_clean.describe()

In [None]:
sales_df_clean.describe(include='category')

In [None]:
# frecuencias
for col in sales_df_clean.columns:

    print('\n- Frecuencias para "{0}"'.format(col), '\n')

    print(sales_df_clean[col].value_counts())

In [None]:
sales_df_clean.columns

In [None]:
# correlación
sales_corr = sales_df_clean.corr('pearson', numeric_only=True)
sales_corr

In [None]:
sales_corr[np.abs(sales_corr) >= 0.7]

In [None]:
# sesgo

sales_skw = sales_df_clean.skew(numeric_only=True)
sales_skw

In [None]:
sales_skw[np.abs(sales_skw) > 2]

In [None]:
# kurtosis
sales_kurt = sales_df_clean.kurt(numeric_only=True)
sales_kurt

In [None]:
sales_kurt[sales_kurt > np.abs(3)]

### payments

In [419]:
payments_df.columns = ['customerNumber',
                       'checkNumber', 'paymentDate', 'amount']


payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customerNumber  278 non-null    int64  
 1   checkNumber     278 non-null    object 
 2   paymentDate     278 non-null    object 
 3   amount          278 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 8.8+ KB


In [417]:
payments_df.isna().sum()

0              0
checkNumber    0
0000-00-00     0
0.00           0
dtype: int64

In [423]:
# extremos
amount_col = payments_df['amount']


q1 = np.percentile(amount_col, 25)
q3 = np.percentile(amount_col, 75)
iqr = q3 - q1
print('iqr:\n', iqr)

umbra_sup = q3+1.5*iqr
umbra_inf = q1-1.5*iqr

print('umbrales inf:\n', umbra_inf)
print('\numbrales sup:\n', umbra_sup)

iqr:
 29892.835000000003
umbrales inf:
 -29695.117500000004

umbrales sup:
 89876.2225


In [437]:
am_outliers = amount_col[((amount_col < umbra_inf) | (amount_col > umbra_sup))]
am_outliers

Index([17, 23, 41, 43, 61], dtype='int64')

In [436]:
amount_col.describe()

count       278.000000
mean      31827.944281
std       21096.143249
min         615.450000
25%       15144.135000
50%       31369.150000
75%       45036.970000
max      120166.580000
Name: amount, dtype: float64

In [438]:
payments_df.drop(am_outliers.index, inplace=True)
payments_df.shape

(273, 4)

In [439]:
#duplicados
payments_df.duplicated().sum()

np.int64(5)

In [440]:
payments_df[payments_df.duplicated()]

Unnamed: 0,customerNumber,checkNumber,paymentDate,amount
32,129,ID449593,2003-12-11,13923.93
86,175,CITI3434344,2005-05-19,14500.78
144,260,IO164641,2004-08-30,13527.58
215,381,GB117430,2005-02-03,7379.9
269,487,AH612904,2003-09-28,14997.09


In [441]:
payments_df['customer-check'] = payments_df['customerNumber'].astype(
    str)+'-'+payments_df['checkNumber'].astype(str)
payments_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 273 entries, 0 to 277
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customerNumber  273 non-null    int64  
 1   checkNumber     273 non-null    object 
 2   paymentDate     273 non-null    object 
 3   amount          273 non-null    float64
 4   customer-check  273 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 12.8+ KB


In [442]:
payments_df[payments_df.duplicated()]

Unnamed: 0,customerNumber,checkNumber,paymentDate,amount,customer-check
32,129,ID449593,2003-12-11,13923.93,129-ID449593
86,175,CITI3434344,2005-05-19,14500.78,175-CITI3434344
144,260,IO164641,2004-08-30,13527.58,260-IO164641
215,381,GB117430,2005-02-03,7379.9,381-GB117430
269,487,AH612904,2003-09-28,14997.09,487-AH612904


In [443]:
cust_check_ids = payments_df[payments_df.duplicated()]['customer-check'].values
cust_check_ids

array(['129-ID449593', '175-CITI3434344', '260-IO164641', '381-GB117430',
       '487-AH612904'], dtype=object)

In [444]:
payments_df[payments_df['customer-check'].isin(cust_check_ids)]

Unnamed: 0,customerNumber,checkNumber,paymentDate,amount,customer-check
31,129,ID449593,2003-12-11,13923.93,129-ID449593
32,129,ID449593,2003-12-11,13923.93,129-ID449593
85,175,CITI3434344,2005-05-19,14500.78,175-CITI3434344
86,175,CITI3434344,2005-05-19,14500.78,175-CITI3434344
143,260,IO164641,2004-08-30,13527.58,260-IO164641
144,260,IO164641,2004-08-30,13527.58,260-IO164641
214,381,GB117430,2005-02-03,7379.9,381-GB117430
215,381,GB117430,2005-02-03,7379.9,381-GB117430
268,487,AH612904,2003-09-28,14997.09,487-AH612904
269,487,AH612904,2003-09-28,14997.09,487-AH612904


In [445]:
def doNothing(x):
    return list(x)[0]


added_payments_df = payments_df.groupby('customer-check').agg(
    {'amount': 'sum', 'customerNumber': doNothing, 'checkNumber': doNothing, 'paymentDate': doNothing}).reset_index()
added_payments_df[added_payments_df['customer-check'].isin(cust_check_ids)]

Unnamed: 0,customer-check,amount,customerNumber,checkNumber,paymentDate
29,129-ID449593,27847.86,129,ID449593,2003-12-11
79,175-CITI3434344,29001.56,175,CITI3434344,2005-05-19
136,260-IO164641,27055.16,260,IO164641,2004-08-30
206,381-GB117430,14759.8,381,GB117430,2005-02-03
259,487-AH612904,29994.18,487,AH612904,2003-09-28


In [446]:
# incoherencias
added_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customer-check  268 non-null    object 
 1   amount          268 non-null    float64
 2   customerNumber  268 non-null    int64  
 3   checkNumber     268 non-null    object 
 4   paymentDate     268 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 10.6+ KB


In [447]:
added_payments_df['paymentDate'] = pd.to_datetime(
    added_payments_df['paymentDate'])

added_payments_df['checkNumber'] = added_payments_df['checkNumber'].astype(
    'category')
added_payments_df['customer-check'] = added_payments_df['customer-check'].astype(
    'category')

In [448]:
added_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   customer-check  268 non-null    category      
 1   amount          268 non-null    float64       
 2   customerNumber  268 non-null    int64         
 3   checkNumber     268 non-null    category      
 4   paymentDate     268 non-null    datetime64[ns]
dtypes: category(2), datetime64[ns](1), float64(1), int64(1)
memory usage: 27.8 KB


In [449]:
payments_card = calc_cardinalidad(added_payments_df)
print(payments_card)


- Valores únicos para "customer-check" 

['103-HQ336336', '103-JM555205', '103-OM314933', '112-BO864823', '112-HQ55022', ..., '495-BH167026', '495-FN155234', '496-EU531600', '496-MB342426', '496-MN89921']
Length: 268
Categories (268, object): ['103-HQ336336', '103-JM555205', '103-OM314933', '112-BO864823', ..., '495-FN155234', '496-EU531600', '496-MB342426', '496-MN89921']
Num valores únicos:  268

- Valores únicos para "amount" 

[ 6066.78 14571.44  1676.14 14191.12 32641.98 33347.88 45864.03 82261.22
  7565.08 44894.74 19501.82 47924.19 49523.67 50218.95  1491.38 17876.32
 34638.14 85410.87 11044.3  83598.04 47142.7  55639.66 43369.3  45084.38
 10549.01 24101.81 33820.62  7466.32 26248.78 27847.86 16537.85 22292.62
 50025.35 35321.97 36251.03 36140.38 46895.48 59830.55 65071.26 49539.37
 40206.2  63843.55 35420.74 20009.53 26155.91 36005.71  7674.94  4710.73
 28211.7  20564.86 53959.21 40978.53 49614.72 39712.1  44380.15  2611.84
  3516.04 58793.53 20314.44 58841.35 39964.63 35152.1

In [450]:
# frecuencias
for col in added_payments_df.columns:
    print('\n- Frecuencias para "{0}"'.format(col), '\n')
    print(added_payments_df[col].value_counts())


- Frecuencias para "customer-check" 

customer-check
496-MN89921     1
103-HQ336336    1
103-JM555205    1
103-OM314933    1
112-BO864823    1
               ..
121-MA302151    1
121-KI831359    1
121-FD317790    1
121-DB889831    1
119-NG94694     1
Name: count, Length: 268, dtype: int64

- Frecuencias para "amount" 

amount
52166.00    1
6066.78     1
14571.44    1
1676.14     1
14191.12    1
           ..
34638.14    1
17876.32    1
1491.38     1
50218.95    1
49523.67    1
Name: count, Length: 268, dtype: int64

- Frecuencias para "customerNumber" 

customerNumber
141    11
124     7
128     4
161     4
121     4
       ..
473     2
239     1
211     1
415     1
450     1
Name: count, Length: 98, dtype: int64

- Frecuencias para "checkNumber" 

checkNumber
PT550181    1
AB661578    1
AD304085    1
AD832091    1
AE192287    1
           ..
AU750837    1
AU364101    1
AP286625    1
AO757239    1
AM968797    1
Name: count, Length: 268, dtype: int64

- Frecuencias para "paymentDate" 


In [451]:
# correlación
payments_corr = added_payments_df.corr('pearson', numeric_only=True)
payments_corr

Unnamed: 0,amount,customerNumber
amount,1.0,-0.137231
customerNumber,-0.137231,1.0


In [452]:
payments_corr[np.abs(payments_corr) >= 0.7]

Unnamed: 0,amount,customerNumber
amount,1.0,
customerNumber,,1.0


In [453]:
# sesgo

payments_skw = added_payments_df.skew(numeric_only=True)
payments_skw

amount            0.437524
customerNumber    0.314037
dtype: float64

In [454]:
payments_skw[np.abs(payments_skw) > 2]

Series([], dtype: float64)

In [455]:
# kurtosis
payments_kurt = added_payments_df.kurt(numeric_only=True)
payments_kurt

amount            0.079298
customerNumber   -1.201093
dtype: float64

In [456]:
payments_kurt[payments_kurt > np.abs(3)]

Series([], dtype: float64)

### Mezclado de datos

In [474]:
sales_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2988 entries, 0 to 3000
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   orderNumber            2988 non-null   int64  
 1   orderLineNumber        2988 non-null   int64  
 2   customerNumber         2988 non-null   int64  
 3   employeeNumber         2988 non-null   int64  
 4   productCode            2988 non-null   object 
 5   status                 2988 non-null   object 
 6   quantityOrdered        2988 non-null   int64  
 7   priceEach              2988 non-null   float64
 8   sales_amount           2988 non-null   float64
 9   origin                 2988 non-null   object 
 10  complete_order_number  2988 non-null   object 
dtypes: float64(2), int64(5), object(4)
memory usage: 280.1+ KB


In [475]:
added_payments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   customer-check  268 non-null    category      
 1   amount          268 non-null    float64       
 2   customerNumber  268 non-null    int64         
 3   checkNumber     268 non-null    category      
 4   paymentDate     268 non-null    datetime64[ns]
dtypes: category(2), datetime64[ns](1), float64(1), int64(1)
memory usage: 27.8 KB


In [476]:
merged_df = pd.merge(sales_df_clean, added_payments_df,
                     on='customerNumber', how='left')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11070 entries, 0 to 11069
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   orderNumber            11070 non-null  int64         
 1   orderLineNumber        11070 non-null  int64         
 2   customerNumber         11070 non-null  int64         
 3   employeeNumber         11070 non-null  int64         
 4   productCode            11070 non-null  object        
 5   status                 11070 non-null  object        
 6   quantityOrdered        11070 non-null  int64         
 7   priceEach              11070 non-null  float64       
 8   sales_amount           11070 non-null  float64       
 9   origin                 11070 non-null  object        
 10  complete_order_number  11070 non-null  object        
 11  customer-check         11070 non-null  category      
 12  amount                 11070 non-null  float64       
 13  c

In [477]:
merged_df.head()

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,employeeNumber,productCode,status,quantityOrdered,priceEach,sales_amount,origin,complete_order_number,customer-check,amount,checkNumber,paymentDate
0,10100,1,363,1216,S24_3969,Shipped,49,35.29,1729.21,spain,10100-1,363-HL575273,50799.69,HL575273,2004-11-17
1,10100,1,363,1216,S24_3969,Shipped,49,35.29,1729.21,spain,10100-1,363-IS232033,10223.83,IS232033,2003-01-16
2,10100,1,363,1216,S24_3969,Shipped,49,35.29,1729.21,spain,10100-1,363-PN238558,55425.77,PN238558,2003-12-05
3,10100,2,363,1216,S18_2248,Shipped,50,55.09,2754.5,spain,10100-2,363-HL575273,50799.69,HL575273,2004-11-17
4,10100,2,363,1216,S18_2248,Shipped,50,55.09,2754.5,spain,10100-2,363-IS232033,10223.83,IS232033,2003-01-16


In [478]:
merged_df.tail()

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,employeeNumber,productCode,status,quantityOrdered,priceEach,sales_amount,origin,complete_order_number,customer-check,amount,checkNumber,paymentDate
11065,10425,12,119,1370,S10_4962,In Process,38,131.49,4996.62,spain,10425-12,119-LN373447,47924.19,LN373447,2004-08-08
11066,10425,12,119,1370,S10_4962,In Process,38,131.49,4996.62,spain,10425-12,119-NG94694,49523.67,NG94694,2005-02-22
11067,10425,13,119,1370,S18_4600,In Process,38,107.76,4094.88,spain,10425-13,119-DB933704,19501.82,DB933704,2004-11-14
11068,10425,13,119,1370,S18_4600,In Process,38,107.76,4094.88,spain,10425-13,119-LN373447,47924.19,LN373447,2004-08-08
11069,10425,13,119,1370,S18_4600,In Process,38,107.76,4094.88,spain,10425-13,119-NG94694,49523.67,NG94694,2005-02-22


#### Insights by Sales and payments

In [479]:
customer_sales_pays = merged_df.groupby('customerNumber').agg(num=('complete_order_number', 'count'), tot_sale=(
    'sales_amount', 'sum'), tot_ammount=('amount', 'sum')).reset_index()

customer_sales_pays

Unnamed: 0,customerNumber,num,tot_sale,tot_ammount
0,103,21,66943.08,156200.52
1,112,87,240542.94,2325248.42
2,114,220,722340.28,9932178.85
3,119,159,475719.36,6198333.04
4,121,128,416899.16,3335193.28
...,...,...,...,...
93,486,66,223295.61,1709984.98
94,487,30,85140.74,638511.90
95,489,24,59172.30,355033.80
96,495,36,131083.48,1179751.32


In [480]:
print('# top ten por número de compras')
customer_sales_pays.sort_values('num', ascending=False)[
    ['customerNumber', 'num']].head(10)

# top ten por número de compras


Unnamed: 0,customerNumber,num
9,141,2849
5,124,1260
2,114,220
14,151,192
58,323,184
47,276,184
67,353,164
3,119,159
26,187,153
11,145,144


In [481]:
print('# top ten por monto de compras')
customer_sales_pays.sort_values('tot_sale', ascending=False)[
    ['customerNumber', 'tot_sale']].head(10)

# top ten por monto de compras


Unnamed: 0,customerNumber,tot_sale
9,141,9027584.94
5,124,4142791.38
2,114,722340.28
14,151,711655.8
58,323,618488.32
47,276,548136.88
11,145,516340.48
67,353,507932.76
3,119,475719.36
13,148,468693.0


In [482]:
print('# top ten por monto de pagos')
customer_sales_pays.sort_values('tot_ammount', ascending=False)[
    ['customerNumber', 'tot_ammount']].head(10)

# top ten por monto de pagos


Unnamed: 0,customerNumber,tot_ammount
9,141,124155300.0
5,124,66832060.0
2,114,9932179.0
14,151,8539870.0
26,187,7568915.0
58,323,7112616.0
47,276,6303574.0
3,119,6198333.0
48,278,5738836.0
97,496,5495865.0


#### Insights by origin

In [483]:
by_origin = merged_df.groupby('origin').agg(num=('complete_order_number', 'count'), tot_sale=(
    'sales_amount', 'sum'), tot_ammount=('amount', 'sum')).reset_index()

by_origin

Unnamed: 0,origin,num,tot_sale,tot_ammount
0,japan,385,1311738.0,9623561.0
1,spain,10685,34198629.7,409937600.0


#### Insights by date

In [484]:
paymentDate = merged_df['paymentDate']

by_date = merged_df.groupby([paymentDate.dt.year, paymentDate.dt.month]).agg(num=(
    'orderNumber', 'count'), tot_sale=('sales_amount', 'sum'), tot_ammount=('amount', 'sum'))

by_date.index.names = ['year', 'month']

by_date

Unnamed: 0_level_0,Unnamed: 1_level_0,num,tot_sale,tot_ammount
year,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2003,1,81,264884.69,717057.94
2003,2,327,1053999.45,13962943.76
2003,3,189,564074.4,7656653.14
2003,4,314,1066448.75,5603591.46
2003,5,239,765701.4,6398322.22
2003,6,135,442817.16,6024837.25
2003,7,468,1470009.14,13198441.27
2003,8,153,473438.78,4755693.99
2003,9,134,396909.32,4782035.76
2003,10,515,1656871.86,19166694.82


In [485]:
print('# top años por número de compras')
by_date.sort_values('num', ascending=False).groupby('year').agg({'num': sum})

# top años por número de compras


  by_date.sort_values('num', ascending=False).groupby('year').agg({'num': sum})


Unnamed: 0_level_0,num
year,Unnamed: 1_level_1
2003,3916
2004,5463
2005,1691


In [486]:
print('# top meses por número de compras')
by_date.groupby('month').agg({'num': sum}).sort_values(
    'num', ascending=False).head(3)

# top meses por número de compras


  by_date.groupby('month').agg({'num': sum}).sort_values(


Unnamed: 0_level_0,num
month,Unnamed: 1_level_1
11,1719
12,1512
5,1126


In [489]:
merged_df_corr = merged_df.corr('pearson', numeric_only=True)
merged_df_corr

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,employeeNumber,quantityOrdered,priceEach,sales_amount,amount
orderNumber,1.0,-0.043316,-0.048589,0.077729,0.062886,-0.003145,0.036512,0.067693
orderLineNumber,-0.043316,1.0,-0.04492,-0.02097,-0.030415,0.000763,-0.025681,0.08576
customerNumber,-0.048589,-0.04492,1.0,0.03623,-0.006134,-0.008989,-0.007327,-0.246693
employeeNumber,0.077729,-0.02097,0.03623,1.0,-0.014445,-0.021298,-0.02608,0.044556
quantityOrdered,0.062886,-0.030415,-0.006134,-0.014445,1.0,0.025146,0.569006,0.010418
priceEach,-0.003145,0.000763,-0.008989,-0.021298,0.025146,1.0,0.807126,-0.00204
sales_amount,0.036512,-0.025681,-0.007327,-0.02608,0.569006,0.807126,1.0,0.00279
amount,0.067693,0.08576,-0.246693,0.044556,0.010418,-0.00204,0.00279,1.0


In [495]:
merged_df_corr[(merged_df_corr > 0.7) & (merged_df_corr != 1)]

Unnamed: 0,orderNumber,orderLineNumber,customerNumber,employeeNumber,quantityOrdered,priceEach,sales_amount,amount
orderNumber,,,,,,,,
orderLineNumber,,,,,,,,
customerNumber,,,,,,,,
employeeNumber,,,,,,,,
quantityOrdered,,,,,,,,
priceEach,,,,,,,0.807126,
sales_amount,,,,,,0.807126,,
amount,,,,,,,,
