In [1]:
import pandas as pd
import numpy as np
import re

## Part 1: Upload the data

In [126]:
#☺ Visualize the data
data = pd.read_excel("/Users/Rachad/Desktop/Rachad/Ironhack/Modules_Labs_Ironhack/Module1/lab33/example_data_cleaning.xlsx")
data

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
0,4,34985,1923.0,5670,manager,78,Low
1,16,34997,1923.0,2399090,developer,78,High
2,25,35006,1923.0,33050,HR,78,High
3,12,34993,1939.142857,23430,professor,78,Low
4,21,35002,1939.142857,16770,manager,78,Low
5,11,34992,1943.857143,21210,researcher,78,Medium
6,20,35001,1943.857143,14550,student,78,Medium
7,3,34984,1945.0,3450,student,78,Medium
8,19,35000,1948.571429,12330,barmen,78,High
9,9,34990,1953.285714,16770,Manager,78,Medium


## Part 2: Analyze your data and create a plan for data preparation

In [74]:
# Start by calculating the descriptive statistics 
DescStats = data["Amount"].describe()
DescStats
# Analysis: We can already notice that the std is too high especially compared to the mean. 
# This will result in a very high variance. This shows that there is outliers that should be evaluated.  

count    3.000000e+01
mean     1.010097e+05
std      4.342616e+05
min      1.230000e+03
25%      1.233000e+04
50%      1.899000e+04
75%      2.953500e+04
max      2.399090e+06
Name: Amount, dtype: float64

In [None]:
##Planification for the cleaning:
#1- Examining Data for Potential Issues
#2- Identify and fill in missing values.
#3- Identify and correct incorrect values.
#4- Remove low variance columns.
#5- Identify potential outliers.
#6- Correct incorrect data types.
#7- Remove special characters and clean categorical variables.
#8- Identify and remove duplicate records.

## Part 3: Data cleansing (missing values, outliers, duplicates, data consistently)

#### 1- Examining Data for Potential Issues

In [75]:
#Start by removing trailing spaces at the end of the column names if there are any
data.columns = data.columns.str.rstrip(' ')

In [76]:
#Check the unique values and see if duplicates are due to different spelling or characters 
print('In the column Profession : ', data['Profession'].unique())
print('\n')
print('In the column Profession : ', data['Risk'].unique())

In the column Profession :  ['manager' 'developer' 'HR' 'professor' 'researcher' 'student' 'barmen'
 'Manager' 'bdm' nan 'hr' 'etudient' 'BDM' 'Hairdresser' 'Student'
 'Driver' 'sailer']


In the column Profession :  ['Low' 'High' 'Medium']


In [77]:
#The idea is to make all duplicates written the same way in order to be recognized as same with all functions
data['Profession'] = data['Profession'].str.replace('etudient', 'Student')
data['Profession'] = data['Profession'].str.capitalize()
data['Profession'] = data['Profession'].str.replace('Hr', 'HR')
print('repaired column Profession : ', data['Profession'].unique())

repaired column Profession :  ['Manager' 'Developer' 'HR' 'Professor' 'Researcher' 'Student' 'Barmen'
 'Bdm' nan 'Hairdresser' 'Driver' 'Sailer']


In [131]:
# We can notice that some of the years are miswritten like 1939.14? In this case we should get rid of what's after the dot
# We start by transforming the column BirthYear from int to strings

#data['BirthYear'] = data['BirthYear'].str.replace('(.)+(\d)', '')

#data['BirthYear'] = data.to_string(columns = ['BirthYear'])
#data['BirthYear'] = data['BirthYear'].apply(str)
#data['BirthYear'] = re.sub('(.)+(\d)', '', str(data['BirthYear']))
data['BirthYear'] = data['BirthYear'].astype('object')
# we can retransform the colmun into num by using .astype(int or float or object)...
data['BirthYear'].dtype
#data['BirthYear'] = re.sub(r'\.[0:9]', '', str(data['BirthYear']))

data['BirthYear'] = data[['BirthYear']].split('.', 1)[0]


AttributeError: 'DataFrame' object has no attribute 'split'

In [130]:
type(data['BirthYear'] )

pandas.core.series.Series

In [124]:
data

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
0,4,34985,0 1923\n1 1923\n2 1...,5670,manager,78,Low
1,16,34997,0 1923\n1 1923\n2 1...,2399090,developer,78,High
2,25,35006,0 1923\n1 1923\n2 1...,33050,HR,78,High
3,12,34993,0 1923\n1 1923\n2 1...,23430,professor,78,Low
4,21,35002,0 1923\n1 1923\n2 1...,16770,manager,78,Low
5,11,34992,0 1923\n1 1923\n2 1...,21210,researcher,78,Medium
6,20,35001,0 1923\n1 1923\n2 1...,14550,student,78,Medium
7,3,34984,0 1923\n1 1923\n2 1...,3450,student,78,Medium
8,19,35000,0 1923\n1 1923\n2 1...,12330,barmen,78,High
9,9,34990,0 1923\n1 1923\n2 1...,16770,Manager,78,Medium


#### 2- Identify and fill in missing values.

In [79]:
# missing values: we can notice there is two missing values for BirthYear and two for Profession
data.isnull().sum()

TransactionID    0
ClientID         0
BirthYear        2
Amount           0
Profession       2
Department       0
Risk             0
dtype: int64

##### Evaluating the missing value in Profession and BirthYear


In [80]:
# Check using the clientID if the missing values belong to clients that already have there information in the table
null_displ = data[(data['Profession'].isnull()==True) | (data['BirthYear'].isnull()==True)]
null_displ

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
15,28,35008,1967.0,46370,,78,High
16,29,35008,1976.0,50810,,78,Medium
28,22,34987,,18990,Sailer,78,High
29,7,34988,,12330,Manager,78,Medium


In [82]:
# We can notice that the clientIDs 35008, 34987 and 34988 is already mentionned in the table.

#     For the client 35008 we can notice that a typo of the year of birth was generated : 1976 instead of 1967. 
#     Based on The data the profession of this client seems to be Bdm

#     For the client 34987 we can notice that the profession seems to be different Manager and Sailer. 
#     I have a doubt about imputng the same year as the other two rows belonging to the same clientID

#     For the client 34988 we can notice two rows with same ID and same Profession. Seems to be the same person.  
#     I think we can replace the null value by the year 1999. 

test = data[(data['ClientID']==35008) | (data['ClientID']==34987) | (data['ClientID']==34988)]
test.sort_values("ClientID")

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
11,6,34987,1967.0,10110,Manager,78,Medium
12,15,34987,1967.0,30090,Manager,78,Low
28,22,34987,,18990,Sailer,78,High
22,23,34988,1999.0,25650,Manager,78,Low
29,7,34988,,12330,Manager,78,Medium
14,27,35008,1967.0,41930,Bdm,78,Low
15,28,35008,1967.0,46370,,78,High
16,29,35008,1976.0,50810,,78,Medium


In [97]:
# Now we can fill the missing values in the profession column. We can also correct the year of birth
data[['Profession']] = data[['Profession']].fillna('Bdm')
#data[['BirthYear']] = data[(data['ClientID'] == 34988) & (data['Profession'])].fillna('1999')
data[['BirthYear']] = data[(data['ClientID'] == 34987) & (data['Profession'] == 'Sailer')].fillna('1967')
data[(data['ClientID']==35008) | (data['ClientID']==34987) | (data['ClientID']==34988)]


ValueError: Columns must be same length as key

In [86]:
data

Unnamed: 0,TransactionID,ClientID,BirthYear,Amount,Profession,Department,Risk
0,4,34985,1923.0,5670,Manager,78,Low
1,16,34997,1923.0,2399090,Developer,78,High
2,25,35006,1923.0,33050,HR,78,High
3,12,34993,1939.142857,23430,Professor,78,Low
4,21,35002,1939.142857,16770,Manager,78,Low
5,11,34992,1943.857143,21210,Researcher,78,Medium
6,20,35001,1943.857143,14550,Student,78,Medium
7,3,34984,1945.0,3450,Student,78,Medium
8,19,35000,1948.571429,12330,Barmen,78,High
9,9,34990,1953.285714,16770,Manager,78,Medium


##### Evaluating the missing value in BirthYear


#### 5- Identify potential outliers.

In [58]:
data_pivot = data.pivot_table(index = ['ClientID','Profession','TransactionID','BirthYear'], 
                          values = ['Amount'], aggfunc={'Amount': np.sum})
data_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Amount
ClientID,Profession,TransactionID,BirthYear,Unnamed: 4_level_1
34982,Student,1,2013.0,12900
34983,Barmen,2,2015.0,1230
34984,Student,3,1945.0,3450
34985,Manager,4,1923.0,5670
34986,HR,5,1978.0,7890
34987,Manager,6,1967.0,10110
34987,Manager,15,1967.0,30090
34988,Manager,23,1999.0,25650
34989,HR,8,1958.0,14550
34989,HR,24,1967.0,27870


## Part 4: Encode categorical data

In [70]:
pd.get_dummies(data)

Unnamed: 0,TransactionID,ClientID,Amount,Department,BirthYear_1923.0,BirthYear_1939.14285714286,BirthYear_1943.85714285714,BirthYear_1945.0,BirthYear_1948.57142857143,BirthYear_1953.28571428571,...,Profession_HR,Profession_Hairdresser,Profession_Manager,Profession_Professor,Profession_Researcher,Profession_Sailer,Profession_Student,Risk_High,Risk_Low,Risk_Medium
0,4,34985,5670,78,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,16,34997,2399090,78,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,25,35006,33050,78,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,12,34993,23430,78,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,21,35002,16770,78,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,11,34992,21210,78,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
6,20,35001,14550,78,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
7,3,34984,3450,78,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
8,19,35000,12330,78,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9,9,34990,16770,78,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
