In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

##### The Objective of the exercise:
 - to practice data cleaning and data manipulation on a data set
 - to gain insights on the data set

This is a generated data for data cleaning exercise, which contains data on people's name, age, city, occupation and salary.
Find:
- the highest and lowest salary in this data set
- the average salary
- the median salary

In [2]:
# Open your data file and store it in a dataframe
fileName = r"C:\Users\steph\Documents\ASSOCIATE DATA ANALYST COURSE MATERIALS\PYTHON ENTRY-LEVEL DATA ANALYST\Class Exercises\Datasets\Generated_Data_for_Cleaning_Exercise.csv"
df = pd.read_csv(fileName)

In [3]:
# Let's look at the data
# Look at the top of the data
df

Unnamed: 0,Name,Age,City,Occupation,Salary
0,Liam,63,san jose,Manager,95379
1,Olivia,31,NYC,,48359
2,thomas,46,San Diego,Designer,133816
3,Hannah,61,San Diego,Developer,147751
4,Uma,39,los angeles,Analyst,"$95,066"
...,...,...,...,...,...
100,Wendy,31,san diego,Designer,"$126,226"
101,Charlie,47,San Antonio,Nurse,108350
102,Grace,23,NYC,technician,40900
103,Victor,22,San antonio,,"$141,430"


In [4]:
# Look at the tail of the data
df.tail()

Unnamed: 0,Name,Age,City,Occupation,Salary
100,Wendy,31,san diego,Designer,"$126,226"
101,Charlie,47,San Antonio,Nurse,108350
102,Grace,23,NYC,technician,40900
103,Victor,22,San antonio,,"$141,430"
104,Quinn,24,Phoenix,,"$62,955"


In [5]:
# Let's take random sample from the data to get a better look at the data
df.sample(5)

Unnamed: 0,Name,Age,City,Occupation,Salary
5,Yara,42,Los Angeles,Engineer,
87,Grace,34,Phoenix,doctor,116047
85,Wendy,65,Phoenix,Manager,115679
44,Grace,32,,software developer,
33,THOMAS,43,NYC,Designer,"$55,335"


In [6]:
# Let's look at how many data we have - rows and columns
df.shape

(105, 5)

In [7]:
# Let's get more information on the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        105 non-null    object
 1   Age         105 non-null    int64 
 2   City        99 non-null     object
 3   Occupation  93 non-null     object
 4   Salary      76 non-null     object
dtypes: int64(1), object(4)
memory usage: 4.2+ KB


##### Observations so far:
1) There are 105 rows and 5 columns in the data
2) Salary column currently is not being stored in a numeric format
3) There are entries that have NaN values - these need to be cleaned up, especially for the Salary column where we would need to find the average salary


##### Data Cleaning to do:

In [8]:
# Let's see how many entries have empty values
df.isna().sum() #can use isnull also over here

Name           0
Age            0
City           6
Occupation    12
Salary        29
dtype: int64

In [11]:
# Let's sort and see which column has the most empty values
df.isnull().sum().sort_values(ascending=False)

Salary        29
Occupation    12
City           6
Name           0
Age            0
dtype: int64

In [13]:
# Let's take a closer look at the Salary column
df['Salary'].head(10)

0       95379
1       48359
2      133816
3      147751
4     $95,066
5         NaN
6         NaN
7       59255
8    $129,294
9     $48,365
Name: Salary, dtype: object

##### Observations:
- There are NaN values
- There are entries with numbers
- There are entries with str, example: '$95,066'

In [14]:
# Let's remove the '$'
df['Salary'] = df['Salary'].str.replace('$','')
df['Salary'].head(10)

0      95379
1      48359
2     133816
3     147751
4     95,066
5        NaN
6        NaN
7      59255
8    129,294
9     48,365
Name: Salary, dtype: object

In [15]:
# Let's remove the ','
df['Salary'] = df['Salary'].str.replace(',','')
df['Salary'].head(10)

0     95379
1     48359
2    133816
3    147751
4     95066
5       NaN
6       NaN
7     59255
8    129294
9     48365
Name: Salary, dtype: object

In [16]:
# Now we need to convert the Salary column from str object to numeric
df['Salary'] = pd.to_numeric(df['Salary']).astype(float)
df['Salary'].describe()


count        76.000000
mean      95304.000000
std       32866.132797
min       40900.000000
25%       63927.750000
50%       98756.000000
75%      121139.000000
max      149377.000000
Name: Salary, dtype: float64

In [None]:
# Remove the '$' symbol and commas from the 'Salary' column and convert to float
#  df['Salary'] = df['Salary'].str.replace('$', '', regex=False).str.replace(',', '').astype(float)

#  print(df['Salary'].head())


In [None]:
# Use a lambda function to remove '$' and ',' and convert to float, handling floats and strings

# df['Salary'] = df['Salary'].apply(lambda x: float(str(x).replace('$', '').replace(',', '')))

# print(df['Salary'].head())


In [None]:
# There are many different ways to deal with NaN values
# In this case, to keep thing simple, we are just going to replace the NaN value with 0
# we are assuming these people are currently not working, hence no salary at the moment

df.Occupation.replace({np.nan:0})

df['Occupation'] = df.Occupation.replace({np.nan:0})
df

In [None]:
# Now we need to convert the Salary column from str object to numeric


In [None]:
# Now that the Salary column is a numeric value (float), we can perform mathematical operations on it
# We can answer what is the average salary, highest salary and lowest salary

# average salary
df.Salary.mean()

In [None]:
# highest salary
df.Salary.max()

In [None]:
# lowest salary
df.Salary.min()

##### Who is the person with the highest salary and lowest salary?

In [19]:
# Person with the highest salary
#df.Name.iloc[df.Salary.idxmax()]

df.loc[df['Salary'] == df['Salary'].max()]

Unnamed: 0,Name,Age,City,Occupation,Salary
32,Wendy,23,NYC,,149377.0


In [20]:
# Person with the lowest salary -- this brings up people with no current salary at the moment
df.loc[df['Salary'] == df['Salary'].min()]

Unnamed: 0,Name,Age,City,Occupation,Salary
74,Grace,23,NYC,technician,40900.0
102,Grace,23,NYC,technician,40900.0


##### Do we have duplicate data in our data set?

In [21]:
# To check for duplicates 
df.loc[df.duplicated(keep=False)].sort_values(by='Name')
# this checks all the columns

Unnamed: 0,Name,Age,City,Occupation,Salary
93,Charlie,47,San Antonio,Nurse,108350.0
101,Charlie,47,San Antonio,Nurse,108350.0
74,Grace,23,NYC,technician,40900.0
102,Grace,23,NYC,technician,40900.0
97,Quinn,24,Phoenix,,62955.0
104,Quinn,24,Phoenix,,62955.0
14,Victor,22,San antonio,,141430.0
103,Victor,22,San antonio,,141430.0
18,Wendy,31,san diego,Designer,126226.0
100,Wendy,31,san diego,Designer,126226.0


In [22]:
# Be careful when checking for duplicates
df.loc[df.duplicated(subset=['Name'], keep=False)].sort_values(by='Name')
# As you can see here, there are 2 Bobs, one who lives in San Jose age 55, and one who lives in Chicago age 41. 
# These are Name duplicates but they are not duplicate data entries

Unnamed: 0,Name,Age,City,Occupation,Salary
77,Bob,55,San Jose,Teacher,77975.0
67,Bob,41,Chicago,Technician,
93,Charlie,47,San Antonio,Nurse,108350.0
101,Charlie,47,San Antonio,Nurse,108350.0
63,Charlie,47,Los Angeles,technician,121388.0
...,...,...,...,...,...
21,Yara,35,NYC,Manager,49317.0
61,quinn,43,Los angeles,Manager,69001.0
52,quinn,58,Philadelphia,Doctor,127011.0
20,thomas,40,Phoenix,Doctor,113299.0


In [23]:
# In this case, we are going to remove the duplicate data
df.drop_duplicates(keep = 'first', inplace = True)
df.head()

Unnamed: 0,Name,Age,City,Occupation,Salary
0,Liam,63,san jose,Manager,95379.0
1,Olivia,31,NYC,,48359.0
2,thomas,46,San Diego,Designer,133816.0
3,Hannah,61,San Diego,Developer,147751.0
4,Uma,39,los angeles,Analyst,95066.0


In [24]:
# Now when we double-check for duplicates, there are none (because we have already removed them)
df.loc[df.duplicated(keep=False)].sort_values(by='Name')

Unnamed: 0,Name,Age,City,Occupation,Salary


In [25]:
# Now if we checked, there are now 100 rows (previously 105)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        100 non-null    object 
 1   Age         100 non-null    int64  
 2   City        94 non-null     object 
 3   Occupation  90 non-null     object 
 4   Salary      71 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.7+ KB


In [27]:
# Let's create a new column called Job Status
# In this column, if the person has a Salary > 0, the status would be Employed, otherwise it's Unemployed

df['Job Status'] = np.where(df['Salary'] > 0, "Employed", "Unemployed")
df.head(20)

Unnamed: 0,Name,Age,City,Occupation,Salary,Job Status
0,Liam,63,san jose,Manager,95379.0,Employed
1,Olivia,31,NYC,,48359.0,Employed
2,thomas,46,San Diego,Designer,133816.0,Employed
3,Hannah,61,San Diego,Developer,147751.0,Employed
4,Uma,39,los angeles,Analyst,95066.0,Employed
5,Yara,42,Los Angeles,Engineer,,Unemployed
6,Paul,38,Philadelphia,nurse,,Unemployed
7,Yara,62,Houston,,59255.0,Employed
8,Eve,36,Houston,Manager,129294.0,Employed
9,Charlie,43,,Designer,48365.0,Employed


In [31]:
# Let's find out how many people are employed vs unemployed

df['Job Status'].value_counts()

Job Status
Employed      71
Unemployed    29
Name: count, dtype: int64

In [32]:
# Now in this data set, we can see some people have occupation listed as NaN but they have salary
df.loc[df['Occupation'].isna()]

Unnamed: 0,Name,Age,City,Occupation,Salary,Job Status
1,Olivia,31,NYC,,48359.0,Employed
7,Yara,62,Houston,,59255.0,Employed
14,Victor,22,San antonio,,141430.0,Employed
32,Wendy,23,NYC,,149377.0,Employed
41,Noah,30,san jose,,144998.0,Employed
47,UMA,24,new york,,119429.0,Employed
50,Uma,39,NYC,,41058.0,Employed
55,mia,60,San Diego,,,Unemployed
73,Liam,28,NYC,,,Unemployed
97,Quinn,24,Phoenix,,62955.0,Employed


In [33]:
# Let's designate them as 'Self-employed' for ppl with Salary
# first we designate all NaN as 'Self-employed'

df['Occupation'] = df['Occupation'].fillna('Self-employed')
df.loc[df['Occupation'].isna()]

# now all NaN is 'Self-employed'

Unnamed: 0,Name,Age,City,Occupation,Salary,Job Status


In [34]:
# let's find those 2 entries where the Salary = 0 and Occupation is self-employed
df.loc[(df['Salary'] == 0) & (df['Occupation'] == 'Self-employed')]

Unnamed: 0,Name,Age,City,Occupation,Salary,Job Status


In [37]:
# let's change 'Self-employed' to 'Unemployed' where Salary is 0
df['Occupation'] = np.where((df['Occupation'] == 'Self-employed') & (df['Salary'] == 0), 'Unemployed', df['Occupation'])

In [38]:
# When we check those data entries: 55 and 73, now they are showing both as Unemployed
display(df.iloc[55])
display(df.iloc[73])

Name                    mia
Age                      60
City              San Diego
Occupation    Self-employed
Salary                  NaN
Job Status       Unemployed
Name: 55, dtype: object

Name                   Liam
Age                      28
City                    NYC
Occupation    Self-employed
Salary                  NaN
Job Status       Unemployed
Name: 73, dtype: object

In [39]:
# Let's take a look at the data set so far
df.head(10)

Unnamed: 0,Name,Age,City,Occupation,Salary,Job Status
0,Liam,63,san jose,Manager,95379.0,Employed
1,Olivia,31,NYC,Self-employed,48359.0,Employed
2,thomas,46,San Diego,Designer,133816.0,Employed
3,Hannah,61,San Diego,Developer,147751.0,Employed
4,Uma,39,los angeles,Analyst,95066.0,Employed
5,Yara,42,Los Angeles,Engineer,,Unemployed
6,Paul,38,Philadelphia,nurse,,Unemployed
7,Yara,62,Houston,Self-employed,59255.0,Employed
8,Eve,36,Houston,Manager,129294.0,Employed
9,Charlie,43,,Designer,48365.0,Employed


In [40]:
# In the Name column, the name are entered inconsistently, let's capitalize the name
df['Name'] = df['Name'].str.capitalize()
df.head(10)

Unnamed: 0,Name,Age,City,Occupation,Salary,Job Status
0,Liam,63,san jose,Manager,95379.0,Employed
1,Olivia,31,NYC,Self-employed,48359.0,Employed
2,Thomas,46,San Diego,Designer,133816.0,Employed
3,Hannah,61,San Diego,Developer,147751.0,Employed
4,Uma,39,los angeles,Analyst,95066.0,Employed
5,Yara,42,Los Angeles,Engineer,,Unemployed
6,Paul,38,Philadelphia,nurse,,Unemployed
7,Yara,62,Houston,Self-employed,59255.0,Employed
8,Eve,36,Houston,Manager,129294.0,Employed
9,Charlie,43,,Designer,48365.0,Employed


In [None]:
# In the City column, the cities are entered inconsistently, let's capitalize each word, so san jose will become San Jose
df['City'] = df['City'].str.title()
df['City'] = df['City'].replace("Nyc", "New York")
df.head(10)

In [None]:
# Clean up the Occupation column (use any one of the codes below) 
# df["Occupation"] = df["Occupation"].apply(lambda x: x.capitalize())
# df['Occupation'] = df["Occupation"].replace("Developer", "Software Developer")
# df['Occupation'] = df["Occupation"].replace("Software developer", "Software Developer")

df.head(10)

In [None]:
# Group Salary by Occupation


In [None]:
# Group Salary by City


In [None]:
#Graphing total inhabitants per city

In [None]:
# Graphing the distribution of occupations in the data


In [None]:
# Histogram of salaries

In [None]:
# Scatterplot
