# Pandas: Data Cleaning

In [47]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the 'lukebarousse/data_jobs' dataset with `load_dataset` and turn
# its 'train' split into a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data cleanup: pass the posting-date column through pd.to_datetime and
# store the converted values back under the same column name.
df['job_posted_date'] = df['job_posted_date'].pipe(pd.to_datetime)

In [48]:
# Preview the two salary columns for row labels 0 through 10.
# `.loc` slices rows by label, so the end label (10) is included.
# The columns are named explicitly rather than label-sliced so the preview
# does not silently change if the frame's column order ever does.
df.loc[:10, ['salary_year_avg', 'salary_hour_avg']]

Unnamed: 0,salary_year_avg,salary_hour_avg
0,,
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,


In [49]:
# Median of the yearly-salary column; missing values are skipped
# (the pandas default, spelled out here for the reader).
median_salary_year = df['salary_year_avg'].median(skipna=True)

In [50]:
# Median of the hourly-salary column; missing values are skipped
# (the pandas default, spelled out here for the reader).
median_salary_hour = df['salary_hour_avg'].median(skipna=True)

In [51]:
# Build a new frame in which missing yearly / hourly salaries are replaced
# by the matching column median. `assign` works on a copy, so `df` itself
# keeps its missing values.
df_filled = df.assign(
    salary_year_avg=df['salary_year_avg'].fillna(median_salary_year),
    salary_hour_avg=df['salary_hour_avg'].fillna(median_salary_hour),
)

In [52]:
# Drop rows that are duplicated across every column.
# drop_duplicates() returns a new DataFrame and leaves `df_filled` untouched,
# so no separate .copy() of the full frame is needed beforehand.
df_unique = df_filled.drop_duplicates()

# Report how many rows the de-duplication removed.
print('Length of original df:       ', len(df_filled))
print('Length of drop duplicates df:', len(df_unique))
print('Rows Dropped:                 ', len(df_filled) - len(df_unique))

Length of original df:        785741
Length of drop duplicates df: 785640
Rows Dropped:                  101


In [53]:
# Drop rows that repeat the same (job_title, company_name) pair.
# drop_duplicates() returns a new DataFrame and leaves `df_filled` untouched,
# so no separate .copy() of the full frame is needed beforehand.
df_unique = df_filled.drop_duplicates(subset=['job_title', 'company_name'])

# Report how many rows the de-duplication removed.
print('Length of original df:       ', len(df_filled))
print('Length of drop duplicates df:', len(df_unique))
print('Rows Dropped:                 ', len(df_filled) - len(df_unique))

Length of original df:        785741
Length of drop duplicates df: 508042
Rows Dropped:                  277699


# Problems

In [54]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the dataset again and rebuild `df` from its 'train' split, so the
# problems below start from a freshly loaded frame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data cleanup: pass the posting-date column through pd.to_datetime and
# store the converted values back under the same column name.
df['job_posted_date'] = df['job_posted_date'].pipe(pd.to_datetime)

## Remove Rows with Missing Values (2.2.1) - Problem

In [55]:

# Keep only the rows that have a yearly salary value.
# `subset` is passed as a list, matching the other `subset=[...]` calls in
# this notebook; a bare string label is believed to be accepted only by newer
# pandas releases (TODO confirm against the pandas changelog).
df_clean = df.dropna(subset=['salary_year_avg'])

# Report how many rows were removed for lacking a yearly salary.
print('Length of original df:       ', len(df))
print('Length of drop na df:', len(df_clean))
print('Rows Dropped:                 ', len(df) - len(df_clean))

Length of original df:        785741
Length of drop na df: 22003
Rows Dropped:                  763738


## Remove Duplicate Rows (2.2.2) - Problem

In [56]:
# Keep a single row per distinct job_location value; the first occurrence
# of each location is the one retained (pandas default, made explicit).
df_unique_location = df.drop_duplicates(subset=['job_location'], keep='first')

# Row counts before and after, plus the difference.
print('Length of original df:       ', df.shape[0])
print('Length of unique location df:', df_unique_location.shape[0])
print('Rows Dropped:                 ', df.shape[0] - df_unique_location.shape[0])

Length of original df:        785741
Length of unique location df: 17218
Rows Dropped:                  768523


## Fill Missing Values in Column (2.2.3) - Problem

In [58]:
# Placeholder written wherever salary_rate is missing.
unknown = 'Unknown'

# `assign` works on a copy, so `df` keeps its missing salary_rate values
# while `df_unknown` carries the placeholder instead.
df_unknown = df.assign(salary_rate=df['salary_rate'].fillna(unknown))

# Show row labels 0 through 10 of salary_rate, first from the original
# frame and then from the filled one, for a before/after comparison.
for frame in (df, df_unknown):
    print(frame.loc[:10, 'salary_rate'])

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
Name: salary_rate, dtype: object
0     Unknown
1     Unknown
2     Unknown
3     Unknown
4     Unknown
5     Unknown
6     Unknown
7     Unknown
8     Unknown
9     Unknown
10    Unknown
Name: salary_rate, dtype: object
