In [None]:
# Import necessary libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load the dataset from Hugging Face 
dataset = load_dataset("lukebarousse/data_jobs")
df = dataset['train'].to_pandas()

# Clean up the 'job_posted_date' column by converting it to datetime
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'], errors='coerce')

In [14]:
df.loc[:10, 'salary_rate':'salary_hour_avg'].dropna(subset='salary_rate')

Unnamed: 0,salary_rate,salary_year_avg,salary_hour_avg


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

In [5]:
median_salary_year_avg = df['salary_year_avg'].median()
print("Median of salary_year_avg:", median_salary_year_avg)

Median of salary_year_avg: 115000.0


In [8]:
# Create a copy of the original dataframe to avoid modifying it
df_filled = df.copy()

# Calculate the median for salary_year_avg and salary_hour_avg
median_salary_year_avg = df['salary_year_avg'].median()

# Calculate the median for salary_hour_avg
median_salary_hour_avg = df_filled['salary_hour_avg'].median()

# Fill NaN values in the respective columns with the calculated medians
df_filled['salary_year_avg'] = df_filled['salary_year_avg'].fillna(median_salary_year_avg)
df_filled['salary_hour_avg'] = df_filled['salary_hour_avg'].fillna(median_salary_hour_avg)

In [11]:
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        785741 non-null  floa

In [19]:
df_filled.loc[:10, 'salary_rate':'salary_hour_avg']

Unnamed: 0,salary_rate,salary_year_avg,salary_hour_avg
0,,115000.0,45.98
1,,115000.0,45.98
2,,115000.0,45.98
3,,115000.0,45.98
4,,115000.0,45.98
5,,115000.0,45.98
6,,115000.0,45.98
7,,115000.0,45.98
8,,115000.0,45.98
9,,115000.0,45.98


# Removing Duplicates

In [20]:
# Create a copy of df_filled
df_unique = df_filled.copy()

# Remove duplicate rows
df_unique = df_unique.drop_duplicates()

# Calculate the number of rows before and after removing duplicates
rows_before = df_filled.shape[0]
rows_after = df_unique.shape[0]

# Print the number of rows dropped
print(f"Number of rows before removing duplicates: {rows_before}")
print(f"Number of rows after removing duplicates: {rows_after}")
print(f"Number of rows dropped: {rows_before - rows_after}")

Number of rows before removing duplicates: 785741
Number of rows after removing duplicates: 785640
Number of rows dropped: 101


In [25]:
df_unique = df_unique.drop_duplicates(subset=['job_title', 'company_name'])

# Print the number of rows dropped
rows_after_subset = df_unique.shape[0]
print(f"Number of rows before removing duplicates based on subset: {rows_before}")
print(f"Number of rows after removing duplicates based on subset: {rows_after_subset}")
print(f"Number of rows dropped based on subset: {rows_before - rows_after_subset}")

Number of rows before removing duplicates based on subset: 785741
Number of rows after removing duplicates based on subset: 508042
Number of rows dropped based on subset: 277699
