Load libraries

In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


# Data processing: steps to follow:
- Select Relevant Columns - Only keep the important columns for your analysis.
- Handle Missing Values - Either drop or fill missing data.
- Remove Duplicates - Remove duplicate rows from the dataset.
- Validate Data Types - Ensure numeric columns are properly formatted.
- Review Data Consistency - Check for consistency in categorical columns.

In [None]:
# Read the BLS workbook into a DataFrame
bls_source_path = "./Ressources/all_data_M_2023.xlsx"
bls_data = pd.read_excel(bls_source_path)
# Show the first five rows as a quick check that the file was parsed as expected
bls_data.head(5)


In [None]:
# bls_data.describe()


In [None]:
# List the column labels of the raw BLS frame (used to pick the columns to keep)
bls_data.columns

> For your BLS data, the most relevant columns to keep depend on your project's focus, but based on analyzing high-paying jobs and employment trends, these would likely be the most useful:

- AREA and AREA_TITLE: To identify the geographic region and perform location-based analysis.
- NAICS and NAICS_TITLE: For industry classification, which helps analyze trends by industry sector.
- OCC_CODE and OCC_TITLE: For occupation-specific analysis, which is crucial when looking at job types and wages.
- TOT_EMP: Total employment helps in analyzing job concentration and demand.
- H_MEAN and A_MEAN: Hourly and annual mean wages are essential for identifying high-paying jobs.
- H_MEDIAN and A_MEDIAN: Median wages to assess typical earnings in each role.
- H_PCT75 and A_PCT75, H_PCT90 and A_PCT90: These percentile columns help in understanding wage distribution at higher levels, useful for identifying the top earners.

## SELECT THE RELEVANT COLUMNS

 >the columns to keep:
* OCC_CODE and OCC_TITLE: For analyzing specific occupations and job categories.
* NAICS and NAICS_TITLE: If you're looking to correlate job salaries with specific industries.
* AREA and AREA_TITLE: If you're interested in geographic variations, such as comparing salaries across regions.

In [None]:
# Columns retained for the analysis: area, industry (NAICS) and occupation
# identifiers, total employment, PRIM_STATE, and the hourly/annual mean,
# median, 75th- and 90th-percentile wage columns.
relevant_columns = [
    'AREA', 'AREA_TITLE', 'NAICS', 'NAICS_TITLE',
    'OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'PRIM_STATE',
    'H_MEAN', 'A_MEAN', 'H_MEDIAN', 'A_MEDIAN',
    'H_PCT75', 'A_PCT75', 'H_PCT90', 'A_PCT90'
]

# Subset the DataFrame. The explicit .copy() makes bls_df_clean an independent
# frame, so the cleaning steps that modify it later do not raise
# SettingWithCopyWarning and can never write back into bls_data.
bls_df_clean = bls_data[relevant_columns].copy()
bls_df_clean.head()

In [None]:
# Collect the distinct values of every column (one entry per column)
unique_values = bls_df_clean.apply(pd.Series.unique)

# Print the distinct values found in each column
print(unique_values)

In [None]:
# Map the placeholder symbols found in the data to NaN so that they are treated
# as missing values. The result is rebound instead of using inplace=True:
# in-place modification of a frame derived from another frame can raise
# SettingWithCopyWarning and makes the cell harder to chain or re-run.
# NOTE(review): the BLS field descriptions also appear to use '**' as a
# placeholder - confirm whether it should be mapped to NaN here as well.
placeholder_to_nan = {'#': np.nan, '*': np.nan, '@': np.nan, '$': np.nan}
bls_df_clean = bls_df_clean.replace(placeholder_to_nan)

### Handle missing values

In [None]:
# Count the NaN entries in each of the selected columns
missing_values = bls_df_clean.isna().sum(axis=0)
print(missing_values)

In [None]:
# Keep only the rows in which every selected column has a value
bls_df_clean = bls_df_clean.dropna(axis="index", how="any")
# bls_df_clean.info()
# Check whether there are duplicated rows that should be dropped
# bls_df_clean.duplicated()

In [None]:
# Remove duplicate rows, if any (currently disabled).
# Note: drop_duplicates(inplace=True) returns None, so its result must not be
# assigned back; reassign the non-inplace result instead:
# bls_df_clean = bls_df_clean.drop_duplicates()

In [None]:
# Display the first five rows of the cleaned data
bls_df_clean.head()

In [None]:

# Summary statistics of the cleaned dataset.
# describe() reports count/mean/std/min/quartiles/max for numeric columns; if a
# frame has no numeric columns it reports count/unique/top/freq instead.
bls_df_clean.describe()

In [None]:

# List the columns that remain in the cleaned frame
bls_df_clean.columns

In [None]:
# Average each wage measure within every AREA code
wage_columns = [
    'H_MEAN', 'A_MEAN',
    'H_MEDIAN', 'A_MEDIAN',
    'H_PCT75', 'A_PCT75',
    'H_PCT90', 'A_PCT90',
]
mean_per_wage_column = dict.fromkeys(wage_columns, 'mean')
rows_by_area = bls_df_clean.groupby('AREA')
area_summary = rows_by_area.agg(mean_per_wage_column).reset_index()

# Show the first rows of the per-area averages
area_summary.head()

In [None]:
# Keep only the rows whose annual mean wage (A_MEAN) or hourly mean wage
# (H_MEAN) reaches the equivalent of $100K per year.
# 48.08 is approximately 100000 / 2080 (presumably 40 hours x 52 weeks).
reaches_100k_annual = bls_df_clean['A_MEAN'] >= 100000
reaches_100k_hourly = bls_df_clean['H_MEAN'] >= 48.08

# .copy() yields an independent frame: later cells assign columns on
# filtred_bls_df, which would otherwise raise SettingWithCopyWarning because a
# boolean-mask selection is flagged as a slice of bls_df_clean.
filtred_bls_df = bls_df_clean[reaches_100k_annual | reaches_100k_hourly].copy()
filtred_bls_df.head()

In [None]:
# Verify consistency: inspect the dtype of every column to confirm that each
# one has the expected data type.
filtred_bls_df.dtypes

In [None]:
 # Convert total employment to a numeric dtype; unparseable values become NaN.
# OCC_CODE is intentionally left as text: the codes contain a hyphen (a later
# cell filters on exactly that with .str.contains('-')), so
# pd.to_numeric(..., errors='coerce') would turn every code into NaN and the
# dropna below would then discard every row.
filtred_bls_df = filtred_bls_df.assign(
    TOT_EMP=pd.to_numeric(filtred_bls_df['TOT_EMP'], errors='coerce')
)

# Re-check for nulls introduced by the conversion, then drop incomplete rows
tot_emp_missing = filtred_bls_df['TOT_EMP'].isna().sum()
print(f"TOT_EMP values that could not be converted: {tot_emp_missing}")
filtred_bls_df = filtred_bls_df.dropna(how='any')



In [None]:
# Normalize the descriptive text columns:
#   - AREA_TITLE and OCC_TITLE: trim surrounding whitespace, then title-case
#   - NAICS_TITLE: trim surrounding whitespace only
for title_col in ('AREA_TITLE', 'OCC_TITLE'):
    trimmed = filtred_bls_df[title_col].str.strip()
    filtred_bls_df[title_col] = trimmed.str.title()

filtred_bls_df['NAICS_TITLE'] = filtred_bls_df['NAICS_TITLE'].str.strip()

# Show the resulting dtypes
filtred_bls_df.dtypes

In [None]:
# Keep only the rows whose OCC_CODE contains a hyphen; missing codes count as no match
has_hyphen = filtred_bls_df['OCC_CODE'].str.contains('-', na=False)
filtred_bls_df = filtred_bls_df.loc[has_hyphen]

In [None]:
# Final check of the cleaned BLS data: show the last two rows
# filtred_bls_df.head()
display(filtred_bls_df.tail(2))

In [None]:
# Disabled alternative load, kept for reference:
# oews_data = pd.read_csv("./Ressources/educational_attainment.csv",delimiter=';')
# oews_data.head()
# oews_data.columns

# Read the comma-separated extract into Educ_data
educ_source_path = "./Ressources/usa_00006.csv"
Educ_data = pd.read_csv(educ_source_path, sep=',')
# Educ_data.columns
# Preview the first five rows
Educ_data.head(5)

In [None]:
# List the column labels of the loaded extract
Educ_data.columns
# Educ_data.reset_index()

In [None]:
# Count the missing values in each column of the extract
Educ_data.isna().sum(axis=0)

In [None]:
# Collect the distinct values of every column in the extract and display them
unique_values_2 = Educ_data.apply(pd.Series.unique)
unique_values_2

In [None]:
# Columns of the extract that are kept for the analysis
relevant_c = ['REGION','STATEICP' ,'IND','OCCSOC', 'INCTOT', 'INCWAGE', 'EDUC', 'EDUCD', 'SEX', 'AGE']

# .copy() yields an independent frame: later cells drop duplicates and assign
# columns on filtred_owes_df, which would otherwise raise SettingWithCopyWarning
# because a column subset is flagged as a slice of Educ_data.
filtred_owes_df = Educ_data[relevant_c].copy()
filtred_owes_df.head()

In [None]:
# Drop fully duplicated rows. The result is rebound rather than modified with
# inplace=True: in-place modification of a frame derived from another frame can
# raise SettingWithCopyWarning, and inplace offers no performance benefit.
filtred_owes_df = filtred_owes_df.drop_duplicates()

In [None]:
# Inspect the dtype of each retained column
filtred_owes_df.dtypes

In [None]:
# Print a concise summary of the frame (index, columns, non-null counts, dtypes, memory usage)
filtred_owes_df.info()

In [None]:
# Preview the first five rows
filtred_owes_df.head()

In [None]:
# Show the first two and the last two rows of the BLS frame
display(filtred_bls_df.head(2), filtred_bls_df.tail(2))

In [None]:
# Show the last five rows, then the first five rows, of the second frame
display(filtred_owes_df.tail(5), filtred_owes_df.head(5))


In [None]:
# Trim surrounding whitespace from the OCCSOC codes. assign() builds a new frame
# instead of writing a column into a frame derived by slicing, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
filtred_owes_df = filtred_owes_df.assign(
    OCCSOC=filtred_owes_df['OCCSOC'].str.strip()
)


In [None]:
# Preview the first two rows after trimming OCCSOC
filtred_owes_df.head(2)

In [None]:
# Disabled: inner-join the two frames on the occupation code.
# NOTE(review): the key is named OCC_CODE in filtred_bls_df and OCCSOC in
# filtred_owes_df, and OCC_CODE values contain a hyphen - confirm that OCCSOC
# uses the same format (or normalize one side) before enabling the merge.
# combined_df = pd.merge(filtred_bls_df, filtred_owes_df, left_on='OCC_CODE', right_on='OCCSOC', how='inner')