<a href="https://colab.research.google.com/github/silvia-denanni/DI_HACKATHON2/blob/main/Job_Market_Insights_Dashboard_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Core imports for ML + data processing + visualization
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: for better text features (job title)
from sklearn.feature_extraction.text import TfidfVectorizer


In [11]:
df = pd.read_excel('/Job Market Insights Dashboard.xlsx')
pd.set_option('display.max_columns', None)
print(df.head())

   industry_id company_specialty    job_id job_benefit_type  \
0           44       real estate    921716          Unknown   
1            0           Unknown   1829192          Unknown   
2           32           Unknown  10998357          Unknown   
3            9  Civil Litigation  23221523           401(k)   
4          122           Unknown  35982263          Unknown   

   job_benefit_mention        industry_type            company_name  \
0                    0          Real Estate   Corcoran Sawyer Smith   
1                    0              Unknown                 Unknown   
2                    0          Restaurants  The National Exemplar    
3                    1         Law Practice  Abrams Fensterman, LLP   
4                    0  Facilities Services                 Unknown   

                                               title  \
0                              Marketing Coordinator   
1                  Mental Health Therapist/Counselor   
2                        A

In [13]:
print(f"Shape: {df.shape}")

Shape: (36059, 34)


In [14]:
print(f"Columns: {df.columns.tolist()}")

Columns: ['industry_id', 'company_specialty', 'job_id', 'job_benefit_type', 'job_benefit_mention', 'industry_type', ' company_name', 'title', 'description', 'job_area', 'pay_period', 'location', 'state', 'city', 'views', 'max_salary', 'med_salary', 'min_salary', 'normalized_salary', 'formatted_work_type', 'applies', 'original_listed_time', 'remote_allowed', 'job_posting_url', 'application_url', 'application_type', 'expiry_time', 'seniority', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'fips']


In [16]:
df['application_url'] = df['application_url'].fillna('Unknown')

In [17]:
df['description'] = df['description'].fillna('Unknown')

In [18]:
df.isnull().sum()

Unnamed: 0,0
industry_id,0
company_specialty,0
job_id,0
job_benefit_type,0
job_benefit_mention,0
industry_type,0
company_name,0
title,0
description,0
job_area,0


In [20]:
df.describe()

Unnamed: 0,industry_id,job_id,job_benefit_mention,views,max_salary,med_salary,min_salary,normalized_salary,applies,original_listed_time,remote_allowed,expiry_time,sponsored,fips
count,36059.0,36059.0,36059.0,36059.0,36059.0,36059.0,36059.0,36059.0,36059.0,36059,36059.0,36059,36059.0,36059.0
mean,186.628886,3895229000.0,0.292271,19.385147,75963.04,3834.21872,53631.24,205406.8,2.89556,2024-04-15 08:23:46.495465984,0.134613,2024-05-23 02:53:55.075847680,0.0,20041.817965
min,0.0,921716.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2024-01-26 23:00:00,0.0,2024-04-12 19:26:40,0.0,0.0
25%,17.0,3894896000.0,0.0,3.0,24.0,0.0,18.95,52000.0,0.0,2024-04-11 21:13:20,0.0,2024-05-12 02:13:20,0.0,4013.0
50%,45.0,3901980000.0,0.0,5.0,55000.0,0.0,40000.0,81500.0,0.0,2024-04-18 00:26:40,0.0,2024-05-18 16:33:20,0.0,13121.0
75%,96.0,3904571000.0,1.0,10.0,125000.0,0.0,90000.0,125000.0,0.0,2024-04-18 22:40:00,0.0,2024-05-19 03:40:00,0.0,36061.0
max,3252.0,3906267000.0,1.0,9975.0,120000000.0,750000.0,85000000.0,535600000.0,967.0,2024-04-19 23:40:00,1.0,2024-10-16 23:06:40,0.0,56045.0
std,568.323592,100864600.0,0.454812,122.748733,638238.8,23350.005793,451495.2,5098615.0,17.129164,,0.341314,,0.0,18385.129232


In [34]:
#TOP 5 INDUSTRIES PER SALARY (MEAN)
most_paid_industries = df.groupby('industry_type')['normalized_salary'].mean().round(2).sort_values(ascending=False)
most_paid_industries.head()

Unnamed: 0_level_0,normalized_salary
industry_type,Unnamed: 1_level_1
Oil and Gas,3248050.37
Computer Hardware Manufacturing,1031463.66
Wholesale,815919.37
Law Practice,576800.5
Staffing and Recruiting,479075.93


In [35]:
#BOTTOM 5 INDUSTRIES PER SALARY (MEAN)
least_paid_industries = df.groupby('industry_type')['normalized_salary'].mean().round(2).sort_values(ascending=False)
least_paid_industries.tail()

Unnamed: 0_level_0,normalized_salary
industry_type,Unnamed: 1_level_1
Glass Product Manufacturing,35000.0
Sports and Recreation Instruction,28790.83
Armed Forces,25137.33
Telephone Call Centers,24965.5
Mattress and Blinds Manufacturing,62.5


In [37]:
# JOB LISTING POSTING TIME MEAN
mean_listed_time = df['original_listed_time'].mean()
print(f"Mean Listed Time: {mean_listed_time}")

Mean Listed Time: 2024-04-15 08:23:46.495465984


In [38]:
# JOB LISTING EXPIRY TIME MEAN
mean_expiry_time = df['expiry_time'].mean()
print(f"Mean Expiry Time: {mean_expiry_time}")


Mean Expiry Time: 2024-05-23 02:53:55.075847680


In [46]:
#MOST AND LEAST APPLIED FOR INDUSTRIES (5)
popular_industries = df.groupby(['industry_id', 'industry_type'])['applies'].count()
popular_industries.sort_values(ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,applies
industry_id,industry_type,Unnamed: 2_level_1
14,Hospitals and Health Care,3610
104,Staffing and Recruiting,2975
96,IT Services and IT Consulting,1780
27,Retail,1669
43,Financial Services,1621
...,...,...
3247,Robot Manufacturing,1
3191,Consumer Electronics,1
3131,Unknown,1
3246,Accessible Architecture and Design,1


In [23]:
df.columns

Index(['industry_id', 'company_specialty', 'job_id', 'job_benefit_type',
       'job_benefit_mention', 'industry_type', ' company_name', 'title',
       'description', 'job_area', 'pay_period', 'location', 'state', 'city',
       'views', 'max_salary', 'med_salary', 'min_salary', 'normalized_salary',
       'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry_time', 'seniority', 'posting_domain',
       'sponsored', 'work_type', 'currency', 'compensation_type', 'fips'],
      dtype='object')

#Linear regression model N1
**predicting salary** based on job **title** and **state** features


In [47]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np

# Handle potential NaN in 'title' column before vectorization
df['title'] = df['title'].fillna('')
df['state'] = df['state'].fillna('Unknown')

# Encode 'state' column
le = LabelEncoder()
df['state_encoded'] = le.fit_transform(df['state'])

# Vectorize 'title' column
tfidf_vectorizer = TfidfVectorizer(max_features=1000) # Limiting features to avoid high dimensionality
title_features = tfidf_vectorizer.fit_transform(df['title'])

# Convert title features to DataFrame to concatenate
title_features_df = pd.DataFrame(title_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Prepare features (X) and target (y)
X = pd.concat([df[['state_encoded']].reset_index(drop=True), title_features_df], axis=1)
y = df['normalized_salary']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (28847, 1001)
Shape of X_test: (7212, 1001)
Shape of y_train: (28847,)
Shape of y_test: (7212,)


In [48]:
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = linear_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Linear Regression Model Performance:")
print(f"R-squared (R2) score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Linear Regression Model Performance:
R-squared (R2) score: -0.1002
Mean Absolute Error (MAE): 454040.84
Mean Squared Error (MSE): 13248365378233.95
Root Mean Squared Error (RMSE): 3639830.41


The Linear Regression model has been trained and evaluated. The R-squared score, Mean Absolute Error (MAE), Mean Squared Error (MSE), and Root Mean Squared Error (RMSE) provide insights into how well the model predicts `normalized_salary` based on the given features. A low R-squared and high error values might indicate that these features alone are not strong predictors for salary, or that a more complex model might be needed.

# Random Forest Regressor model
 to ***compare*** with **Linear Regression Model N1**

In [49]:
#Initialize and train a RandomForestRegressor model using the same
#training data (X_train, y_train) that was used for the Linear Regression
# model

from sklearn.ensemble import RandomForestRegressor

# Initialize the RandomForestRegressor model
# Setting random_state for reproducibility
rf_model = RandomForestRegressor(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

print("RandomForestRegressor model trained successfully.")

RandomForestRegressor model trained successfully.


**Predictions after training**



In [50]:
y_pred_rf = rf_model.predict(X_test)

r2_rf = r2_score(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print(f"RandomForestRegressor Model Performance:")
print(f"R-squared (R2) score: {r2_rf:.4f}")
print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}")

RandomForestRegressor Model Performance:
R-squared (R2) score: -1.7222
Mean Absolute Error (MAE): 236885.04
Mean Squared Error (MSE): 32781168099678.59
Root Mean Squared Error (RMSE): 5725484.09


## Compare Model Performance
Display the performance metrics (R-squared, MAE, MSE, RMSE) for both the Linear Regression and Random Forest Regressor models side-by-side for easy comparison


In [51]:
print("\n--- Model Performance Comparison ---\n")

print("Linear Regression Model:")
print(f"  R-squared (R2) score: {r2:.4f}")
print(f"  Mean Absolute Error (MAE): {mae:.2f}")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")

print("\nRandom Forest Regressor Model:")
print(f"  R-squared (R2) score: {r2_rf:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"  Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_rf:.2f}")


--- Model Performance Comparison ---

Linear Regression Model:
  R-squared (R2) score: -0.1002
  Mean Absolute Error (MAE): 454040.84
  Mean Squared Error (MSE): 13248365378233.95
  Root Mean Squared Error (RMSE): 3639830.41

Random Forest Regressor Model:
  R-squared (R2) score: -1.7222
  Mean Absolute Error (MAE): 236885.04
  Mean Squared Error (MSE): 32781168099678.59
  Root Mean Squared Error (RMSE): 5725484.09


## Summary:

### Q&A
Based on the evaluation metrics, the Linear Regression model performed better than the Random Forest Regressor model. The Random Forest Regressor model showed significantly poorer performance with a highly negative R-squared score and larger MSE/RMSE values, indicating it struggled to fit the data.

### Data Analysis Key Findings
*   The Random Forest Regressor model achieved an R-squared (\R^2\ ) score of -1.7222, a Mean Absolute Error (MAE) of 236,885.04, a Mean Squared Error (MSE) of 32,781,168,099,678.59, and a Root Mean Squared Error (RMSE) of 5,725,484.09.
*   In comparison, the Linear Regression model resulted in an R-squared (\R^2\ ) score of -0.1002, a Mean Absolute Error (MAE) of 454,040.84, a Mean Squared Error (MSE) of 13,248,365,378,233.95, and a Root Mean Squared Error (RMSE) of 3,639,830.41.
*   Both models exhibited negative R-squared values, indicating they performed worse than simply predicting the mean of the target variable.
*   The Random Forest Regressor had a lower MAE but substantially higher MSE and RMSE compared to the Linear Regression model, suggesting that while its average absolute error might be smaller, it makes larger errors on some predictions, leading to a much worse overall fit as reflected by its R-squared score.

### Insights or Next Steps
*   Both models show very poor performance as indicated by negative R-squared values, suggesting that the current features are not sufficient to predict normalized salary effectively, or the dataset has characteristics that make these models unsuitable without further preprocessing or feature engineering.
*   Further investigation is needed into the data distribution, outliers, feature scaling, and potential feature engineering (e.g., polynomial features, interaction terms) for both models. Additionally, exploring other regression algorithms or hyperparameter tuning for the Random Forest Regressor could be beneficial.


# Linear Regression model N2
using **state**, **industry_type**, and **formatted_work_type** as features to **predict salary**

In [52]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np

# Ensure no NaN values in the selected columns before encoding
df['state'] = df['state'].fillna('Unknown')
df['industry_type'] = df['industry_type'].fillna('Unknown')
df['formatted_work_type'] = df['formatted_work_type'].fillna('Unknown')

# Initialize LabelEncoders
le_state = LabelEncoder()
le_industry_type = LabelEncoder()
le_work_type = LabelEncoder()

# Encode categorical features
df['state_encoded'] = le_state.fit_transform(df['state'])
df['industry_type_encoded'] = le_industry_type.fit_transform(df['industry_type'])
df['formatted_work_type_encoded'] = le_work_type.fit_transform(df['formatted_work_type'])

# Prepare features (X) and target (y)
X_new = df[['state_encoded', 'industry_type_encoded', 'formatted_work_type_encoded']]
y_new = df['normalized_salary']

# Split the data into training and testing sets
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.2, random_state=42)

print(f"Shape of new X_train: {X_train_new.shape}")
print(f"Shape of new X_test: {X_test_new.shape}")
print(f"Shape of new y_train: {y_train_new.shape}")
print(f"Shape of new y_test: {y_test_new.shape}")

Shape of new X_train: (28847, 3)
Shape of new X_test: (7212, 3)
Shape of new y_train: (28847,)
Shape of new y_test: (7212,)


In [53]:
# Initialize and train the Linear Regression model with new features
linear_model_new = LinearRegression()
linear_model_new.fit(X_train_new, y_train_new)

# Make predictions on the test set
y_pred_new = linear_model_new.predict(X_test_new)

# Evaluate the new model
r2_new = r2_score(y_test_new, y_pred_new)
mae_new = mean_absolute_error(y_test_new, y_pred_new)
mse_new = mean_squared_error(y_test_new, y_pred_new)
rmse_new = np.sqrt(mse_new)

print(f"Linear Regression Model Performance (New Features):")
print(f"R-squared (R2) score: {r2_new:.4f}")
print(f"Mean Absolute Error (MAE): {mae_new:.2f}")
print(f"Mean Squared Error (MSE): {mse_new:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_new:.2f}")

Linear Regression Model Performance (New Features):
R-squared (R2) score: -0.0005
Mean Absolute Error (MAE): 210171.72
Mean Squared Error (MSE): 12048157499817.74
Root Mean Squared Error (RMSE): 3471045.59


 This model, while still having a negative R-squared (-0.0005), showed a **notable improvement in Mean Absolute Error (MAE: 210171.72)** compared to the first Linear Regression model (MAE: 454040.84).

#More advanced regression model: Gradient Boosting

Train a **GradientBoostingRegressor model** using the preprocessed data from the **Linear Regression N2** (X_train_new, y_train_new) that includes 'state', 'industry_type', and 'formatted_work_type' as features.

In [54]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the GradientBoostingRegressor model
# Setting random_state for reproducibility
gbr_model = GradientBoostingRegressor(random_state=42)

# Train the model
gbr_model.fit(X_train_new, y_train_new)

print("GradientBoostingRegressor model trained successfully.")

GradientBoostingRegressor model trained successfully.


**Reasoning**:
Now that the GradientBoostingRegressor model is trained, I need to make predictions on the test set (`X_test_new`) and evaluate its performance using R-squared, Mean Absolute Error (MAE), Mean Squared Error (MSE), and Root Mean Squared Error (RMSE).



In [55]:
y_pred_gbr = gbr_model.predict(X_test_new)

r2_gbr = r2_score(y_test_new, y_pred_gbr)
mae_gbr = mean_absolute_error(y_test_new, y_pred_gbr)
mse_gbr = mean_squared_error(y_test_new, y_pred_gbr)
rmse_gbr = np.sqrt(mse_gbr)

print(f"GradientBoostingRegressor Model Performance (New Features):")
print(f"R-squared (R2) score: {r2_gbr:.4f}")
print(f"Mean Absolute Error (MAE): {mae_gbr:.2f}")
print(f"Mean Squared Error (MSE): {mse_gbr:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_gbr:.2f}")

GradientBoostingRegressor Model Performance (New Features):
R-squared (R2) score: -0.0036
Mean Absolute Error (MAE): 237547.82
Mean Squared Error (MSE): 12085097821973.91
Root Mean Squared Error (RMSE): 3476362.73


 # Linear Regression model N2 VS Gradient Boosting Regressor model performance metrics comparison



In [56]:
print("\n--- Model Performance Comparison (New Features) ---\n")

print("Linear Regression Model (New Features):")
print(f"  R-squared (R2) score: {r2_new:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_new:.2f}")
print(f"  Mean Squared Error (MSE): {mse_new:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_new:.2f}")

print("\nGradient Boosting Regressor Model (New Features):")
print(f"  R-squared (R2) score: {r2_gbr:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_gbr:.2f}")
print(f"  Mean Squared Error (MSE): {mse_gbr:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_gbr:.2f}")


--- Model Performance Comparison (New Features) ---

Linear Regression Model (New Features):
  R-squared (R2) score: -0.0005
  Mean Absolute Error (MAE): 210171.72
  Mean Squared Error (MSE): 12048157499817.74
  Root Mean Squared Error (RMSE): 3471045.59

Gradient Boosting Regressor Model (New Features):
  R-squared (R2) score: -0.0036
  Mean Absolute Error (MAE): 237547.82
  Mean Squared Error (MSE): 12085097821973.91
  Root Mean Squared Error (RMSE): 3476362.73


## Summary:

**Which model performed better between Linear Regression (with new features) and Gradient Boosting Regressor?**
The Linear Regression model performed marginally better than the Gradient Boosting Regressor for this specific set of features. It had a slightly higher R-squared score (-0.0005 vs -0.0036) and a lower Mean Absolute Error (210,171.72 vs 237,547.82).

**What insights can be provided into the overall predictive power of the models with the chosen features?**
Both models exhibited very poor predictive power, as indicated by their negative R-squared scores. This suggests that neither model could explain the variance in the target variable (`normalized_salary`) better than simply predicting the mean of the salary. The chosen features (`state`, `industry_type`, and `formatted_work_type`) are not effective for accurately predicting `normalized_salary` with these models.

### Data Analysis Key Findings
*   Both the Linear Regression and Gradient Boosting Regressor models, when trained with 'state', 'industry_type', and 'formatted_work_type' features, resulted in negative R-squared scores (Linear Regression: -0.0005; Gradient Boosting Regressor: -0.0036). This indicates that neither model performs better than a simple baseline model that predicts the mean of the target variable.
*   The Linear Regression model showed slightly better performance with a Mean Absolute Error (MAE) of 210,171.72 compared to the Gradient Boosting Regressor's MAE of 237,547.82.
*   The Mean Squared Error (MSE) and Root Mean Squared Error (RMSE) also mirrored this trend, with Linear Regression having slightly lower values (MSE: 12,048,157,499,817.74, RMSE: 3,471,045.59) compared to Gradient Boosting Regressor (MSE: 12,085,097,821,973.91, RMSE: 3,476,362.73).

### Insights or Next Steps
*   The current set of features (`state`, `industry_type`, `formatted_work_type`) and the applied regression models are insufficient for accurate `normalized_salary` prediction. Further feature engineering, exploration of non-linear relationships, outlier handling, or a more sophisticated modeling approach is required.
*   Investigate other potential features that might have a stronger correlation with salary, and consider exploring advanced machine learning models or deep learning techniques that can capture complex patterns in the data.


# Task
One-Hot Encode the 'state', 'industry_type', and 'formatted_work_type' columns of the DataFrame `df`.

## One-Hot Encode Categorical Features

### Subtask:
Apply One-Hot Encoding to the 'state', 'industry_type', and 'formatted_work_type' columns.


**Reasoning**:
To apply One-Hot Encoding as per the instructions, I will use `pd.get_dummies()` on the specified columns (`state`, `industry_type`, `formatted_work_type`) and store the result in a new DataFrame `df_encoded`.



In [57]:
df_encoded = pd.get_dummies(df, columns=['state', 'industry_type', 'formatted_work_type'], drop_first=True)
print("Shape of the new DataFrame after one-hot encoding:", df_encoded.shape)
print("First 5 rows of the new DataFrame with encoded features:")
print(df_encoded.head())

Shape of the new DataFrame after one-hot encoding: (36059, 434)
First 5 rows of the new DataFrame with encoded features:
   industry_id company_specialty    job_id job_benefit_type  \
0           44       real estate    921716          Unknown   
1            0           Unknown   1829192          Unknown   
2           32           Unknown  10998357          Unknown   
3            9  Civil Litigation  23221523           401(k)   
4          122           Unknown  35982263          Unknown   

   job_benefit_mention            company_name  \
0                    0   Corcoran Sawyer Smith   
1                    0                 Unknown   
2                    0  The National Exemplar    
3                    1  Abrams Fensterman, LLP   
4                    0                 Unknown   

                                               title  \
0                              Marketing Coordinator   
1                  Mental Health Therapist/Counselor   
2                        Assita

## Prepare Data for Modeling (One-Hot Encoded)

Combine the one-hot encoded features with the target variable ('normalized_salary') and split the dataset into training and testing sets.

This involves dropping irrelevant columns and ensuring all features are numerical after one-hot encoding.

In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Define the features X_onehot by dropping specified columns
columns_to_drop = [
    'state', 'industry_type', 'formatted_work_type', # Original categorical columns
    'state_encoded', 'industry_type_encoded', 'formatted_work_type_encoded', # LabelEncoded columns
    'job_id', 'description', 'title', 'company_name', 'pay_period', 'location',
    'city', 'job_posting_url', 'application_url', 'original_listed_time',
    'expiry_time', 'currency', 'compensation_type', 'company_specialty',
    'job_benefit_type', 'job_area', 'seniority', 'posting_domain', 'work_type',
    'application_type', 'normalized_salary'
]

X_onehot = df_encoded.drop(columns=columns_to_drop, errors='ignore')

# 2. Define the target variable y_onehot
y_onehot = df_encoded['normalized_salary']

# 3. Split the X_onehot and y_onehot into training and testing sets
X_train_onehot, X_test_onehot, y_train_onehot, y_test_onehot = train_test_split(
    X_onehot, y_onehot, test_size=0.2, random_state=42
)

# 4. Print the shapes of the training and testing sets
print(f"Shape of X_train_onehot: {X_train_onehot.shape}")
print(f"Shape of X_test_onehot: {X_test_onehot.shape}")
print(f"Shape of y_train_onehot: {y_train_onehot.shape}")
print(f"Shape of y_test_onehot: {y_test_onehot.shape}")

Shape of X_train_onehot: (28847, 411)
Shape of X_test_onehot: (7212, 411)
Shape of y_train_onehot: (28847,)
Shape of y_test_onehot: (7212,)


## Train Linear Regression Model (One-Hot Encoded Features)
Initialize and train a Linear Regression model using the data with one-hot encoded features. This will allow us to assess if a more appropriate encoding of categorical data can improve model performance, even with a linear model.


In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Define the target variable
y_onehot = df_encoded['normalized_salary']

# Identify all object (string) columns in the df_encoded DataFrame, excluding the target
object_columns = df_encoded.select_dtypes(include='object').columns.tolist()

# The original categorical columns from which one-hot encoded features were created need to be removed,
# as well as the label-encoded columns if they are still present and not needed.
# Additionally, other non-feature object columns like job_id, description, title, etc., must be dropped.
columns_to_remove_from_features = [
    'state', 'industry_type', 'formatted_work_type', # Original categorical columns
    'state_encoded', 'industry_type_encoded', 'formatted_work_type_encoded', # LabelEncoded columns
    'job_id', 'description', 'title', ' company_name', 'pay_period', 'location',
    'city', 'job_posting_url', 'application_url', 'original_listed_time',
    'expiry_time', 'currency', 'compensation_type', 'company_specialty',
    'job_benefit_type', 'job_area', 'seniority', 'posting_domain', 'work_type',
    'application_type',
    'normalized_salary' # Target variable, so exclude from features
] + [col for col in object_columns if col not in ['state', 'industry_type', 'formatted_work_type']]

# Drop redundant/non-numeric columns to create X_onehot
X_onehot = df_encoded.drop(columns=columns_to_remove_from_features, errors='ignore')

# Ensure all remaining columns in X_onehot are numeric. This step converts any remaining non-numeric
# columns that might have slipped through (e.g., if get_dummies didn't apply to a column by error)
# into a numeric representation, often NaN, which then needs to be handled (e.g., filled).
# For this context, we ensure columns are numeric for the model, which will raise an error if mixed types exist.
# A more robust solution might convert relevant columns to numeric with errors='coerce' and then impute NaNs.
# However, given the ValueError 'Intrado', the issue is primarily string columns.

# Verify that all columns in X_onehot are indeed numeric (or boolean which can be cast to int/float)
for col in X_onehot.columns:
    if X_onehot[col].dtype == 'object':
        # If any object columns remain, they need explicit handling or removal
        # For now, we expect them all to be handled by the columns_to_remove_from_features list
        print(f"Warning: Object column '{col}' still present in X_onehot. Values: {X_onehot[col].unique()}")

# Split the X_onehot and y_onehot into training and testing sets
X_train_onehot, X_test_onehot, y_train_onehot, y_test_onehot = train_test_split(
    X_onehot, y_onehot, test_size=0.2, random_state=42
)

# Print the shapes of the training and testing sets
print(f"Shape of X_train_onehot: {X_train_onehot.shape}")
print(f"Shape of X_test_onehot: {X_test_onehot.shape}")
print(f"Shape of y_train_onehot: {y_train_onehot.shape}")
print(f"Shape of y_test_onehot: {y_test_onehot.shape}")


Shape of X_train_onehot: (28847, 410)
Shape of X_test_onehot: (7212, 410)
Shape of y_train_onehot: (28847,)
Shape of y_test_onehot: (7212,)


# Train the Linear Regression model with the one-hot encoded features`



In [61]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
linear_model_onehot = LinearRegression()

# Train the model using the one-hot encoded training data
linear_model_onehot.fit(X_train_onehot, y_train_onehot)

print("Linear Regression model trained successfully with one-hot encoded features.")

Linear Regression model trained successfully with one-hot encoded features.


#Model performance evaluation on the test set

This involves making predictions and calculating various metrics such as R-squared, MAE, MSE, and RMSE.



In [62]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Make predictions on the test set using the trained model
y_pred_onehot = linear_model_onehot.predict(X_test_onehot)

# Evaluate the model
r2_onehot = r2_score(y_test_onehot, y_pred_onehot)
mae_onehot = mean_absolute_error(y_test_onehot, y_pred_onehot)
mse_onehot = mean_squared_error(y_test_onehot, y_pred_onehot)
rmse_onehot = np.sqrt(mse_onehot)

print(f"Linear Regression Model Performance (One-Hot Encoded Features):")
print(f"R-squared (R2) score: {r2_onehot:.4f}")
print(f"Mean Absolute Error (MAE): {mae_onehot:.2f}")
print(f"Mean Squared Error (MSE): {mse_onehot:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_onehot:.2f}")

Linear Regression Model Performance (One-Hot Encoded Features):
R-squared (R2) score: -0.0080
Mean Absolute Error (MAE): 311121.35
Mean Squared Error (MSE): 12138582792714.40
Root Mean Squared Error (RMSE): 3484046.90


## Compare model performance

**One-Hot Encoded Linear Regression** VS Previous - **Linear Regression (N1) and Gradient Boosting Regressor (N2)** - models


**Reasoning**:
To compare the performance metrics of the Linear Regression model with one-hot encoded features against the previous Linear Regression (N1) and Gradient Boosting Regressor (N2) models, I will print their evaluation metrics side-by-side.



In [63]:
print("\n--- Overall Model Performance Comparison ---\n")

print("Linear Regression Model (Original Features - N1):")
print(f"  R-squared (R2) score: {r2:.4f}")
print(f"  Mean Absolute Error (MAE): {mae:.2f}")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")

print("\nRandom Forest Regressor Model (Original Features - N1):")
print(f"  R-squared (R2) score: {r2_rf:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"  Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_rf:.2f}")

print("\nLinear Regression Model (Selected Features - N2):")
print(f"  R-squared (R2) score: {r2_new:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_new:.2f}")
print(f"  Mean Squared Error (MSE): {mse_new:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_new:.2f}")

print("\nGradient Boosting Regressor Model (Selected Features - N2):")
print(f"  R-squared (R2) score: {r2_gbr:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_gbr:.2f}")
print(f"  Mean Squared Error (MSE): {mse_gbr:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_gbr:.2f}")

print("\nLinear Regression Model (One-Hot Encoded Features):")
print(f"  R-squared (R2) score: {r2_onehot:.4f}")
print(f"  Mean Absolute Error (MAE): {mae_onehot:.2f}")
print(f"  Mean Squared Error (MSE): {mse_onehot:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse_onehot:.2f}")


--- Overall Model Performance Comparison ---

Linear Regression Model (Original Features - N1):
  R-squared (R2) score: -0.1002
  Mean Absolute Error (MAE): 454040.84
  Mean Squared Error (MSE): 13248365378233.95
  Root Mean Squared Error (RMSE): 3639830.41

Random Forest Regressor Model (Original Features - N1):
  R-squared (R2) score: -1.7222
  Mean Absolute Error (MAE): 236885.04
  Mean Squared Error (MSE): 32781168099678.59
  Root Mean Squared Error (RMSE): 5725484.09

Linear Regression Model (Selected Features - N2):
  R-squared (R2) score: -0.0005
  Mean Absolute Error (MAE): 210171.72
  Mean Squared Error (MSE): 12048157499817.74
  Root Mean Squared Error (RMSE): 3471045.59

Gradient Boosting Regressor Model (Selected Features - N2):
  R-squared (R2) score: -0.0036
  Mean Absolute Error (MAE): 237547.82
  Mean Squared Error (MSE): 12085097821973.91
  Root Mean Squared Error (RMSE): 3476362.73

Linear Regression Model (One-Hot Encoded Features):
  R-squared (R2) score: -0.0080
 

## Summary:

### Q&A
Using One-Hot Encoding for categorical features (`state`, `industry_type`, and `formatted_work_type`) did not improve the Linear Regression model's performance. The model still exhibited very poor predictive power, with an R-squared score of -0.0080, indicating it performed worse than a simple horizontal line at the mean and could not capture the non-linear relationships in the data effectively.

### Data Analysis Key Findings
*   One-Hot Encoding was successfully applied to the specified categorical features, resulting in a `df_encoded` DataFrame with 434 columns.
*   The data was split into training (28,847 samples) and testing (7,212 samples) sets, with 411 features after preparing the one-hot encoded data.
*   Initial attempts to train the Linear Regression model on one-hot encoded features failed due to residual non-numeric string columns, which were subsequently identified and removed.
*   After resolving the data preparation issues, the Linear Regression model trained with one-hot encoded features yielded extremely poor performance:
    *   R-squared (R2) score: -0.0080
    *   Mean Absolute Error (MAE): \$311,121.35
    *   Mean Squared Error (MSE): \$12,138,582,792,714.40
    *   Root Mean Squared Error (RMSE): \$3,484,046.90
*   This performance is comparable to other previous linear models and the Gradient Boosting Regressor, all of which showed R-squared scores near or below zero, confirming that these models struggle with the dataset.

### Insights or Next Steps
*   The continued poor performance across various linear models, even with one-hot encoding, suggests that the underlying relationships in the data might be highly complex and non-linear, or that critical features are still missing or incorrectly represented.
*   Consider exploring more advanced non-linear models (e.g., neural networks) or more sophisticated feature engineering techniques beyond basic encoding, such as interaction terms or polynomial features, to better capture the complexities within the dataset.
