In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt

## Load Data

In [4]:
# Read the cleaned dataset from disk into `df`.
cleaned_csv_path = 'Data/output/df_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=cleaned_csv_path)

In [6]:
# Print a summary of the loaded frame: row count, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785692 entries, 0 to 785691
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   job_title_short        785692 non-null  object 
 1   job_title              785692 non-null  object 
 2   job_location           785692 non-null  object 
 3   job_via                785692 non-null  object 
 4   job_schedule_type      785692 non-null  object 
 5   job_work_from_home     785692 non-null  bool   
 6   search_location        785692 non-null  object 
 7   job_posted_date        785692 non-null  object 
 8   job_no_degree_mention  785692 non-null  bool   
 9   job_health_insurance   785692 non-null  bool   
 10  job_country            785692 non-null  object 
 11  salary_year_avg        22003 non-null   float64
 12  company_name           785675 non-null  object 
 13  job_skills             785692 non-null  object 
 14  job_type_skills        785692 non-nu

In [5]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

job_title_short               0
job_title                     0
job_location                  0
job_via                       0
job_schedule_type             0
job_work_from_home            0
search_location               0
job_posted_date               0
job_no_degree_mention         0
job_health_insurance          0
job_country                   0
salary_year_avg          763689
company_name                 17
job_skills                    0
job_type_skills               0
num_jobs                      0
latitude                      0
longitude                     0
dtype: int64

In [20]:
# Display the column labels of the loaded frame.
df.columns

Index(['job_title_short', 'job_title', 'job_location', 'job_via',
       'job_schedule_type', 'job_work_from_home', 'search_location',
       'job_posted_date', 'job_no_degree_mention', 'job_health_insurance',
       'job_country', 'salary_year_avg', 'company_name', 'job_skills',
       'job_type_skills', 'num_jobs', 'latitude', 'longitude'],
      dtype='object')

In [21]:
pip install streamlit pandas prophet matplotlib


Note: you may need to restart the kernel to use updated packages.


In [22]:
# Display the column labels of the frame.
# NOTE(review): this repeats the earlier `df.columns` cell and nothing has modified
# `df` in between - candidate for removal.
df.columns

Index(['job_title_short', 'job_title', 'job_location', 'job_via',
       'job_schedule_type', 'job_work_from_home', 'search_location',
       'job_posted_date', 'job_no_degree_mention', 'job_health_insurance',
       'job_country', 'salary_year_avg', 'company_name', 'job_skills',
       'job_type_skills', 'num_jobs', 'latitude', 'longitude'],
      dtype='object')

In [23]:
import logging

import pandas as pd
from prophet import Prophet

# One Prophet model is fitted per group below; silence cmdstanpy so its
# per-fit log lines do not flood the cell output.
logging.getLogger('cmdstanpy').setLevel(logging.CRITICAL)

# Load your data
# df = pd.read_csv('job_data.csv')

# Parse the posting date and derive a 'YYYY-MM' month label.
# Both columns are written back onto `df`, exactly as before, so later cells
# that rely on them keep working.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['month'] = df['job_posted_date'].dt.to_period('M').astype(str)

# Columns that identify one time series (job type + location).
group_keys = ['job_title_short', 'job_country', 'latitude', 'longitude']

# Monthly job totals per series, renamed to the `ds` / `y` columns Prophet expects.
aggregated_df = (
    df.groupby(['month'] + group_keys)
    .agg({'num_jobs': 'sum'})
    .reset_index()
    .rename(columns={'month': 'ds', 'num_jobs': 'y'})
)
# A 'YYYY-MM' string parses to the first day of that month, so the history is
# dated at month *start*.
aggregated_df['ds'] = pd.to_datetime(aggregated_df['ds'])

# Per-series forecasts are collected here and concatenated once after the loop.
predictions = []

# Number of series skipped for lack of data, and the cap on how many
# individual "Skipping ..." lines are printed.
skipped_groups = 0
max_skipped_messages = 10

# Train one model per job type and location.
for (job_title, job_country, lat, lon), group in aggregated_df.groupby(group_keys):
    # Skip series with fewer than two non-null monthly totals.
    if group['y'].notnull().sum() < 2:
        if skipped_groups < max_skipped_messages:
            print(f"Skipping {job_title} in {job_country} at ({lat}, {lon}) due to insufficient data.")
        elif skipped_groups == max_skipped_messages:
            print("Skipping additional groups due to insufficient data...")
        skipped_groups += 1
        continue

    # A fresh model per series; only the two columns Prophet uses are passed in.
    model = Prophet()
    model.fit(group[['ds', 'y']])

    # Twelve months ahead on the same month-start grid as the history ('MS').
    # With month-end dates the first "future" point would fall inside the last
    # observed month. `include_history=False` keeps only genuinely future rows,
    # so fitted values for past months are not appended next to the actuals.
    future = model.make_future_dataframe(periods=12, freq='MS', include_history=False)

    # Keep the point forecast and tag it with the series identifiers.
    forecast = model.predict(future)[['ds', 'yhat']].assign(
        job_title_short=job_title,
        job_country=job_country,
        latitude=lat,
        longitude=lon,
    )
    predictions.append(forecast)

# Historical actuals, with the value column named like the forecast column so
# the combined output has a single `num_jobs` column.
history_df = aggregated_df.rename(columns={'y': 'num_jobs'}).assign(is_forecast=False)
frames = [history_df]

if predictions:
    # Combine all forecasts, rename the predicted column, and clamp to
    # non-negative whole numbers (negative -> 0, otherwise truncate).
    predicted_df = pd.concat(predictions, ignore_index=True).rename(columns={'yhat': 'num_jobs'})
    predicted_df['num_jobs'] = predicted_df['num_jobs'].clip(lower=0).astype(int)
    frames.append(predicted_df.assign(is_forecast=True))
else:
    # Every series was skipped: keep `predicted_df` defined (and empty) instead
    # of letting pd.concat raise on an empty list.
    predicted_df = pd.DataFrame(columns=['ds', 'num_jobs'] + group_keys)

# History followed by forecasts; `is_forecast` tells the two apart.
combined_df = pd.concat(frames, ignore_index=True)

# Save the combined DataFrame to a CSV file or use it in your Streamlit app
combined_df.to_csv('combined_predictions.csv', index=False)

print(f"Forecast {len(predictions)} series; skipped {skipped_groups} with insufficient data.")
print("Processing completed.")


Skipping Business Analyst in Angola at (-12.5, 18.5) due to insufficient data.
Skipping Business Analyst in Bolivia at (-17.0, -65.0) due to insufficient data.
Skipping Business Analyst in Djibouti at (11.833333333333334, 42.5) due to insufficient data.
Skipping Business Analyst in Laos at (18.0, 105.0) due to insufficient data.
Skipping Business Analyst in Mali at (18.0, -2.0) due to insufficient data.
Skipping Business Analyst in Montenegro at (42.75, 19.25) due to insufficient data.
Skipping Business Analyst in Somalia at (6.0, 48.0) due to insufficient data.
Skipping Cloud Engineer in Albania at (41.0, 20.0) due to insufficient data.
Skipping Cloud Engineer in Bolivia at (-17.0, -65.0) due to insufficient data.
Skipping Cloud Engineer in Burkina Faso at (12.5, -1.6666666666666663) due to insufficient data.
Skipping additional groups due to insufficient data...
Processing completed.
