## Load dependencies and dataset

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error

import os

# Location of the cleaned dataset CSV. Setting the CLEANED_DATA_CSV environment
# variable overrides the path so the notebook can run on a machine with a
# different directory layout; when it is unset, the original hard-coded path is
# used unchanged.
file_path = os.environ.get("CLEANED_DATA_CSV", "\\cleaned_data.csv")
data = pd.read_csv(file_path)

# Quick look at the first rows and at the available column names.
print(data.head())
print(data.columns)

  date_sourced  company_name  \
0   2024-01-01  PRIOjet GmbH   
1   2024-01-01    Bettermile   
2   2024-01-01     DOSarrest   
3   2024-01-02      Europace   
4   2024-01-02     ticketbro   

                                            position  \
0                       software Developer fullstack   
1           fullstack Engineer - kotlin, vue.js, aws   
2                            devops Engineer cdn dns   
3                             devops Engineer remote   
4  fullstack Engineer - react native, react.js, a...   

                                     job_description          location  \
0  Hi! Were happy that youre here \r\n\r\n PRIOje...   Frankfurt am Ma   
1  At the moment we only proceed with candidates ...  Berlin-Kreuzberg   
2  A multi-billion IT security market and Link11 ...   Frankfurt am Ma   
3  At Europace, we have been enabling people to m...            Berlin   
4  INTRO\r\nHere at ticketbro we bring the smarte...            Berlin   

  contract_type language 

In [17]:
# Count how many rows share each distinct `position` value; the group sizes
# land in a `count` column next to the position label.
aggregated_data = (
    data
    .groupby('position')
    .size()
    .reset_index(name='count')
)

print(aggregated_data.head())

                                        position  count
0           " software Developer ecm dms w m d "      1
1          "backend Developer*in ruby on rails "      1
2                       "java software Developer      1
3                  #1 founding frontend Engineer      3
4  #englishspeaking! fullstack Developer gen. ai      1


## Target encode position column for easier handling 

In [18]:
# Encoder configuration: only the `position` column is target-encoded.
categorical_columns = ['position']
encoder = TargetEncoder(cols=categorical_columns)

# Keep the untouched position labels so predictions can be mapped back to
# readable titles later on.
original_positions = aggregated_data['position']

# Replace each position label with a number derived from the `count` target.
# NOTE(review): the encoder is fitted on every row (no train/test split has
# happened at this point) with `count` as the target — this looks like target
# leakage if `count` is also what gets predicted downstream; confirm before
# trusting any error metric computed from `encoded_data`.
encoded_data = encoder.fit_transform(X=aggregated_data, y=aggregated_data['count'])


## Verify all columns and values are numeric

In [19]:
# Report each column's dtype, then flag every column that is not numeric.
print(encoded_data.dtypes)

# Exclude numeric (and boolean) dtypes instead of including only `object`:
# category, string and datetime columns are non-numeric too and would
# otherwise slip through and be reported as "All columns are numeric."
non_numeric_columns = encoded_data.select_dtypes(exclude=['number', 'bool']).columns
if len(non_numeric_columns) > 0:
    for col in non_numeric_columns:
        # Show the distinct values so the offending content is easy to inspect.
        print(f"Non-numeric column '{col}' unique values: {encoded_data[col].unique()}")
else:
    print("All columns are numeric.")

position    float64
count         int64
dtype: object
All columns are numeric.


## Split Data

In [20]:
# Target is the `count` column; every other column is a feature.
y = encoded_data['count']
X = encoded_data.drop(columns='count')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



Random Forest Mean Squared Error: 0.03036734417344173
                                      position  predicted
740                         software Developer     285.72
282                    senior backend Engineer     219.40
830                            devops Engineer     210.88
654                          android Developer     131.58
2613                  senior frontend Engineer     121.34
1600                          backend Engineer     116.58
1643                             web Developer     103.96
653      senior team lead of quality assurance      99.17
2578                           project Manager      73.26
2847                             app Developer      65.98
1412                                hr Manager      61.43
2854                            java Developer      60.45
2746                          mobile Developer      60.15
2278  salesforce crm software Engineering lead      48.11
2724                    frontend web Developer      43.54
512               

## Train RF Model and Create Prediction

In [None]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows with mean squared error.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f"Random Forest Mean Squared Error: {mse}")

# Pair every prediction with its position label: the index labels of the test
# rows are looked up in `original_positions` to recover the readable title.
results = pd.DataFrame({
    'position': pd.Series(X_test.index).map(original_positions),
    'predicted': predictions,
})

# Largest predicted counts first, then keep the first 20 rows.
results_sorted = results.sort_values(by='predicted', ascending=False)
top_20_results = results_sorted.head(20)

print(top_20_results)

Let's explore how a linear regression model performs on the same task.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error

import os

# Load your dataset. Setting the CLEANED_DATA_CSV environment variable
# overrides the machine-specific default path below, so the notebook can run
# outside the original directory layout; when it is unset, the original path is
# used unchanged.
file_path = os.environ.get(
    "CLEANED_DATA_CSV",
    "C:\\Users\\acer\\Documents\\data_analysis_project\\cleaned_data.csv",
)
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Print the column names to verify the presence of 'count'
print(data.columns)

# Ensure 'count' column is present: when the CSV has none, every row is given
# a count of 1.
if 'count' not in data.columns:
    data['count'] = 1


  date_sourced  company_name  \
0   2024-01-01  PRIOjet GmbH   
1   2024-01-01    Bettermile   
2   2024-01-01     DOSarrest   
3   2024-01-02      Europace   
4   2024-01-02     ticketbro   

                                            position  \
0                       software Developer fullstack   
1           fullstack Engineer - kotlin, vue.js, aws   
2                            devops Engineer cdn dns   
3                             devops Engineer remote   
4  fullstack Engineer - react native, react.js, a...   

                                     job_description          location  \
0  Hi! Were happy that youre here \r\n\r\n PRIOje...   Frankfurt am Ma   
1  At the moment we only proceed with candidates ...  Berlin-Kreuzberg   
2  A multi-billion IT security market and Link11 ...   Frankfurt am Ma   
3  At Europace, we have been enabling people to m...            Berlin   
4  INTRO\r\nHere at ticketbro we bring the smarte...            Berlin   

  contract_type language 

In [3]:
# Tally the rows per distinct position and expose the result as a two-column
# frame: the position label and its number of occurrences.
unique_positions_counts = (
    data['position']
    .value_counts()
    .rename_axis('position')
    .reset_index(name='count')
)

print(unique_positions_counts.head())


                    position  count
0        fullstack Developer    581
1         frontend Developer    502
2          software Engineer    338
3         software Developer    286
4  senior frontend Developer    225


In [4]:
# Encoder configuration: only the `position` column is target-encoded.
categorical_columns = ['position']
encoder = TargetEncoder(cols=categorical_columns)

# Keep the untouched position labels for later conversion back to titles.
original_positions = unique_positions_counts['position']

# Replace each position label with a number derived from the `count` target.
# NOTE(review): the encoder is fitted on every row (no train/test split has
# happened at this point) with `count` as the target — this looks like target
# leakage if `count` is also what gets predicted downstream; confirm before
# trusting any error metric computed from `encoded_data`.
encoded_data = encoder.fit_transform(
    X=unique_positions_counts,
    y=unique_positions_counts['count'],
)


In [5]:
# Target (y) is the `count` column; the features (X) are everything else.
y = encoded_data['count']
X = encoded_data.drop(columns='count')

# 70/30 train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [6]:
# NOTE(review): this cell repeats the preceding cell's feature/target split with
# the same arguments and seed — it looks redundant; confirm it can be dropped.
y = encoded_data['count']
X = encoded_data.drop(columns='count')

# Same 70/30 seeded partition as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [7]:
# Fit a LinearRegression model on the training split.
linear_model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows with mean squared error.
# NOTE(review): the recorded output reports an MSE of about 5e-30, i.e. an
# essentially perfect fit — worth checking whether the encoded feature carries
# the target itself.
linear_predictions = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, linear_predictions)
print(f"Linear Regression Mean Squared Error: {linear_mse}")

# Pair every prediction with its position label: the index labels of the test
# rows are looked up in `original_positions` to recover the readable title.
linear_results = pd.DataFrame({
    'position': pd.Series(X_test.index).map(original_positions),
    'predicted': linear_predictions,
})

# Largest predicted counts first, then keep the 20 highest.
linear_results_sorted = linear_results.sort_values(by='predicted', ascending=False)
linear_top_20_results = linear_results_sorted.head(20)

# Print the top 20 results
print(linear_top_20_results)


Linear Regression Mean Squared Error: 5.4643048068967646e-30
                          position  predicted
481            fullstack Developer      581.0
341             software Developer      286.0
1991                 php Developer      195.0
924             senior qa Engineer      148.0
1574            fullstack Engineer      143.0
475                    qa Engineer      130.0
1785      senior frontend Engineer      119.0
1070                 web Developer      111.0
660           senior php Developer       98.0
64               frontend Engineer       75.0
1896               project Manager       72.0
2003                data scientist       67.0
12      quality assurance Engineer       62.0
583                 java Developer       60.0
1586              mobile Developer       59.0
45       backend software Engineer       52.0
2565  fullstack software Developer       50.0
2113        frontend web Developer       44.0
1895                cloud Engineer       43.0
608            mech