# Data Analysis of Layoffs in the United States Tech Industry 

## Project Motivation and Background

As Computer Science students about to enter the job market, we're concerned about the volatility of the tech industry. We want to analyze and create a system that can help people understand the markets, plan an exit strategy, and alleviate these concerns.

## Project Goal:
The goal of our project is to analyze trends in companies' recent layoffs in a variety of industries (aerospace, travel, retail, etc.) and detect patterns and trends.  This will be done by looking at the number of employees laid off, the location of the companies, their stages, and the funds they have raised.




In [1]:
import pandas as pd
import numpy as np

# Read in the data
layoffs = pd.read_csv('layoffs.csv')
layoffs.head()
print(layoffs.columns)
#we have to drop all rows with a blank percentage layed off cell
layoffs.dropna(subset=['percentage_laid_off'], inplace=True)

# one hot encoding for categorical variables 
print(f"Unique values for 'company': {len(layoffs['company'].unique())}")

# Adding dummy variables for location, industry, stage, and country
totalNewCols = len(layoffs['location'].unique()) + len(layoffs['industry'].unique()) + len(layoffs['stage'].unique()) + len(layoffs['country'].unique())
print(f"Total number of new columns: {totalNewCols}")

#loca = pd.get_dummies(layoffs['location'], prefix='location')
indu = pd.get_dummies(layoffs['industry'], prefix='industry')
stag = pd.get_dummies(layoffs['stage'], prefix='stage')
#coun = pd.get_dummies(layoffs['country'], prefix='country')

# drop the original columns
layoffs.drop(['location', 'industry', 'stage', 'country'], axis=1, inplace=True)

# concat the new columns
#layoffs = pd.concat([layoffs, loca, indu, stag, coun], axis=1)
layoffs = pd.concat([layoffs, indu, stag], axis=1)
layoffs = layoffs[layoffs.stage_Unknown != 1]
layoffs = layoffs[layoffs.industry_Other != 1]


layoffs.head()

Index(['company', 'location', 'industry', 'total_laid_off',
       'percentage_laid_off', 'date', 'stage', 'country', 'funds_raised'],
      dtype='object')
Unique values for 'company': 1438
Total number of new columns: 256


Unnamed: 0,company,total_laid_off,percentage_laid_off,date,funds_raised,industry_Aerospace,industry_Construction,industry_Consumer,industry_Crypto,industry_Data,...,stage_Series C,stage_Series D,stage_Series E,stage_Series F,stage_Series G,stage_Series H,stage_Series I,stage_Series J,stage_Subsidiary,stage_Unknown
0,N26,71.0,0.04,2023-04-28,1700.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Vroom,120.0,0.11,2023-04-27,1300.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Greenhouse,100.0,0.12,2023-04-27,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Megaport,50.0,0.16,2023-04-27,98.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Airtasker,45.0,0.2,2023-04-27,26.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [2]:
def categorize_predictions(y_vec):
    """
    Discretize a continuous layoff vector into distinct risk categories.
    """
    # 0.00 - 0.17 = 0 # Lower Risk
    # 0.17 - 0.25 = 1 # Medium Risk
    # 0.25 - 0.37 = 2 # High Risk
    # 0.37 - *    = 3 # Very High Risk
    thresholds = [0.17, 0.25, 0.37]
    shape = y_vec.shape[0], len(thresholds) + 1
    rval = np.zeros(shape)
    for i,t in enumerate(thresholds):
        rval[:, i] = y_vec < t
    rval[:, -1] = y_vec > t
    return pd.DataFrame(rval, columns=['risk_low', 'risk_med', 'risk_high', 'risk_very_high'])

In [3]:
traing_data = layoffs.drop(['company', 'date'], axis=1)

# Split the data into training and testing sets
train_set = traing_data.sample(frac=0.8, random_state=0)
test_set = traing_data.drop(train_set.index)

print (f"Training set shape: {train_set.shape}")
print (f"Testing set shape: {test_set.shape}")

Training set shape: (1074, 48)
Testing set shape: (268, 48)


In [41]:
import tensorflow
tensorflow.config.set_visible_devices([], 'GPU') # GPU is broken rn oops
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from sklearn.svm import SVR
from sklearn.linear_model import ridge_regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from numpy import arange
from sklearn.metrics import r2_score


#### Preprocess
# establish training set
X_train = train_set.drop(['percentage_laid_off', 'total_laid_off'], axis=1)
X_train = X_train.fillna(0)  # fill in blank cells

#labels for our training set
y_train = train_set["percentage_laid_off"]

# Scale/shift data:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Perform PCA to reduce dimensionality:
# This doesn't help; each new PCA dimension captures essentially the same
# amount of variance.
# pca = PCA(n_components=30, random_state=2684711)
# pca.fit(X_train_scaled)
# X_pca = pca.transform(X_train_scaled)

# put values and labels into a csv file to look at
X_train.to_csv('training_X.csv', index=False)
y_train.to_csv('training_Y.csv', index=False)


### Model Definition and Fitting
## Ridge Regression model:
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # define model
# model = RidgeCV(alphas=arange(0.01, 1.0, 0.01), cv=cv)
# # fit model (this line is what takes so long to run)
# model.fit(X_train_scaled, y_train)

## Neural network model:
# model = keras.Sequential([
#     layers.Input(shape=X_train_scaled.shape[1:]),
#     layers.Dense(10, kernel_regularizer=regularizers.L1L2()),
#     layers.Dense(10, kernel_regularizer=regularizers.L1L2()),
#     layers.Dense(10, kernel_regularizer=regularizers.L1L2()),
# #     layers.Dropout(.5),
# #     layers.Dense(50),
# #     layers.Dropout(.5),
#     layers.Dense(1)
# ])
# model.summary()
# model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])
# model.fit(X_train_scaled, y_train, epochs=5, validation_split=0.1, batch_size=10)

## SVR model:
model = SVR(kernel='rbf', C=100, gamma=.00009, epsilon=0.005)
model.fit(X_train_scaled, y_train)

### Validation and metrics collection
#code to do validation with our test set
X_test = test_set.drop(['percentage_laid_off', 'total_laid_off'], axis=1)
X_test = X_test.fillna(0)
X_test_scaled = scaler.transform(X_test)

y_test = test_set["percentage_laid_off"]
y_predict = model.predict(X_test_scaled)

X_test.to_csv('test_X.csv',index=False)
y_test.to_csv('test_Y.csv',index=False)

y_predict_df = pd.DataFrame(y_predict, columns=['percentage_laid_off'])
score = r2_score(y_test, y_predict)
print(f'R ** 2 score: {score}')

var = explained_variance_score(y_test, y_predict)
print(f'variance: {var}')   

mse = mean_squared_error(y_test, y_predict)
print(f'mse: {mse}')

mae = mean_absolute_error(y_test, y_predict)
print(f'mae: {mae}')

y_test_cat = categorize_predictions(y_test)
y_pred_cat = categorize_predictions(y_predict.flatten())

print()
res = np.all(y_test_cat == y_pred_cat, axis=1)
print(f'Categorical accuracy: {res.sum()/res.shape[0]} ({res.sum()}/{res.shape[0]})')
print(f"Micro-averaged F1 score: {f1_score(y_test_cat, y_pred_cat, average='micro')}")
print(f"Macro-averaged F1 score: {f1_score(y_test_cat, y_pred_cat, average='macro')}")
print(f"R^2: {r2_score(y_test_cat, y_pred_cat)}")

def to_numeric(y_df):
    return pd.DataFrame(np.argmax(y_df.to_numpy(), axis=1) + 1, columns=['risk'])

print(f"Categorical predictions compated to actual: ")
print(pd.concat([to_numeric(y_pred_cat).head(20), to_numeric(y_test_cat.reset_index(drop=True)).head(20)], axis=1))
print("Raw predictions compared to actual:")
print(pd.concat([y_predict_df.head(20), y_test.reset_index(drop=True).head(20)], axis=1))

R ** 2 score: 0.2862418829588602
variance: 0.3299445079835237
mse: 0.04133577082898896
mae: 0.12793103159905755

Categorical accuracy: 0.503731343283582 (135/268)
Micro-averaged F1 score: 0.8075657894736843
Macro-averaged F1 score: 0.6874835056879944
R^2: -0.1545937148797224
Categorical predictions compated to actual: 
    risk  risk
0      3     4
1      1     3
2      2     2
3      3     2
4      4     4
5      1     4
6      2     1
7      2     2
8      2     2
9      2     4
10     1     1
11     1     1
12     1     3
13     2     2
14     1     1
15     1     1
16     1     1
17     4     1
18     1     1
19     2     1
Raw predictions compared to actual:
    percentage_laid_off  percentage_laid_off
0              0.255912                 0.40
1              0.157326                 0.25
2              0.171619                 0.20
3              0.319315                 0.17
4              0.553271                 1.00
5              0.134726                 0.40
6            

In [42]:
# Save trained model and scalers:
import pickle
with open("model.pkl", 'wb') as f:
    pickle.dump(model, f)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print(X_train.columns)

Index(['funds_raised', 'industry_Aerospace', 'industry_Construction',
       'industry_Consumer', 'industry_Crypto', 'industry_Data',
       'industry_Education', 'industry_Energy', 'industry_Finance',
       'industry_Fitness', 'industry_Food', 'industry_HR', 'industry_Hardware',
       'industry_Healthcare', 'industry_Infrastructure', 'industry_Legal',
       'industry_Logistics', 'industry_Manufacturing', 'industry_Marketing',
       'industry_Media', 'industry_Other', 'industry_Product',
       'industry_Real Estate', 'industry_Recruiting', 'industry_Retail',
       'industry_Sales', 'industry_Security', 'industry_Support',
       'industry_Transportation', 'industry_Travel', 'stage_Acquired',
       'stage_Post-IPO', 'stage_Private Equity', 'stage_Seed',
       'stage_Series A', 'stage_Series B', 'stage_Series C', 'stage_Series D',
       'stage_Series E', 'stage_Series F', 'stage_Series G', 'stage_Series H',
       'stage_Series I', 'stage_Series J', 'stage_Subsidiary',
       's