# Data Analysis of Layoffs in the United States Tech Industry 

## Project Motivation and Background

As Computer Science students about to enter the job market, we're concerned about the volatility of the tech industry. We want to analyze and create a system that can help people understand the markets, plan an exit strategy, and alleviate these concerns.

## Project Goal:
The goal of our project is to analyze trends in companies' recent layoffs in a variety of industries (aerospace, travel, retail, etc.) and detect patterns and trends.  This will be done by looking at the number of employees laid off, the location of the companies, their stages, and the funds they have raised.




In [1]:
import pandas as pd
import numpy as np

# Read in the data
layoffs = pd.read_csv('layoffs.csv')
layoffs.head()
print(layoffs.columns)
#we have to drop all rows with a blank percentage layed off cell
layoffs.dropna(subset=['percentage_laid_off'], inplace=True)

# one hot encoding for categorical variables 
print(f"Unique values for 'company': {len(layoffs['company'].unique())}")

# Adding dummy variables for location, industry, stage, and country
totalNewCols = len(layoffs['location'].unique()) + len(layoffs['industry'].unique()) + len(layoffs['stage'].unique()) + len(layoffs['country'].unique())
print(f"Total number of new columns: {totalNewCols}")

#loca = pd.get_dummies(layoffs['location'], prefix='location')
indu = pd.get_dummies(layoffs['industry'], prefix='industry')
stag = pd.get_dummies(layoffs['stage'], prefix='stage')
#coun = pd.get_dummies(layoffs['country'], prefix='country')

# drop the original columns
layoffs.drop(['location', 'industry', 'stage', 'country'], axis=1, inplace=True)

# concat the new columns
#layoffs = pd.concat([layoffs, loca, indu, stag, coun], axis=1)
layoffs = pd.concat([layoffs, indu, stag], axis=1)
layoffs = layoffs[layoffs.stage_Unknown != 1]
layoffs = layoffs[layoffs.industry_Other != 1]


layoffs.head()


Index(['company', 'location', 'industry', 'total_laid_off',
       'percentage_laid_off', 'date', 'stage', 'country', 'funds_raised'],
      dtype='object')
Unique values for 'company': 1438
Total number of new columns: 256


Unnamed: 0,company,total_laid_off,percentage_laid_off,date,funds_raised,industry_Aerospace,industry_Construction,industry_Consumer,industry_Crypto,industry_Data,...,stage_Series C,stage_Series D,stage_Series E,stage_Series F,stage_Series G,stage_Series H,stage_Series I,stage_Series J,stage_Subsidiary,stage_Unknown
0,N26,71.0,0.04,2023-04-28,1700.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Vroom,120.0,0.11,2023-04-27,1300.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Greenhouse,100.0,0.12,2023-04-27,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Megaport,50.0,0.16,2023-04-27,98.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Airtasker,45.0,0.2,2023-04-27,26.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [2]:
traing_data = layoffs.drop(['company', 'date'], axis=1)

# Split the data into training and testing sets
train_set = traing_data.sample(frac=0.8, random_state=0)
test_set = traing_data.drop(train_set.index)

print (f"Training set shape: {train_set.shape}")
print (f"Testing set shape: {test_set.shape}")

Training set shape: (1074, 48)
Testing set shape: (268, 48)


In [None]:
from sklearn.linear_model import ridge_regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from numpy import arange
from sklearn.metrics import r2_score


# print(f'train cols: {train_set.columns}')
# establish training set
#values for our training set
X_train = train_set.drop(['percentage_laid_off', 'total_laid_off'], axis=1)

#fill in blank cells
X_train = X_train.fillna(0)

#labels for our training set
y_train = train_set["percentage_laid_off"]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

#define model
model = RidgeCV(alphas=arange(0.001, 1.0, 0.001), cv=cv)
#put values and labels into a csv file to look at
X_train.to_csv('training_X.csv', index=False)
y_train.to_csv('training_Y.csv', index=False)

#fit model (this line is what takes so long to run)
model.fit(X_train_scaled, y_train)


In [4]:

#the lambda that produced the lowest test MSE


#code to do validation with our test set
X_test = test_set.drop(['percentage_laid_off', 'total_laid_off'], axis=1)
X_test = X_test.fillna(0)
scaler = StandardScaler()
X_train_test = scaler.fit_transform(X_test)

y_test = test_set["percentage_laid_off"]
y_predict = model.predict(X_train_test)

X_test.to_csv('test_X.csv',index=False)
y_test.to_csv('test_Y.csv',index=False)

y_predict_df = pd.DataFrame(y_predict, columns=['percentage_laid_off'])
score = r2_score(y_test, y_predict)
print(f'R ** 2 score: {score}')


from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score


var = explained_variance_score(y_test, y_predict)
print(f'variance: {var}')   

mse = mean_squared_error(y_test, y_predict)
print(f'mse: {mse}')

mae = mean_absolute_error(y_test, y_predict)
print(f'mae: {mae}')



R ** 2 score: 0.30480895857786616
variance: 0.3071394706300361
mse: 0.04026049845810046
mae: 0.14663207695160854


In [6]:
print("Predictions compared to actual:")
print(pd.concat([y_predict_df.head(20), y_test.reset_index(drop=True).head(20)], axis=1))


Predictions compared to actual:
    percentage_laid_off  percentage_laid_off
0              0.385498                 0.40
1              0.247519                 0.25
2              0.240291                 0.20
3              0.394545                 0.17
4              0.649684                 1.00
5              0.161085                 0.40
6              0.263073                 0.10
7              0.240121                 0.20
8              0.317366                 0.20
9              0.247451                 0.45
10             0.187187                 0.16
11             0.118454                 0.02
12             0.171499                 0.25
13             0.155730                 0.20
14             0.227644                 0.14
15             0.218486                 0.08
16             0.095426                 0.03
17             0.478469                 0.11
18             0.092234                 0.08
19             0.244628                 0.14


In [30]:

# Giving a label classifier to the percentage laid off

# Statistacl analysis of the data find the mean and standard deviation
mean = layoffs['percentage_laid_off'].mean()
std = layoffs['percentage_laid_off'].std()
median = layoffs['percentage_laid_off'].median()

print (f"Mean: {mean}")
print (f"Standard Deviation: {std}")
print (f"Median: {median}")

# Given median and standard deviation, we can classify the percentage laid off into 3 categories

# Mean: 0.25194299552906113
# Standard Deviation: 0.24712653922661823
# Median: 0.17

# 0.00 - 0.17 = 0 # Lower Risk
# 0.17 - 0.25 = 1 # Medium Risk
# 0.25 - 0.37 = 2 # High Risk
# 0.37 - *    = 3 # Very High Risk

# Create a new column for the label
y_predict_df['risk'] = 0
y_test['risk'] = 0


# Iterate through the rows and assign the label
for index, row in y_predict_df.iterrows():
    if row['percentage_laid_off'] < 0.17:
        y_predict_df.at[index, 'risk'] = 0
    elif row['percentage_laid_off'] < 0.25:
        y_predict_df.at[index, 'risk'] = 1
    elif row['percentage_laid_off'] < 0.37:
        y_predict_df.at[index, 'risk'] = 2
    else:
        y_predict_df.at[index, 'risk'] = 3

for index, row in y_test.iteritems():
    if row['percentage_laid_off'] < 0.17:
        y_test.at[index, 'risk'] = 0
    elif row['percentage_laid_off'] < 0.25:
        y_test.at[index, 'risk'] = 1
    elif row['percentage_laid_off'] < 0.37:
        y_test.at[index, 'risk'] = 2
    else:
        y_test.at[index, 'risk'] = 3

r2 = r2_score(y_test['risk'], y_predict_df['risk'])
print(f'R ** 2 score: {r2}')


# compare the 

Mean: 0.25194299552906113
Standard Deviation: 0.24712653922661823
Median: 0.17


  for index, row in y_test.iteritems():


TypeError: 'int' object is not subscriptable

R ** 2 score: -2.7053853375437398
Predictions compared to actual:
    percentage_laid_off  risk  risk
0              1.601971   3.0     3
1              0.951865   3.0     2
2              1.053346   3.0     1
3              1.802433   3.0     1
4              2.525088   3.0     3
5              0.581978   3.0     3
6              1.030918   3.0     0
7              1.052504   3.0     1
8              1.505438   3.0     1
9              0.953494   3.0     3
10             0.615540   3.0     0
11             0.068217   0.0     0
12             0.642373   3.0     2
13             0.692333   3.0     1
14             0.872703   3.0     0
15             0.727664   3.0     0
16             0.237083   1.0     0
17             2.189183   3.0     0
18             0.234215   1.0     0
19             1.132783   3.0     0
