# Data Analysis of Layoffs in the United States Tech Industry 

## Project Motivation and Background

As Computer Science students about to enter the job market, we're concerned about the volatility of the tech industry. We want to analyze and create a system that can help people understand the markets, plan an exit strategy, and alleviate these concerns.

## Project Goal:
The goal of our project is to analyze trends in companies' recent layoffs in a variety of industries (aerospace, travel, retail, etc.) and detect patterns and trends.  This will be done by looking at the number of employees laid off, the location of the companies, their stages, and the funds they have raised.




In [102]:
import pandas as pd
import numpy as np

# Read in the data
layoffs = pd.read_csv('layoffs.csv')
layoffs.head()
print(layoffs.columns)
#we have to drop all rows with a blank percentage layed off cell
layoffs.dropna(subset=['percentage_laid_off'], inplace=True)

# one hot encoding for categorical variables 
print(f"Unique values for 'company': {len(layoffs['company'].unique())}")

# Adding dummy variables for location, industry, stage, and country
totalNewCols = len(layoffs['location'].unique()) + len(layoffs['industry'].unique()) + len(layoffs['stage'].unique()) + len(layoffs['country'].unique())
print(f"Total number of new columns: {totalNewCols}")

#loca = pd.get_dummies(layoffs['location'], prefix='location')
indu = pd.get_dummies(layoffs['industry'], prefix='industry')
stag = pd.get_dummies(layoffs['stage'], prefix='stage')
#coun = pd.get_dummies(layoffs['country'], prefix='country')

# drop the original columns
layoffs.drop(['location', 'industry', 'stage', 'country'], axis=1, inplace=True)

# concat the new columns
#layoffs = pd.concat([layoffs, loca, indu, stag, coun], axis=1)
layoffs = pd.concat([layoffs, indu, stag], axis=1)
layoffs = layoffs[layoffs.stage_Unknown != 1]
layoffs = layoffs[layoffs.industry_Other != 1]


layoffs.info()
layoffs.head()


Index(['company', 'location', 'industry', 'total_laid_off',
       'percentage_laid_off', 'date', 'stage', 'country', 'funds_raised'],
      dtype='object')
Unique values for 'company': 1438
Total number of new columns: 256
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342 entries, 0 to 2541
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   company                  1342 non-null   object 
 1   total_laid_off           1031 non-null   float64
 2   percentage_laid_off      1342 non-null   float64
 3   date                     1342 non-null   object 
 4   funds_raised             1261 non-null   float64
 5   industry_Aerospace       1342 non-null   uint8  
 6   industry_Construction    1342 non-null   uint8  
 7   industry_Consumer        1342 non-null   uint8  
 8   industry_Crypto          1342 non-null   uint8  
 9   industry_Data            1342 non-null   uint8  
 10  industry_Educati

Unnamed: 0,company,total_laid_off,percentage_laid_off,date,funds_raised,industry_Aerospace,industry_Construction,industry_Consumer,industry_Crypto,industry_Data,...,stage_Series C,stage_Series D,stage_Series E,stage_Series F,stage_Series G,stage_Series H,stage_Series I,stage_Series J,stage_Subsidiary,stage_Unknown
0,N26,71.0,0.04,2023-04-28,1700.0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Vroom,120.0,0.11,2023-04-27,1300.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Greenhouse,100.0,0.12,2023-04-27,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Megaport,50.0,0.16,2023-04-27,98.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Airtasker,45.0,0.2,2023-04-27,26.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [103]:
traing_data = layoffs.drop(['company', 'date'], axis=1)

# Split the data into training and testing sets
train_set = traing_data.sample(frac=0.8, random_state=0)
test_set = traing_data.drop(train_set.index)

print (f"Training set shape: {train_set.shape}")
print (f"Testing set shape: {test_set.shape}")

Training set shape: (1074, 48)
Testing set shape: (268, 48)


In [104]:
from sklearn.linear_model import ridge_regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from numpy import arange
from sklearn.metrics import r2_score


# print(f'train cols: {train_set.columns}')
# establish training set
#values for our training set
X_train = train_set.drop(['percentage_laid_off', 'total_laid_off'], axis=1)

#fill in blank cells
#X_train = X_train.fillna(0)

X_train.dropna()

#labels for our training set
y_train = train_set["percentage_laid_off"]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#normalizing our training set values (helps our data to not be so skewed)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

#define model
model = RidgeCV(alphas=arange(0.001, 1.0, 0.001), cv=cv)
#put values and labels into a csv file to look at
X_train.to_csv('training_X.csv', index=False)
y_train.to_csv('training_Y.csv', index=False)

#fit model (this line is what takes so long to run)
model.fit(X_train_scaled, y_train)


In [None]:

#the lambda that produced the lowest test MSE
print(f'alpha: {model.alpha_}')

#Creating our test set to perform validation
X_test = test_set.drop(['percentage_laid_off', 'total_laid_off'], axis=1)
#fill in blank cells
#X_test = X_test.fillna(0)

X_test.dropna()

#normalizing test set
scaler = StandardScaler()
X_train_test = scaler.fit_transform(X_test)

y_test = test_set["percentage_laid_off"]
y_predict = model.predict(X_train_test)

X_test.to_csv('test_X.csv',index=False)
y_test.to_csv('test_Y.csv',index=False)

y_predict_df = pd.DataFrame(y_predict, columns=['percentage_laid_off'])
score = r2_score(y_test, y_predict)
print(f'score: {score}')




alpha: 0.999
score: 0.30480895857786716


In [None]:

print("======================")
print("Predictions:")
print(y_predict_df.head(20))
print("======================")


Predictions:
    percentage_laid_off
0              0.385498
1              0.247519
2              0.240291
3              0.394545
4              0.649684
5              0.161085
6              0.263073
7              0.240121
8              0.317366
9              0.247451
10             0.187187
11             0.118454
12             0.171499
13             0.155730
14             0.227644
15             0.218486
16             0.095426
17             0.478469
18             0.092234
19             0.244628


In [101]:
print("======================")
print("Actual:")
print(y_test.reset_index(drop=True).head(20))
print("======================")

Actual:
0     0.40
1     0.25
2     0.20
3     0.17
4     1.00
5     0.40
6     0.10
7     0.20
8     0.20
9     0.45
10    0.16
11    0.02
12    0.25
13    0.20
14    0.14
15    0.08
16    0.03
17    0.11
18    0.08
19    0.14
Name: percentage_laid_off, dtype: float64
