# Data Analysis of Layoffs in the United States Tech Industry 

## Project Motivation and Background

As Computer Science students about to enter the job market, we're concerned about the volatility of the tech industry. We want to analyze recent layoff data and build a system that helps people understand the market, plan an exit strategy, and alleviate these concerns.

## Project Goal:
The goal of our project is to analyze companies' recent layoffs across a variety of industries (aerospace, travel, retail, etc.) and detect patterns and trends. We do this by looking at the number of employees laid off, the location of each company, its funding stage, and the funds it has raised.




In [63]:
import pandas as pd
import numpy as np

# Read in the data
layoffs = pd.read_csv('layoffs.csv')
print(layoffs.columns)

# percentage_laid_off is the value the later cells try to predict, so rows
# where it is blank cannot be used and are removed up front.
layoffs = layoffs.dropna(subset=['percentage_laid_off'])

# One-hot encoding for categorical variables
print(f"Unique values for 'company': {layoffs['company'].nunique(dropna=False)}")

# How many dummy columns we WOULD get if all four categorical columns were
# encoded (NaN counts as its own value, matching len(series.unique())).
# NOTE: only 'industry' and 'stage' are actually encoded below; 'location' and
# 'country' are dropped without encoding (presumably because they would add
# too many columns -- confirm with the authors).
categorical_cols = ['location', 'industry', 'stage', 'country']
totalNewCols = sum(layoffs[col].nunique(dropna=False) for col in categorical_cols)
print(f"Total number of new columns: {totalNewCols}")

indu = pd.get_dummies(layoffs['industry'], prefix='industry')
stag = pd.get_dummies(layoffs['stage'], prefix='stage')

# Replace the original categorical columns with the dummy columns.
layoffs = layoffs.drop(columns=categorical_cols)
layoffs = pd.concat([layoffs, indu, stag], axis=1)

# Rows whose stage is 'Unknown' end up with every stage_* column set to zero.
# errors='ignore' keeps this cell runnable on a CSV that has no 'Unknown' stage
# (a plain drop would raise KeyError in that case).
layoffs = layoffs.drop(columns=['stage_Unknown'], errors='ignore')

layoffs.head()


Index(['company', 'location', 'industry', 'total_laid_off',
       'percentage_laid_off', 'date', 'stage', 'country', 'funds_raised'],
      dtype='object')
Unique values for 'company': 1438
Total number of new columns: 256


Unnamed: 0,company,total_laid_off,percentage_laid_off,date,funds_raised,industry_Aerospace,industry_Construction,industry_Consumer,industry_Crypto,industry_Data,...,stage_Series B,stage_Series C,stage_Series D,stage_Series E,stage_Series F,stage_Series G,stage_Series H,stage_Series I,stage_Series J,stage_Subsidiary
0,N26,71.0,0.04,2023-04-28,1700.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,Providoor,,1.0,2023-04-28,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Dropbox,500.0,0.16,2023-04-27,1700.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Vroom,120.0,0.11,2023-04-27,1300.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Greenhouse,100.0,0.12,2023-04-27,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# 'company' and 'date' are not used as model inputs, so they are removed
# before the data is split.
traing_data = layoffs.drop(columns=['company', 'date'])

# 80/20 split: draw the training rows with a fixed seed so the split is
# reproducible; every row that was not sampled becomes the test set.
train_set = traing_data.sample(frac=0.8, random_state=0)
test_set = traing_data.drop(index=train_set.index)

print(
    f"Training set shape: {train_set.shape}",
    f"Testing set shape: {test_set.shape}",
    sep="\n",
)

Training set shape: (1355, 47)
Testing set shape: (339, 47)


In [65]:
from sklearn.linear_model import ridge_regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
from numpy import arange


# Both columns describe the layoff outcome itself, so neither is allowed to
# appear among the input features.
outcome_cols = ['percentage_laid_off', 'total_laid_off']

# --- training split: labels first, then features with blanks filled by 0 ---
y_train = train_set["percentage_laid_off"]
X_train = train_set.drop(columns=outcome_cols).fillna(0)

# --- test split, prepared the same way (column list printed for inspection) ---
y_test = test_set['percentage_laid_off']
X_test = test_set.drop(columns=outcome_cols)
print(X_test.columns)
X_test = X_test.fillna(0)

# Standardise the features. The scaler is fitted on the training data only and
# then applied to both splits; after this step X_train / X_test are NumPy
# arrays rather than DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Repeated 10-fold cross-validation splitter (3 repeats, fixed seed).
# NOTE: it is constructed here but not handed to RidgeCV below, so the model
# falls back to RidgeCV's default cross-validation. Pass cv=cv to use it.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Ridge regression that picks its regularisation strength from a grid of
# candidate alphas (0.000001 up to ~1 in steps of 0.01) while fitting.
# TODO: optionally export the training features/labels to CSV for inspection.
candidate_alphas = arange(0.000001, 1.000001, 0.01)
model = RidgeCV(alphas=candidate_alphas).fit(X_train, y_train)

# R^2 of the fitted model on the held-out test split.
score = model.score(X_test, y_test)

# alpha_ is the strength RidgeCV selected during fitting on the training data
# (it is not chosen using the test set).
print(f'alpha: {model.alpha_}')
print(f'score: {score}')

Index(['funds_raised', 'industry_Aerospace', 'industry_Construction',
       'industry_Consumer', 'industry_Crypto', 'industry_Data',
       'industry_Education', 'industry_Energy', 'industry_Finance',
       'industry_Fitness', 'industry_Food', 'industry_HR', 'industry_Hardware',
       'industry_Healthcare', 'industry_Infrastructure', 'industry_Legal',
       'industry_Logistics', 'industry_Manufacturing', 'industry_Marketing',
       'industry_Media', 'industry_Other', 'industry_Product',
       'industry_Real Estate', 'industry_Recruiting', 'industry_Retail',
       'industry_Sales', 'industry_Security', 'industry_Support',
       'industry_Transportation', 'industry_Travel', 'stage_Acquired',
       'stage_Post-IPO', 'stage_Private Equity', 'stage_Seed',
       'stage_Series A', 'stage_Series B', 'stage_Series C', 'stage_Series D',
       'stage_Series E', 'stage_Series F', 'stage_Series G', 'stage_Series H',
       'stage_Series I', 'stage_Series J', 'stage_Subsidiary'],
      dt

In [None]:
import matplotlib.pyplot as plt

# Predict on the test features prepared in the modelling cell. X_test is
# already zero-filled and standardised there, so it must NOT be rebuilt from
# test_set here: the model was fitted on scaled inputs.
y_predict = model.predict(X_test)

# Tabular copy of the predictions, used by the cell that prints the first 20.
y_predict_df = pd.DataFrame(y_predict, columns=['percentage_laid_off'])

# Optional export of the actual test labels for inspection:
# y_test.to_csv('test_Y.csv', index=False)

# One x position per test row. Deriving it from y_test (instead of a
# hard-coded length) keeps x and y the same size whatever the split produces.
sample_positions = arange(len(y_test))

# Plot outputs: actual values as black points, predictions as a blue line.
plt.scatter(sample_positions, y_test, color="black")
plt.plot(sample_positions, y_predict, color="blue", linewidth=3)

# Tick labels are hidden; the x axis is just the row position in the test set.
plt.xticks(())
plt.yticks(())
plt.show()



# RILEY CODE


In [None]:

# First 20 predicted layoff percentages, framed by separator rules.
print(
    "======================",
    y_predict_df.head(20),
    "======================",
    sep="\n",
)


In [None]:
# First 20 actual test-set layoff percentages, re-indexed from 0 so they line
# up row-for-row with the printed predictions, framed by separator rules.
print(
    "======================",
    y_test.head(20).reset_index(drop=True),
    "======================",
    sep="\n",
)