In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

# Load data (path is relative to the notebook's working directory).
df = pd.read_csv('us_tech_companies.csv')

# Features and target: every column except the income-tax column is a predictor.
x = df[['Company Name', 'Industry', 'Sector', 'HQ State', 'Founding Year',
       'Annual Revenue 2022-2023 (USD in Billions)',
       'Market Cap (USD in Trillions)', 'Stock Name', 'Employee Size']]

y = df['Annual Income Tax in 2022-2023 (USD in Billions)']

# Split: hold out 20% of the rows; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Identify column types
# NOTE(review): 'Company Name' and 'Stock Name' look like per-row identifiers.
# If they are unique per company, their one-hot columns carry no signal for
# unseen rows (handle_unknown='ignore' encodes unknown categories as all
# zeros) -- TODO confirm and consider dropping them from the feature set.
categorical_cols = ['Company Name', 'Industry', 'Sector', 'HQ State', 'Stock Name']
numeric_cols = ['Founding Year',
                'Annual Revenue 2022-2023 (USD in Billions)',
                'Market Cap (USD in Trillions)',
                'Employee Size']

# Preprocessing: one-hot encode the categorical columns, standardize the
# numeric ones.
# sparse_threshold=0 forces a dense result. With the default (0.3) the stacked
# one-hot output can come back as a sparse matrix, which PCA rejects on older
# scikit-learn releases and handles with a different solver on newer ones.
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ],
    sparse_threshold=0,
)

# Pipeline: preprocess -> project onto 10 principal components -> random forest.
# Both stochastic steps are seeded so the metrics printed below are
# reproducible under Restart & Run All.
pipe = Pipeline([
    ('preprocess', preprocess),
    ('pca', PCA(n_components=10, random_state=42)),
    ('forest', RandomForestRegressor(random_state=42))
])

# Fit on the training split, then predict the held-out rows.
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)

# Evaluate on the held-out split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("MSE:", mse)
print("R2 Score:", r2)


MAE: 0.299348
MSE: 0.14159499096
R2 Score: 0.09816680020554391


In [9]:
# Display the column labels of the DataFrame loaded from the CSV.
df.columns

Index(['Company Name', 'Industry', 'Sector', 'HQ State', 'Founding Year',
       'Annual Revenue 2022-2023 (USD in Billions)',
       'Market Cap (USD in Trillions)', 'Stock Name',
       'Annual Income Tax in 2022-2023 (USD in Billions)', 'Employee Size'],
      dtype='object')