In [1]:
# Connecting the Python Code with the google drive
from google.colab import drive

In [2]:
import pandas as pd

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Projects/companies_by_revenue.csv')
df.describe()
df.head(10)

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,513983,9.4%,1540000,"Seattle, Washington"
2,3,ExxonMobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
5,6,CVS Health,Healthcare,322467,10.4%,259500,"Woonsocket, Rhode Island"
6,7,Berkshire Hathaway,Conglomerate,302089,9.4%,383000,"Omaha, Nebraska"
7,8,Alphabet,Technology and cloud computing,282836,9.8%,156000,"Mountain View, California"
8,9,McKesson Corporation,Health,276711,4.8%,48500,"Irving, Texas"
9,10,Chevron Corporation,Petroleum industry,246252,51.6%,43846,"San Ramon, California"


Step 2: Data Preprocessing

The dataset contains columns with numerical information stored as strings, possibly with commas and percentage signs. We need to convert these to a proper numeric format. We also notice non-numeric entries in the "Employees" column that need cleaning.

In [6]:
# Convert "Revenue (USD millions)" and "Revenue growth" to numeric formats
df['Revenue (USD millions)'] = df['Revenue (USD millions)'].str.replace(',', '').astype(float)
df['Revenue growth'] = df['Revenue growth'].str.rstrip('%').astype(float) / 100

# Clean and convert "Employees" column, removing non-numeric characters
df['Employees'] = df['Employees'].str.extract('(\d+)').astype(float)


Step 3: Feature Selection

For predicting the "Revenue (USD millions)", we decide to use "Industry", "Revenue growth", and "Employees" as features. "Industry" is categorical and needs encoding, while the other two are numeric.

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Extracting the features for the model
features = ['Industry', 'Revenue growth', 'Employees']
X = df[features]
y = df['Revenue (USD millions)']

# Preprocessing steps for the pipeline
# Numeric features will be scaled, and categorical features will be one-hot encoded
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Revenue growth', 'Employees']),  # Scaling numeric features
        ('cat', OneHotEncoder(), ['Industry'])  # Encoding categorical feature
    ])


Step 4: Model Training

We use a linear regression model as it's suitable for predicting a numeric value. We also incorporate preprocessing steps into a pipeline to streamline transformations and modeling.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Preparing the data
X = df[['Industry', 'Revenue growth', 'Employees']]
y = df['Revenue (USD millions)']

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Revenue growth', 'Employees']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Industry'])
    ])

# Creating a modeling pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
pipeline.fit(X_train, y_train)


Step 5: Evaluation

After training the model, we evaluate its performance using metrics suitable for regression tasks, such as R-squared and Mean Squared Error (MSE).

In [12]:
from sklearn.metrics import r2_score, mean_squared_error

# Making predictions
y_pred = pipeline.predict(X_test)

# Evaluating the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'R-squared: {r2}')
print(f'Mean Squared Error: {mse}')


R-squared: -0.2155138410772357
Mean Squared Error: 20327523566.903248


Step 6: Prediction

With the model trained and evaluated, we can use it to make predictions on new data, assuming it's preprocessed in the same way as our training data.

In [14]:

new_data = pd.DataFrame({
    'Industry': ['Electronics industry', 'Healthcare'],
    'Revenue growth': [0.05, 0.12],  # 5% and 12% growth
    'Employees': [100000, 50000]
})

# Display the new data
print(new_data)


               Industry  Revenue growth  Employees
0  Electronics industry            0.05     100000
1            Healthcare            0.12      50000


In [15]:
# Making predictions with the model
new_predictions = pipeline.predict(new_data)

# Displaying the predictions
print(new_predictions)


[24613258.4653842  12267016.76841335]
