In [4]:
# openpyxl is the engine pandas uses to read .xlsx workbooks; if it is missing, run: %pip install openpyxl

In [10]:
import pandas as pd
import numpy as np
import openpyxl

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

In [6]:
# Read the companies workbook into a DataFrame and echo it as the cell output.
workbook_path = '../input/largest-2000-companies-in-the-world-by-revenue/Largest Companies in the World.xlsx'
data = pd.read_excel(workbook_path)
data

Unnamed: 0,Global Rank,Company,Sales ($billion),Profits ($billion),Assets ($billion),Market Value ($billion),Country,Continent,Latitude,Longitude
0,1.0,ICBC,134.8,37.8,2813.5,237.3,China,Asia,35.861660,104.195397
1,2.0,China Construction Bank,113.1,30.6,2241.0,202.0,China,Asia,35.861660,104.195397
2,3.0,JPMorgan Chase,108.2,21.3,2359.1,191.4,USA,North America,37.090240,-95.712891
3,4.0,General Electric,147.4,13.6,685.3,243.7,USA,North America,37.090240,-95.712891
4,5.0,Exxon Mobil,420.7,44.9,333.8,400.4,USA,North America,37.090240,-95.712891
...,...,...,...,...,...,...,...,...,...,...
1919,1995.0,Tractor Supply,4.7,0.3,1.7,7.1,USA,North America,37.090240,-95.712891
1920,1996.0,San-Ai Oil,0.5,0.1,25.7,0.5,Japan,Asia,36.204824,138.252924
1921,1996.0,UOL Group,0.9,0.7,7.8,4.2,Singapore,Asia,1.352083,103.819836
1922,1998.0,Interconexion Electrica,2.4,0.2,14.6,5.8,Colombia,South America,4.570868,-74.297333


In [7]:
def preprocess_inputs(df):
    """Turn the raw company table into scaled train/test features and targets.

    Steps, in order:
      1. Drop the 'Global Rank' and 'Company' columns.
      2. One-hot encode 'Country' and 'Continent'.
      3. Separate 'Market Value ($billion)' as the target.
      4. Make a shuffled 70/30 train/test split (random_state=1).
      5. Standardize the features with a StandardScaler fitted on the
         training rows only, so no test-set statistics reach the scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named above. The caller's frame is not
        modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled feature frames that keep the row index and column names
        of the split they came from.
    y_train, y_test : pandas.Series
        Unscaled target values for the same rows.

    NOTE(review): every column not mentioned above is kept as a feature.
    For the frame loaded in this notebook that includes the 'Unnamed: 0'
    row counter -- confirm it is really meant to be a model input.
    """
    target_name = 'Market Value ($billion)'

    # drop() hands back a new frame, so the input is left untouched.
    features = df.drop(columns=['Global Rank', 'Company'])

    # Encode both nominal columns at once; the indicator columns are
    # appended after the remaining columns, 'Country_*' before 'Continent_*'.
    encoded = pd.get_dummies(features, columns=['Country', 'Continent'])

    # Target vs. predictors
    y = encoded[target_name]
    X = encoded.drop(columns=target_name)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    def _as_frame(values, like):
        # The scaler returns a bare array; restore the index/column labels.
        return pd.DataFrame(values, index=like.index, columns=like.columns)

    # Fit on the training rows only, then reuse the same statistics for test.
    scaler = StandardScaler()
    X_train = _as_frame(scaler.fit_transform(X_train), X_train)
    X_test = _as_frame(scaler.transform(X_test), X_test)

    return X_train, X_test, y_train, y_test

In [8]:
# Build the scaled train/test feature frames and their matching targets from the raw table.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [11]:
# Regression network: two hidden ReLU layers of 128 units feeding one linear output.
# The input width is read from the training frame instead of being hard-coded,
# so the model keeps working if the number of one-hot encoded columns changes.
# NOTE(review): no TensorFlow seed is set in the visible cells, so weight
# initialisation (and therefore the metrics) may differ between runs.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(128, activation='relu')(inputs)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Mean squared error is the training objective for the continuous target.
model.compile(
    optimizer='adam',
    loss='mse'
)

# Train for at most 100 epochs, holding out 20% of the training rows for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        # Stop once val_loss has not improved for 3 consecutive epochs and
        # roll the model back to the weights of its best epoch.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

2022-12-14 09:29:35.261971: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-12-14 09:29:35.804374: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


In [19]:
# Raw network output: the single-unit output layer yields one column per row, i.e. a 2-D array.
# (A bare `y_pred` line before the final expression was never displayed, so it is dropped.)
y_pred = model.predict(X_test)
y_pred.shape

(578, 1)

In [18]:
# np.squeeze removes the length-1 axis so the predictions become a flat vector.
# (A bare `y_pred` line before the final expression was never displayed, so it is dropped.)
y_pred = np.squeeze(model.predict(X_test))
y_pred.shape

(578,)

In [20]:
# Flatten the network output so it lines up element-wise with y_test.
y_pred = np.squeeze(model.predict(X_test))

# Squared residuals (model error) and squared deviations from the mean
# (error of a constant predictor) are the two ingredients of both metrics.
squared_errors = (y_test - y_pred)**2
squared_deviations = (y_test - y_test.mean())**2

rmse = np.sqrt(np.mean(squared_errors))
r2 = 1 - (np.sum(squared_errors) / np.sum(squared_deviations))

print("RMSE: {:.2f}".format(rmse))
print(" R^2: {:.4f}".format(r2))

# Predicted vs. actual scatter; points on the diagonal are perfect predictions.
scatter_options = {
    'labels': {'x': "Predicted", 'y': "Actual"},
    'title': "Actual vs. Predicted Values",
    'width': 700,
    'height': 700,
}
fig = px.scatter(x=y_pred, y=y_test, **scatter_options)

fig.show()

RMSE: 17.42
 R^2: 0.7264
