In [None]:
import numpy as np
import pandas as pd
import openpyxl

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor



In [None]:
# Read the Excel workbook into a DataFrame and preview its first rows.
df = pd.read_excel('Largest Companies in the World.xlsx')
df.head()

In [None]:
# Size of the raw table as (rows, columns).
df.shape

In [None]:
# Column dtypes and non-null counts of the raw table.
df.info()

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df1 = df.copy()

# EDA

In [None]:
# Count missing values per column.
df1.isnull().sum()

In [None]:
# Replace the verbose spreadsheet headers with short, code-friendly names.
column_aliases = {
    "Global Rank": "rank",
    "Sales ($billion)": "Sales_bill",
    "Profits ($billion)": "profits_bill",
    "Assets ($billion)": "assets_bill",
    "Market Value ($billion)": "market_value_bill",
}
df1.rename(columns=column_aliases, inplace=True)
df1.columns

In [None]:
# Companies ranked 1 through 15 (both ends inclusive) with market value and country.
in_top_15 = df1['rank'].between(1, 15)
df1.loc[in_top_15, ['rank', 'Company', 'market_value_bill', 'Country']]

In [None]:
# Rows whose rank repeats the rank of an earlier row.
repeated_rank = df1['rank'].duplicated()
df1[repeated_rank]

In [None]:
# Number of rows whose rank repeats an earlier row's rank, as a single integer.
# (Summing the boolean mask counts the duplicates directly instead of listing
# per-column non-null counts of the duplicated rows.)
df1['rank'].duplicated().sum()

In [None]:
# Count missing values per column of the working copy.
df1.isna().sum()

In [None]:
# The 25 countries with the highest number of companies in the dataset

df1['Country'].value_counts().head(25)

# Visualisation

In [None]:
# Pairwise correlation between the four financial columns, annotated per cell

heatmap_cols = ['Sales_bill', 'profits_bill', 'assets_bill', 'market_value_bill']
sns.heatmap(df1[heatmap_cols].corr(), annot=True)

In [None]:
# Pairwise relationships and per-column distributions of the four financial columns

pairplot_cols = ['Sales_bill', 'profits_bill', 'assets_bill', 'market_value_bill']
sns.pairplot(df1[pairplot_cols])
plt.show()

The data is not normally distributed.

# Preprocessing

In [None]:
#Drop rank as we will be determining the rank of the company as per our assessment. Also dropping the name of the companies.

def preprocess_inputs(df1, train_size=0.7, random_state=150):
    """Turn the renamed company table into scaled train/test splits.

    Steps: drop the rank, company-name and coordinate columns, one-hot encode
    the categorical columns, separate the target, split the rows, then
    standardise the features with statistics fitted on the training split only.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Table with the columns 'rank', 'Company', 'Latitude', 'Longitude',
        'Country', 'Continent' and 'market_value_bill' plus the remaining
        feature columns. The caller's frame is not modified.
    train_size : float, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int, default 150
        Seed forwarded to ``train_test_split`` so the shuffle is repeatable.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised features; original row index and column names are kept.
        One-hot columns are named ``Country_<value>`` / ``Continent_<value>``.
    y_train, y_test : pandas.Series
        The 'market_value_bill' target for the matching rows.
    """
    # Drop unused columns (drop() returns a new frame, the input stays intact)
    features = df1.drop(columns=['rank', 'Company', 'Latitude', 'Longitude'])

    # One-hot encode the nominal columns. The source column name is used as a
    # prefix so a label that occurs in both columns cannot yield two
    # identically named features.
    features = pd.get_dummies(features, columns=['Country', 'Continent'], dtype=float)

    # Split into X and y
    y = features['market_value_bill']
    X = features.drop(columns='market_value_bill')

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: fit on the training rows only, then apply to both splits
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test


# Smoke test: show only the shapes of the four splits instead of dumping them.
[part.shape for part in preprocess_inputs(df1)]

In [None]:
# Build the scaled train/test splits that the model cells below fit and score on.
X_train, X_test, y_train, y_test = preprocess_inputs(df1)

In [None]:
# Inspect the scaled training features.
X_train

In [None]:
# Inspect the training target values.
y_train

# Training

In [None]:
# Fixed seed for every estimator that uses randomness, so repeated runs of the
# notebook produce the same fitted models and scores. Same value as the
# train/test split seed.
MODEL_SEED = 150

# Candidate regressors, keyed by a right-aligned label used when printing.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=MODEL_SEED),
    "                         Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "                         Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "                          XGBRegressor": XGBRegressor(random_state=MODEL_SEED),
    "                      LightGBRegressor": LGBMRegressor(random_state=MODEL_SEED),
    # verbose=0 silences CatBoost's per-iteration training log, which would
    # otherwise bury the "trained." lines printed below.
    "                     CatBoostRegressor": CatBoostRegressor(random_state=MODEL_SEED, verbose=0),
}

# Fit every candidate on the same training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

In [None]:
# Report each fitted model's score on the held-out split, labelled as R^2.
for name, model in models.items():
    y_pred = model.predict(X_test)  # kept as a notebook variable; not used for the score
    r2 = model.score(X_test, y_test)
    print(f"{name} R^2: {r2:.4f}")

Using the PyCaret module to validate the model selection

In [None]:
# Import only the PyCaret functions this notebook calls, so it is clear where
# each name comes from and nothing else in the namespace gets shadowed.
from pycaret.regression import (
    compare_models,
    get_config,
    plot_model,
    predict_model,
    setup,
)

In [None]:
# Fresh copy of the raw frame (original column headers) for the PyCaret run.
df2 = df.copy()

In [None]:
# Name the target column instead of passing the Series: the column is already
# part of df2, and setup() resolves a string target against the data's columns.
# The rank, company-name and coordinate columns are ignored so PyCaret sees the
# same features as the manual pipeline, which drops them before training.
s = setup(
    df2,
    target='Market Value ($billion)',
    ignore_features=['Global Rank', 'Company', 'Latitude', 'Longitude'],
    session_id=123,
)

In [None]:
# Check all available config entries of the current PyCaret experiment
# (called without a variable name).
get_config()

In [None]:
# Compare baseline models and keep the returned result as `best` for the cells below
best = compare_models()

In [None]:
# Plot residuals of the selected model
plot_model(best, plot = 'residuals')

In [None]:
# Plot the prediction error of the selected model
plot_model(best, plot = 'error')

In [None]:
# Plot feature importance of the selected model
plot_model(best, plot = 'feature')

# Prediction

In [None]:
# Predict on the test set; no data is passed, so PyCaret uses its own held-out split
# NOTE(review): hold-out behaviour assumed from PyCaret's predict_model default - confirm for the installed version
holdout_pred = predict_model(best)

In [None]:
# Preview the first rows of the predictions DataFrame
holdout_pred.head()