This notebook provides starter code for visualizing the number of IT companies by country.

We start by installing the additional libraries: `mapclassify` (for plotting) and `geotext` (for an attempt at gathering statistics by city).

In [None]:
!pip install geotext mapclassify

In [None]:
import numpy as np 
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

import os
import geotext # extracting "cities"

Inspecting and cleaning the data:

In [None]:
# Load the raw company table from the dataset's input directory.
df_companies = pd.read_csv(
    filepath_or_buffer="../input/free-7-million-company-dataset/companies_sorted.csv",
)

In [None]:
# Preview the first five rows of the raw table.
df_companies.head(n=5)

In [None]:
# Drop rows lacking any of the key fields, and give unknown localities an
# explicit placeholder. Done as one chain without `inplace`: an in-place
# fillna on `df['locality']` acts on a column selection, which newer pandas
# warns about and which does not update the frame under Copy-on-Write.
df_companies = (
    df_companies
    .dropna(subset=['name', 'industry', 'country'])
    .fillna({'locality': 'missing'})
)

print('Column-wise distribution of null values in the dataset')
print(df_companies.isnull().sum())

We are only interested in companies that have at least one current employee:

In [None]:
# Keep only companies whose current employee estimate is positive.
df_companies = df_companies.loc[df_companies['current employee estimate'].gt(0)]

Title-casing the location fields for further processing (this matters especially for geotext):

In [None]:
# Title-case both location columns (e.g. 'new york' -> 'New York').
df_companies['locality'] = df_companies['locality'].str.title()
df_companies['country'] = df_companies['country'].str.title()

Extracting the most specific part of the location; it is a city at least some of the time, so I save it as 'city':

In [None]:
def get_city(x: str) -> str:
    """Return a best-guess city name for a locality string.

    The first city recognised by GeoText is preferred. When GeoText finds
    none, the text before the first comma is returned as a possible city.
    """
    recognised = geotext.GeoText(x).cities
    if recognised:
        return recognised[0]
    # No known city in the text: fall back to its first comma-separated part.
    first_part, _, _ = x.partition(',')
    return first_part

# GeoText parsing is comparatively expensive and many rows share a locality,
# so resolve each distinct locality once and broadcast the result to all rows.
city_by_locality = {
    locality: get_city(locality)
    for locality in df_companies['locality'].unique()
}
df_companies['city'] = df_companies['locality'].map(city_by_locality)

# Combined key, so that same-named cities in different countries stay distinct.
df_companies['country_city'] = df_companies['country'] + '; ' + df_companies['city']

Getting the full list of industries and manually selecting those related to IT (my selection is subjective and not very rigorous):

In [None]:
# Alphabetical list of every distinct industry label in the data.
sorted(df_companies['industry'].unique())

In [None]:
# Hand-picked industry labels that are treated as "IT" in this notebook.
# The entries are matched against the `industry` column with `isin`, so the
# spelling and case must match the dataset's labels exactly.
IT_industries = [
 'animation',
 'biotechnology',
 'computer & network security',
 'computer games',
 'computer hardware',
 'computer networking',
 'computer software',
 'consumer electronics',
 'defense & space',
 'e-learning',
 'industrial automation',
 'information services',
 'information technology and services',
 'internet',
 'mechanical or industrial engineering',
 'program development',
 'telecommunications',
 'wireless'
]

In [None]:
# Restrict the company table to the industries selected as IT-related.
it_frame = df_companies.loc[df_companies['industry'].isin(IT_industries)]

Looking for the top IT cities:

In [None]:
# Count IT companies per "country; city" key, skipping rows whose locality was
# unknown (the 'missing' placeholder, title-cased to 'Missing').
#
# `value_counts` already sorts by descending count. Naming the index and the
# values explicitly gives the same two columns on every pandas version; the
# old `rename(columns={'index': ..., 'country_city': 'count'})` produced two
# columns called 'count' on pandas >= 2.0, where `value_counts` names its
# result 'count' and keeps the original name on the index.
it_cities = (
    it_frame.loc[it_frame['city'] != 'Missing', 'country_city']
    .value_counts()
    .rename_axis('country_city')
    .reset_index(name='count')
)
it_cities.to_csv('count_of_it_companies_by_cities.csv')
it_cities.head(10)

And finally playing with countries:

In [None]:
# Number of IT companies per country, as a flat two-column frame.
it_counted = (
    it_frame
    .groupby('country', as_index=False)['name']
    .count()
    .rename(columns={'name': 'count'})
)
# Show the ten countries with the most IT companies.
it_counted.sort_values('count', ascending=False).reset_index(drop=True).head(10)

Loading the Natural Earth map dataset and adapting its country names to our data:

In [None]:
# NOTE(review): `gpd.datasets` is deprecated since geopandas 0.14 and removed
# in 1.0; on newer versions the Natural Earth file must be read from a local
# path or URL instead.
gdf = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))[['name', 'geometry']]
gdf.columns = ['country', 'geometry']
print(gdf.head())

# Exclude Antarctica from the map. Filtering by name avoids reusing an index
# label as a position and does not fail if the row is absent.
gdf = gdf[gdf['country'] != 'Antarctica'].copy()

# Natural Earth name -> name as it appears (title-cased) in the company data.
replace = {
    "United States of America": 'United States',
    "Bosnia and Herz.": 'Bosnia And Herzegovina',
    "Central African Rep.": 'Central African Republic',
    "Côte d'Ivoire": 'Côte D’Ivoire',
    "Dominican Rep.": 'Dominican Republic',
    "Dem. Rep. Congo": 'Democratic Republic Of The Congo', # 'Congo' remains vague, sorry
    "Eq. Guinea": 'Equatorial Guinea',
    "Solomon Is.": 'Solomon Islands',
    "Trinidad and Tobago": 'Trinidad And Tobago',
    "S. Sudan": 'South Sudan',
}

# Exact-value replacement on the column, with no chained `.loc` assignment.
gdf['country'] = gdf['country'].replace(replace)

# Names on only one side of the comparison will not match in the merge.
it_countries = set(it_frame['country'])
print("Countries, that are either missed in any of datasets, or named differently:")
print(sorted(it_countries.symmetric_difference(set(gdf['country']))))

In [None]:
# Left join keeps every map country; those without a company count get NaN.
merged = gdf.merge(it_counted, on='country', how='left')
merged.head()

Finally the picture!

In [None]:
fig, ax = plt.subplots(1, figsize=(20, 15))

# Base layer: every country in grey, so countries without a count stay visible.
merged.plot(color='grey', ax=ax, label='No data')

# Overlay: only countries that have a count, classified into 7 natural-breaks
# bins. `subset` limits the drop to missing counts, not a NaN in any column.
merged.dropna(subset=['count']).plot(
    column='count',
    cmap='rainbow',
    ax=ax,
    scheme='natural_breaks',
    k=7,
    legend=True,
    legend_kwds={'loc': 'lower left'},
)

ax.set_title('Count of IT companies by countries')

# The classified legend prints bin edges as floats; the counts are integers,
# so strip the trailing ".00".
leg = ax.get_legend()
if leg is not None:
    for text in leg.get_texts():
        text.set_text(text.get_text().replace('.00', ''))

ax.set_axis_off()

fig.savefig('it_by_countries.png', bbox_inches='tight')