'''About Dataset
🏆 100 AI Companies in 2024
This dataset was scraped from Datamation(🔗Link), containing insights into top AI firms.

Columns

🏢 Company Name – Name of the company
📍 Headquarters – Location of HQ
📅 Founded – Year of establishment
💰 Annual Revenue – Reported revenue
⭐ Glassdoor Score – Employee rating

Possible Uses
🔹 Data Cleaning (removing symbols, normalizing data)
🔹 Predictive Analysis (estimating revenue trends)
🔹 AI Industry Insights
'''

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('Ai_companies.csv')

In [3]:
df.head()

Unnamed: 0,Company Name,Description,Headquarters,Founded,Annual Revenue,Glassdoor Score
0,Alibaba Cloud,Best for Big Data Analytics,"Hangzhou, China",2009,$479.5 million,3.7/5
1,DataRobot,Best for Automated Machine Learning,"Boston, Massachusetts",2012,$338.2 million,3.7/5
2,Google,Best for AI Development,"Mountain View, California",1998,$305.6 billion,4.4/5
3,Hugging Face,Best for Deploying AI Models in the Cloud,"Brooklyn, New York",2016,$40 million,4.3/5
4,H2O.ai,Best for Time Series Forecasting,"Mountain View, California",2011,$69.2 million,3.1/5


In [4]:
df.shape

(100, 6)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Company Name     100 non-null    object
 1   Description      100 non-null    object
 2   Headquarters     100 non-null    object
 3   Founded          100 non-null    int64 
 4   Annual Revenue   100 non-null    object
 5   Glassdoor Score  98 non-null     object
dtypes: int64(1), object(5)
memory usage: 4.8+ KB


In [11]:
df['Company Age'] = 2025 - df['Founded']

In [12]:
df.head()

Unnamed: 0,Company Name,Description,Headquarters,Founded,Annual Revenue,Glassdoor Score,Company Age
0,Alibaba Cloud,Best for Big Data Analytics,"Hangzhou, China",2009,$479.5 million,3.7/5,16
1,DataRobot,Best for Automated Machine Learning,"Boston, Massachusetts",2012,$338.2 million,3.7/5,13
2,Google,Best for AI Development,"Mountain View, California",1998,$305.6 billion,4.4/5,27
3,Hugging Face,Best for Deploying AI Models in the Cloud,"Brooklyn, New York",2016,$40 million,4.3/5,9
4,H2O.ai,Best for Time Series Forecasting,"Mountain View, California",2011,$69.2 million,3.1/5,14


In [15]:
df['Glassdoor Score'] = df['Glassdoor Score'].str.replace('/5',"")

In [16]:
df.head()

Unnamed: 0,Company Name,Description,Headquarters,Founded,Annual Revenue,Glassdoor Score,Company Age
0,Alibaba Cloud,Best for Big Data Analytics,"Hangzhou, China",2009,$479.5 million,3.7,16
1,DataRobot,Best for Automated Machine Learning,"Boston, Massachusetts",2012,$338.2 million,3.7,13
2,Google,Best for AI Development,"Mountain View, California",1998,$305.6 billion,4.4,27
3,Hugging Face,Best for Deploying AI Models in the Cloud,"Brooklyn, New York",2016,$40 million,4.3,9
4,H2O.ai,Best for Time Series Forecasting,"Mountain View, California",2011,$69.2 million,3.1,14


In [17]:
df['Glassdoor Score'].unique()

array(['3.7', '4.4', '4.3', '3.1', '3.9', nan, '3.4', '3.8', '4.1', '3.6',
       '4.5', '3.3', '4.8', '3.2', '5-Apr', '4.6', '4.7', '3.5', '4.2',
       '4.9', '2.9'], dtype=object)

In [18]:
df['Glassdoor Score'] = df['Glassdoor Score'].replace('5-Apr',5)

In [19]:
df['Glassdoor Score'].unique()

array(['3.7', '4.4', '4.3', '3.1', '3.9', nan, '3.4', '3.8', '4.1', '3.6',
       '4.5', '3.3', '4.8', '3.2', 5, '4.6', '4.7', '3.5', '4.2', '4.9',
       '2.9'], dtype=object)

In [22]:
df = df.dropna().reset_index(drop=True)

In [23]:
df['Glassdoor Score'].unique()

array(['3.7', '4.4', '4.3', '3.1', '3.9', '3.4', '3.8', '4.1', '3.6',
       '4.5', '3.3', '4.8', '3.2', 5, '4.6', '4.7', '3.5', '4.2', '4.9',
       '2.9'], dtype=object)

In [24]:
df.shape

(98, 7)

In [25]:
# dealing with the Annual Revenue column

In [26]:
df.head()

Unnamed: 0,Company Name,Description,Headquarters,Founded,Annual Revenue,Glassdoor Score,Company Age
0,Alibaba Cloud,Best for Big Data Analytics,"Hangzhou, China",2009,$479.5 million,3.7,16
1,DataRobot,Best for Automated Machine Learning,"Boston, Massachusetts",2012,$338.2 million,3.7,13
2,Google,Best for AI Development,"Mountain View, California",1998,$305.6 billion,4.4,27
3,Hugging Face,Best for Deploying AI Models in the Cloud,"Brooklyn, New York",2016,$40 million,4.3,9
4,H2O.ai,Best for Time Series Forecasting,"Mountain View, California",2011,$69.2 million,3.1,14


In [27]:
df['Annual Revenue'] = df['Annual Revenue'].replace('$','')

In [34]:

pd.reset_option("display.max_rows")  # Reset to default



In [35]:
df

Unnamed: 0,Company Name,Description,Headquarters,Founded,Annual Revenue,Glassdoor Score,Company Age
0,Alibaba Cloud,Best for Big Data Analytics,"Hangzhou, China",2009,$479.5 million,3.7,16
1,DataRobot,Best for Automated Machine Learning,"Boston, Massachusetts",2012,$338.2 million,3.7,13
2,Google,Best for AI Development,"Mountain View, California",1998,$305.6 billion,4.4,27
3,Hugging Face,Best for Deploying AI Models in the Cloud,"Brooklyn, New York",2016,$40 million,4.3,9
4,H2O.ai,Best for Time Series Forecasting,"Mountain View, California",2011,$69.2 million,3.1,14
...,...,...,...,...,...,...,...
93,Nuro,Best for Driverless Operation,"Mountain View, California",2016,$438.8 million,3.8,9
94,Pony.ai,Best for Robotruck Development,"Fremont, California",2016,$100 million,3.4,9
95,Tesla,Best for Electric Vehicles,"Austin, Texas",2003,$96.77 billion,3.6,22
96,Waymo,Best for Experience and Testing,"Mountain View, California",2016,$1.4 billion,3.7,9
