In [1]:
import pandas as pd
import numpy as np
from faker import Faker

In [2]:
# Build a synthetic "companies" table: Faker supplies company names and URLs,
# NumPy supplies the categorical, numeric and boolean draws.

# Fixed seed so that Restart Kernel -> Run All regenerates exactly the same rows.
SEED = 42
Faker.seed(SEED)                   # seeds the random source shared by Faker generators
rng = np.random.default_rng(SEED)  # dedicated, seeded NumPy generator for this notebook

# Initialize Faker to generate fake data
fake = Faker()

# Define the number of rows for the CSV
num_rows = 150

# List to hold the generated data (one inner list per row, in `columns` order)
data = []

# Possible values for categorical columns
company_types = ['Technology', 'Finance', 'Healthcare', 'Retail', 'Manufacturing', 'Logistics', 'Energy']
booleans = [True, False]

# Generate the data for each row
for i in range(num_rows):
    company_id = i + 1  # 1-based sequential identifier
    company_name = fake.company()
    company_type = rng.choice(company_types)
    employee_count = rng.integers(50, 5000)  # upper bound is exclusive: values fall in [50, 4999]
    url = fake.url()
    is_public = rng.choice(booleans)
    is_profitable = rng.choice(booleans)

    data.append([
        company_id,
        company_name,
        company_type,
        employee_count,
        url,
        is_public,
        is_profitable
    ])

# Create a DataFrame from the generated data
columns = [
    'company id',
    'Company',
    'Type of company',
    'employee count',
    'url',
    'is_public',
    'is_profitable'
]
df = pd.DataFrame(data, columns=columns)

# Export the DataFrame to a CSV file (disabled; uncomment to write the file)
#df.to_csv('sample_companies.csv', index=False)

#print(f"Successfully generated 'sample_companies.csv' with {num_rows} rows.")

In [3]:
# Preview the top of the frame; the bare last expression is rendered by the notebook.
print("The first 5 rows of the DataFrame:")
df.head(5)

The first 5 rows of the DataFrame:


Unnamed: 0,company id,Company,Type of company,employee count,url,is_public,is_profitable
0,1,"Anderson, Roberts and Brown",Technology,3810,https://www.rivera.com/,False,True
1,2,"Patterson, Valencia and Cole",Energy,1264,https://www.benitez-smith.com/,True,False
2,3,Gallagher-Campbell,Finance,799,https://www.washington.com/,True,False
3,4,"Morgan, Bartlett and Thomas",Retail,4581,https://www.mccall.com/,False,True
4,5,Nichols-Thomas,Healthcare,4262,https://long.com/,True,False


In [4]:
# Show the last five rows of the DataFrame.
df.tail(5)

Unnamed: 0,company id,Company,Type of company,employee count,url,is_public,is_profitable
145,146,"Elliott, Caldwell and Johnson",Technology,430,https://www.williams-merritt.info/,True,False
146,147,Fritz PLC,Energy,4516,http://www.jenkins.info/,True,True
147,148,Rodriguez Inc,Manufacturing,4771,http://www.stewart.com/,False,False
148,149,"Lynch, Anderson and Walker",Finance,3302,https://www.williams-jenkins.com/,False,True
149,150,"Sanchez, Miller and Baldwin",Healthcare,3309,https://www.gilbert.net/,True,True


In [5]:
# Summary statistics, with the default quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,company id,employee count
count,150.0,150.0
mean,75.5,2625.893333
std,43.445368,1352.349004
min,1.0,53.0
25%,38.25,1419.0
50%,75.5,2870.0
75%,112.75,3675.25
max,150.0,4881.0


In [6]:
# Print the (rows, columns) shape, then each column's dtype on the following lines.
print(df.shape, df.dtypes, sep="\n")

(150, 7)
company id          int64
Company            object
Type of company    object
employee count      int64
url                object
is_public            bool
is_profitable        bool
dtype: object


In [7]:
# How many companies fall into each company type, most frequent first.
company_type_column = df['Type of company']
company_type_column.value_counts()

Type of company
Energy           41
Manufacturing    22
Healthcare       20
Finance          18
Retail           18
Logistics        16
Technology       15
Name: count, dtype: int64

In [8]:
# Number of distinct (non-null) values in each column.
df.nunique(axis=0, dropna=True)

company id         150
Company            149
Type of company      7
employee count     149
url                148
is_public            2
is_profitable        2
dtype: int64

In [9]:
# Count missing values per column.
# If any were found, df.dropna() or df.fillna(<value>) would be the usual remedies.
df.isna().sum()

company id         0
Company            0
Type of company    0
employee count     0
url                0
is_public          0
is_profitable      0
dtype: int64

In [10]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
# df.drop_duplicates() would remove them if this were non-zero.
df.duplicated(keep='first').sum()

np.int64(0)

In [11]:
# Average employee count for each type of company
# (groups come back sorted by key, which is the groupby default).
df.groupby('Type of company')['employee count'].mean()

Type of company
Energy           2602.780488
Finance          2484.444444
Healthcare       2937.700000
Logistics        2261.875000
Manufacturing    2570.772727
Retail           3034.444444
Technology       2421.933333
Name: employee count, dtype: float64

In [12]:
# Rows for companies with more than 4900 employees.
df.loc[df['employee count'].gt(4900)]

Unnamed: 0,company id,Company,Type of company,employee count,url,is_public,is_profitable


In [13]:
# Add a human-readable Yes/No version of the boolean `is_profitable` column.
yes_no_labels = {True: 'Yes', False: 'No'}
df['is_profitable_string'] = df['is_profitable'].replace(yes_no_labels)
df['is_profitable_string']

0      Yes
1       No
2       No
3      Yes
4       No
      ... 
145     No
146    Yes
147     No
148    Yes
149    Yes
Name: is_profitable_string, Length: 150, dtype: object

In [14]:
# Per-column truthiness check: True when the column contains at least one truthy
# value (non-zero / non-empty / True); False when every value is zero, empty or False.
df.any(axis=0)

company id              True
Company                 True
Type of company         True
employee count          True
url                     True
is_public               True
is_profitable           True
is_profitable_string    True
dtype: bool

In [15]:
# Print a concise summary of the frame: index range, per-column non-null counts
# and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   company id            150 non-null    int64 
 1   Company               150 non-null    object
 2   Type of company       150 non-null    object
 3   employee count        150 non-null    int64 
 4   url                   150 non-null    object
 5   is_public             150 non-null    bool  
 6   is_profitable         150 non-null    bool  
 7   is_profitable_string  150 non-null    object
dtypes: bool(2), int64(2), object(4)
memory usage: 7.5+ KB


In [16]:
# Largest and smallest employee count among Technology companies.
is_technology = df["Type of company"] == 'Technology'
technology_headcount = df.loc[is_technology, "employee count"]
technology_headcount.agg(("max", "min"))

max    4865
min     152
Name: employee count, dtype: int64

In [17]:
# The frame's two axes as a list: row index first, then column labels.
[df.index, df.columns]

[RangeIndex(start=0, stop=150, step=1),
 Index(['company id', 'Company', 'Type of company', 'employee count', 'url',
        'is_public', 'is_profitable', 'is_profitable_string'],
       dtype='object')]