In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal

In [2]:
# Load the cleaned data frame saved by step 2 (presumably the preceding cleaning notebook)
cleaned_csv_path = 'fortune_2000_in_2021_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
df

Unnamed: 0,Rank,Name,Country,Sales,Profit,Assets,Market Value
0,1,ICBC,China,$190.5 B,$45.8 B,"$4,914.7 B",$249.5 B
1,2,JPMorgan Chase,United States,$136.2 B,$40.4 B,"$3,689.3 B",$464.8 B
2,3,Berkshire Hathaway,United States,$245.5 B,$42.5 B,$873.7 B,$624.4 B
3,4,China Construction Bank,China,$173.5 B,$39.3 B,"$4,301.7 B",$210.4 B
4,5,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,$229.7 B,$49.3 B,$510.3 B,"$1,897.2 B"
...,...,...,...,...,...,...,...
1995,1996,JTEKT,Japan,$13.1 B,$-34.9 M,$12 B,$3.5 B
1996,1997,Insurance Australia Group,Australia,$5.4 B,$-212 M,$25 B,$9.3 B
1997,1997,Sumec Corporation,China,$13 B,$79.2 M,$7.1 B,$1.3 B
1998,1999,Gold Fields,South Africa,$3.9 B,$718.1 M,$7.5 B,$8.9 B


# Create function to convert numerical string to number 
### The function takes a string in millions or billions and converts it to a number in millions

In [3]:
# Takes a string, removes dollar sign and commas, and converts it to a Decimal in millions
def return_total_in_millions(total):
    """Convert a dollar amount such as '$4,914.7 B' into a Decimal in millions.

    Parameters
    ----------
    total : str
        An amount followed by whitespace and a unit letter: 'M' (millions)
        or 'B' (billions). '$' and ',' characters in the amount are dropped,
        so '-$92.4 B', '$-34.9 M' and '-182 M' are all accepted. The unit is
        matched case-insensitively and surrounding/extra whitespace is
        tolerated.

    Returns
    -------
    decimal.Decimal
        The amount expressed in millions ('B' amounts are multiplied by 1000,
        'M' amounts are returned unchanged).

    Raises
    ------
    ValueError
        If the unit is missing or is anything other than 'M' or 'B'.
    decimal.InvalidOperation
        If the amount part is not a valid number.
    """
    # split() with no argument splits on any run of whitespace, so a double
    # space or a trailing newline cannot produce an empty/garbled unit token
    tokens = total.split()
    if len(tokens) < 2:
        raise ValueError(
            "expected '<amount> <M|B>', got {!r}".format(total)
        )

    # remove dollar sign and any commas from the amount and convert to Decimal
    amount = Decimal(tokens[0].replace('$', '').replace(',', ''))

    # get 'M' or 'B' from list, ignoring case
    unit = tokens[1].upper()

    # If B - multiply by 1000 to express billions in millions
    if unit == 'B':
        return amount * 1000
    if unit == 'M':
        return amount

    # Anything else used to be silently treated as millions, which would
    # corrupt the derived columns; fail loudly instead
    raise ValueError(
        "unknown unit {!r} in {!r}; expected 'M' or 'B'".format(tokens[1], total)
    )

In [4]:
# Sanity check: a comma-separated billions figure should come back in millions
sample_total = '$4,914.7 B'
return_total_in_millions(sample_total)

Decimal('4914700.0')

- Here we'll create a test dataframe and test creating a new column based on an existing column with the 'return_total_in_millions' function.

In [5]:
# Small fixture: billions, millions, and negative amounts with and without a dollar sign
df_test = pd.DataFrame(
    ["$26.2 B", "$32.5 M", "-$92.4 B", "-182 M"],
    columns=["Total"],
)

In [6]:
# Preview the test frame before the conversion is applied
df_test

Unnamed: 0,Total
0,$26.2 B
1,$32.5 M
2,-$92.4 B
3,-182 M


In [7]:
# Derive the converted column element-by-element from the raw 'Total' strings
df_test['Total (in millions)'] = df_test['Total'].map(return_total_in_millions)

In [8]:
# Show the test frame again, now including 'Total (in millions)'
df_test

Unnamed: 0,Total,Total (in millions)
0,$26.2 B,26200.0
1,$32.5 M,32.5
2,-$92.4 B,-92400.0
3,-182 M,-182.0


# Create new columns - Sales, Profit, Assets and Market Value
- Create a new "(in millions)" column from each of the existing columns:
- Sales, Profit, Assets, Market Value

In [9]:
# Add a '<column> (in millions)' column for every monetary column.
# One loop replaces four copy-pasted cells so the columns cannot drift apart.
# return_total_in_millions yields Decimal objects; .astype(float) stores them
# as a numeric float64 column instead of an object column, so values display
# and serialise uniformly and numeric operations work on the result.
for money_column in ['Sales', 'Profit', 'Assets', 'Market Value']:
    df[money_column + ' (in millions)'] = (
        df[money_column].apply(return_total_in_millions).astype(float)
    )

In [13]:
# Inspect the frame; the four '(in millions)' columns should now be present
df

Unnamed: 0,Rank,Name,Country,Sales,Profit,Assets,Market Value,Sales (in millions),Profit (in millions),Assets (in millions),Market Value (in millions)
0,1,ICBC,China,$190.5 B,$45.8 B,"$4,914.7 B",$249.5 B,190500.0,45800.0,4914700.0,249500.0
1,2,JPMorgan Chase,United States,$136.2 B,$40.4 B,"$3,689.3 B",$464.8 B,136200.0,40400.0,3689300.0,464800.0
2,3,Berkshire Hathaway,United States,$245.5 B,$42.5 B,$873.7 B,$624.4 B,245500.0,42500.0,873700.0,624400.0
3,4,China Construction Bank,China,$173.5 B,$39.3 B,"$4,301.7 B",$210.4 B,173500.0,39300.0,4301700.0,210400.0
4,5,Saudi Arabian Oil Company (Saudi Aramco),Saudi Arabia,$229.7 B,$49.3 B,$510.3 B,"$1,897.2 B",229700.0,49300.0,510300.0,1897200.0
...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,JTEKT,Japan,$13.1 B,$-34.9 M,$12 B,$3.5 B,13100.0,-34.9,12000,3500.0
1996,1997,Insurance Australia Group,Australia,$5.4 B,$-212 M,$25 B,$9.3 B,5400.0,-212,25000,9300.0
1997,1997,Sumec Corporation,China,$13 B,$79.2 M,$7.1 B,$1.3 B,13000,79.2,7100.0,1300.0
1998,1999,Gold Fields,South Africa,$3.9 B,$718.1 M,$7.5 B,$8.9 B,3900.0,718.1,7500.0,8900.0


# Randomly test rows from the data frame

In [14]:
# Spot-check 10 randomly chosen rows: compare 'Sales' to 'Sales (in millions)'
# (and likewise for the other monetary columns).
# DataFrame.sample draws distinct rows from however many rows df has, so there
# is no hard-coded row count and no repeated rows in the check.
# Pass random_state=<int> to make the selection repeatable.
df.sample(n=10)

Unnamed: 0,Rank,Name,Country,Sales,Profit,Assets,Market Value,Sales (in millions),Profit (in millions),Assets (in millions),Market Value (in millions)
1054,1054,Leidos,United States,$12.3 B,$628 M,$12.5 B,$14.4 B,12300.0,628.0,12500.0,14400.0
727,728,Samsung Fire & Marine,South Korea,$18 B,$639.9 M,$84.4 B,$7.3 B,18000.0,639.9,84400.0,7300.0
1770,1770,Sartorius,Germany,$2.7 B,$258.5 M,$5.8 B,$38.5 B,2700.0,258.5,5800.0,38500.0
1423,1424,SinoPac Financial,Taiwan,$1.8 B,$415.7 M,$76.3 B,$5.2 B,1800.0,415.7,76300.0,5200.0
1821,1822,JD Sports Fashion,United Kingdom,$7.9 B,$288.5 M,$7.1 B,$13.3 B,7900.0,288.5,7100.0,13300.0
1175,1176,Schroders,United States,$3.3 B,$610.8 M,$29.6 B,$13.7 B,3300.0,610.8,29600.0,13700.0
1534,1535,Jointown Pharmaceutical Group,China,$15.2 B,$406.6 M,$11.6 B,$5 B,15200.0,406.6,11600.0,5000.0
261,262,Tesla,United States,$31.5 B,$690 M,$52.1 B,$710.1 B,31500.0,690.0,52100.0,710100.0
157,158,L'Oréal,France,$31.9 B,$4.1 B,$53.4 B,$225.7 B,31900.0,4100.0,53400.0,225700.0
1822,1823,A2A,Italy,$7.6 B,$415 M,$15 B,$5.8 B,7600.0,415.0,15000.0,5800.0


# Save the dataframe

In [15]:
# Persist the transformed frame; index=False leaves the row index out of the file
transformed_csv_path = 'fortune_2000_in_2021_cleaned_transformed.csv'
df.to_csv(transformed_csv_path, index=False)