In [1]:
########################### Basic function ###########################

# All 118 chemical element symbols. Every symbol is one or two letters long;
# chem() relies on that and never tries a longer prefix.
Symbols = [ 'Ac', 'Ag', 'Al', 'Am', 'Ar', 'As', 'At', 'Au', 'B', 'Ba', 'Be', 'Bh', 'Bi', 'Bk', 'Br',
           'C', 'Ca', 'Cd', 'Ce', 'Cf', 'Cl', 'Cm', 'Cn', 'Co', 'Cr', 'Cs', 'Cu', 'Db', 'Ds', 'Dy',
           'Er', 'Es', 'Eu', 'F', 'Fe', 'Fl', 'Fm', 'Fr', 'Ga', 'Gd', 'Ge', 'H', 'He', 'Hf', 'Hg',
           'Ho', 'Hs', 'I', 'In', 'Ir', 'K', 'Kr', 'La', 'Li', 'Lr', 'Lu', 'Lv', 'Mc', 'Md', 'Mg',
           'Mn', 'Mo', 'Mt', 'N', 'Na', 'Nb', 'Nd', 'Ne', 'Nh', 'Ni', 'No', 'Np', 'O', 'Og', 'Os',
           'P', 'Pa', 'Pb', 'Pd', 'Pm', 'Po', 'Pr', 'Pt', 'Pu', 'Ra', 'Rb', 'Re', 'Rf', 'Rg', 'Rh',
           'Rn', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Sg', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te',
           'Th', 'Ti', 'Tl', 'Tm', 'Ts', 'U', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr' ]


def chem(w):
    ''' Spell a word as a sequence of chemical element symbols.

    Input: a string. Matching is case-insensitive: every one- or two-letter
        chunk is capitalised before it is looked up in Symbols.
    Output: a list of all possible ways to write w using the symbols, each as
        one string (e.g. 'HoPKInS'). At every position the spellings that
        continue with a one-letter symbol are listed before those that
        continue with a two-letter symbol. The list is empty when w is empty
        or cannot be spelled at all.
    '''

    n = len(w)
    if n == 0:
        return []

    # Set lookup is O(1); built per call so later edits to Symbols are honoured.
    symbols = frozenset(Symbols)

    # spellings[i] holds every spelling of the suffix w[i:]. The table is filled
    # from the end of the word backwards so each suffix is solved exactly once,
    # instead of being recomputed by both recursive branches.
    # The empty suffix has exactly one spelling: the empty string.
    spellings = [[] for _ in range(n)]
    spellings.append([''])

    for i in range(n - 1, -1, -1):
        found = []

        one = w[i].capitalize()
        if one in symbols:
            found.extend(one + rest for rest in spellings[i + 1])

        # A two-letter symbol needs two characters left in the word.
        if i + 2 <= n:
            two = w[i:i + 2].capitalize()
            if two in symbols:
                found.extend(two + rest for rest in spellings[i + 2])

        spellings[i] = found

    return spellings[0]


chem("Hopkins")

['HOPKINS', 'HOPKInS', 'HoPKINS', 'HoPKInS']

In [2]:
########################### Reading from "english_surnames.txt"  ###########################

import pandas as pd

# Only file columns 1 and 2 are read; they are named "Surname" and "Count".
# dtype=str keeps both columns as text, so the count clean-up below never depends
# on pandas' type inference. keep_default_na=False stops pandas from turning
# surnames that look like missing-value markers (e.g. "Null", "Nan", "NA") into
# float NaN, which chem() could not process.
data = pd.read_csv(
    'english_surnames.txt',
    sep='\t',
    usecols=[1, 2],
    names=['Surname', 'Count'],
    dtype=str,
    keep_default_na=False,
)


# The "Count" column consists of strings of digits with spaces.
# Strip the spaces (literal match, not a regex) and convert to integers.

data['Count'] = data['Count'].str.replace(' ', '', regex=False).astype('int64')

data

Unnamed: 0,Surname,Count
0,Smith,729862
1,Jones,578261
2,Taylor,458268
3,Williams,411385
4,Brown,380443
...,...,...
8455,Longden,1509
8456,Fayers,1509
8457,Barren,1509
8458,Oddy,1508


In [4]:
# Number of people covered by the table, i.e. the sum of all surname counts.
# For scale: the population of the UK was 66.65 million in 2019.

data.loc[:, "Count"].sum()

55768712

In [5]:
# Attach two derived columns to the table:
#   "Chem-words"  - every spelling of the surname as element symbols, as returned by chem()
#   "#Chem-words" - the number of spellings in "Chem-words"

data["Chem-words"] = data["Surname"].map(chem)
data["#Chem-words"] = data["Chem-words"].map(len)

data

Unnamed: 0,Surname,Count,Chem-words,#Chem-words
0,Smith,729862,[SmITh],1
1,Jones,578261,[],0
2,Taylor,458268,[],0
3,Williams,411385,[],0
4,Brown,380443,[BrOWN],1
...,...,...,...,...
8455,Longden,1509,[],0
8456,Fayers,1509,[],0
8457,Barren,1509,[BArReN],1
8458,Oddy,1508,[],0


In [6]:
# Fraction of the listed surnames that have at least one chem-word spelling:

len(data.loc[data["#Chem-words"].gt(0)]) / len(data)

0.18356973995271866

In [7]:
# Fraction of the counted people whose surname has at least one chem-word spelling:

data.loc[data["#Chem-words"].gt(0), "Count"].sum() / data["Count"].sum()

0.20208130322249507

In [8]:
# Surnames with the largest number of chem-word spellings:

nr_max = int(data["#Chem-words"].max())    # plain Python int, as builtin max() over the column gives
data.loc[data["#Chem-words"].eq(nr_max)]



Unnamed: 0,Surname,Count,Chem-words,#Chem-words
882,Cousins,11216,"[COUSINS, COUSInS, COUSiNS, CoUSINS, CoUSInS, ...",6
2153,Cousin,4780,"[COUSIN, COUSIn, COUSiN, CoUSIN, CoUSIn, CoUSiN]",6
2417,Hoskins,4611,"[HOSKINS, HOSKInS, HOsKINS, HOsKInS, HoSKINS, ...",6
6119,Hoksins,1604,"[HOKSINS, HOKSInS, HOKSiNS, HoKSINS, HoKSInS, ...",6
6434,Sinkins,1591,"[SINKINS, SINKInS, SInKINS, SInKInS, SiNKINS, ...",6
7090,Hobin,1562,"[HOBIN, HOBIn, HOBiN, HoBIN, HoBIn, HoBiN]",6
8057,Cubbin,1530,"[CUBBIN, CUBBIn, CUBBiN, CuBBIN, CuBBIn, CuBBiN]",6
8167,Hoskin,1527,"[HOSKIN, HOSKIn, HOsKIN, HOsKIn, HoSKIN, HoSKIn]",6
8173,Hobbin,1526,"[HOBBIN, HOBBIn, HOBBiN, HoBBIN, HoBBIn, HoBBiN]",6
8394,Binnion,1516,"[BINNION, BINNiON, BInNION, BInNiON, BiNNION, ...",6
