# List Comparison
---
* Compares two arrays to find the most likely matches
* It then outputs the results into a dataframe

In [80]:
# import packages
import pandas as pd
import difflib as dl
import re

* Read in the data and create two dataframes

In [81]:
# workbook that holds both worksheets used in the comparison
data_file = '/Users/stephenminter/Desktop/FruitSampleData.xlsx'

# read the 'base' tab and then the 'comparison' tab into their own dataframes
df_base, df_comparison = (
    pd.read_excel(data_file, sheet_name=tab) for tab in ('base', 'comparison')
)

* Add a lower-case column to each dataframe; this makes the search more accurate.

In [82]:
# give each dataframe a lower-cased copy of its first column so the
# matching is not affected by differences in capitalisation
for frame in (df_base, df_comparison):
    frame['lower_case'] = frame.iloc[:, 0].str.lower()

* create a list from each lower case array, used in the for loop

In [83]:
# original-case base values, kept so the results can show them as entered
first_column = df_base.columns[0]
list_base = df_base[first_column].tolist()

# lower-cased base values, used as the search terms
list_base_lower = df_base['lower_case'].tolist()

# lower-cased comparison values with duplicates removed; dict keys keep
# first-seen order, so the list follows the order of the worksheet
list_comparison = list(dict.fromkeys(df_comparison['lower_case'].tolist()))
list_comparison

['apple',
 'orange',
 'pear',
 'banana',
 'starfruit',
 'plum',
 'strawberry',
 'raseberry',
 'apple man',
 'acces',
 'access',
 'accespa',
 'berry',
 'stephen',
 'steve']

* Loop over each lower-case base word and collect up to three close matches from the comparison list

In [84]:
# for every lower-cased base word, collect up to 3 close matches
# from the comparison list
list_accumulation = [
    dl.get_close_matches(base_word, list_comparison, 3)
    for base_word in list_base_lower
]

# pair each original base value (column 0) with its list of matches (column 1)
df = pd.DataFrame(list(zip(list_base, list_accumulation)))
df

Unnamed: 0,0,1
0,Apple,"[apple, apple man]"
1,Orange,[orange]
2,Pear,[pear]
3,Banana,[banana]
4,StarFruit,[starfruit]
5,Plum,[plum]
6,StrawBerry,"[strawberry, raseberry, berry]"
7,RaseBerry,"[raseberry, strawberry, berry]"
8,AccesPayBkaBla,[accespa]
9,stephen minter,[stephen]


In [85]:
# names of the match columns; their count is the number of matches kept per word
match_columns = ['match_01', 'match_02', 'match_03']

# split each list of matches into one column per match. The number of columns
# pandas creates equals the longest list in the data, so reindex to a fixed
# width: without it, naming the columns fails with a length mismatch whenever
# no base word has three matches.
split_df = pd.DataFrame(df[1].to_list()).reindex(columns=range(len(match_columns)))
split_df.columns = match_columns

# put the original base value next to its matches
df_output = pd.concat([df[0].rename('base'), split_df], axis=1)
df_output

Unnamed: 0,base,match_01,match_02,match_03
0,Apple,apple,apple man,
1,Orange,orange,,
2,Pear,pear,,
3,Banana,banana,,
4,StarFruit,starfruit,,
5,Plum,plum,,
6,StrawBerry,strawberry,raseberry,berry
7,RaseBerry,raseberry,strawberry,berry
8,AccesPayBkaBla,accespa,,
9,stephen minter,stephen,,


* create a function to apply sequence matcher over two columns

In [86]:
def apply_sm(s, c1, c2):
    """Return the difflib similarity ratio between two fields of a row.

    Parameters
    ----------
    s : mapping-like row (e.g. a pandas Series) indexable by ``c1`` and ``c2``
    c1, c2 : keys of the two values to compare

    Returns
    -------
    float
        ``SequenceMatcher.ratio()`` of ``s[c1]`` against ``s[c2]``,
        between 0.0 and 1.0.
    """
    left, right = s[c1], s[c2]
    matcher = dl.SequenceMatcher(a=left, b=right)
    return matcher.ratio()

* need to compare against a lowercase base

In [87]:
# Build the working frame in one non-mutating chain: add a lower-cased copy of
# 'base' to score against, fix the column order, and replace missing matches
# with empty strings. Reassigning the chained result, instead of calling
# fillna(inplace=True) on a column selection, leaves df_output as an
# independent frame, so adding columns to it afterwards does not trigger
# SettingWithCopyWarning.
df_output = (
    df_output
    .assign(base_lower=df_output['base'].str.lower())
    .loc[:, ['base', 'base_lower', 'match_01', 'match_02', 'match_03']]
    .fillna("")
)

* Add a similarity score (0 to 1) for each match column

In [88]:
# score every match column against the lower-cased base value, rounded to 3 dp
for match_col in ('match_01', 'match_02', 'match_03'):
    df_output[f'{match_col}_perc'] = df_output.apply(
        apply_sm, axis=1, c1='base_lower', c2=match_col
    ).round(3)

# the helper column was only needed for scoring
df_output = df_output.drop(columns='base_lower')
df_output

Unnamed: 0,base,match_01,match_02,match_03,match_01_perc,match_02_perc,match_03_perc
0,Apple,apple,apple man,,1.0,0.714,0.0
1,Orange,orange,,,1.0,0.0,0.0
2,Pear,pear,,,1.0,0.0,0.0
3,Banana,banana,,,1.0,0.0,0.0
4,StarFruit,starfruit,,,1.0,0.0,0.0
5,Plum,plum,,,1.0,0.0,0.0
6,StrawBerry,strawberry,raseberry,berry,1.0,0.737,0.667
7,RaseBerry,raseberry,strawberry,berry,1.0,0.737,0.714
8,AccesPayBkaBla,accespa,,,0.667,0.0,0.0
9,stephen minter,stephen,,,0.667,0.0,0.0


Function using re to find a word in a string

In [89]:
def word_list_checker(word, string_search):
    """Find the first case-insensitive occurrence of ``word`` in ``string_search``.

    Parameters
    ----------
    word : str
        Pattern to look for. It is passed to ``re.search`` unchanged, so
        regular-expression metacharacters keep their special meaning.
    string_search : str
        Text to search.

    Returns
    -------
    str or None
        The matched text exactly as it appears in ``string_search``, or
        ``None`` when there is no match.

    Raises
    ------
    re.error
        If ``word`` is not a valid regular expression.
    TypeError
        If either argument is not a string.
    """
    # re.search returns None on no match; check that explicitly instead of
    # calling .group() on it and swallowing the resulting error
    match = re.search(word, string_search, re.IGNORECASE)
    if match is None:
        return None
    return match.group()

In [90]:
# workbook holding the free-text sample strings
data_file = '/Users/stephenminter/Desktop/strings.xlsx'

# load the 'strings' worksheet into a dataframe and display it
df_string = pd.read_excel(io=data_file, sheet_name='strings')
df_string

Unnamed: 0,Samples
0,this is licence year
1,license fee maintainence
2,fee hello may name is maintain
3,notthin in this one
4,this is a 5 year license


lambda to turn each row string into a list

In [91]:
def string_splitter(x):
    """Split a string on single spaces and return the pieces as a list."""
    return x.split(' ')


# store each sample string's list of words in a new column
df_string['string_split'] = df_string['Samples'].apply(string_splitter)
df_string

Unnamed: 0,Samples,string_split
0,this is licence year,"[this, is, licence, year]"
1,license fee maintainence,"[license, fee, maintainence]"
2,fee hello may name is maintain,"[fee, hello, may, name, is, maintain]"
3,notthin in this one,"[notthin, in, this, one]"
4,this is a 5 year license,"[this, is, a, 5, year, license]"


In [109]:
# words to look for in the sample strings
search_words = [
    'license',
    'fee',
    'maintainence',
]

In [111]:
# plain Python list of the per-row word lists
df_ls = list(df_string['string_split'])
df_ls

[['this', 'is', 'licence', 'year'],
 ['license', 'fee', 'maintainence'],
 ['fee', 'hello', 'may', 'name', 'is', 'maintain'],
 ['notthin', 'in', 'this', 'one'],
 ['this', 'is', 'a', '5', 'year', 'license']]

need to ignore case

In [132]:
# get_close_matches compares strings case-sensitively, so lower-case both the
# search words and each candidate word before matching (the same approach the
# fruit comparison uses with its lower_case columns)
search_words_lower = [search_word.lower() for search_word in search_words]

# up to 3 close search words for every word in the second sample row
ls = [
    dl.get_close_matches(word.lower(), search_words_lower, 3)
    for word in df_ls[1]
]
ls

[['license'], ['fee'], ['maintainence']]