This notebook extracts all spelling and puncutation features from the set of queries found in SQS.

# Load Libraries

The following block of code loads all libraries needed for this notebook.

In [1]:
import csv
import pickle
import string
import textstat

import pandas as pd
import numpy as np

from langdetect import detect
from spellchecker import SpellChecker
from tqdm import tqdm

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [2]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
# allQueries = allSessions['query'].tolist() + allSessionsSQS['query'].tolist()
# allQueries = set(allQueries)

# allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
# allQueries = allSessionsSQS['query'].tolist()
# allQueries = np.array(allQueries)
# allQueries

allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
allSessionsSQS = allSessionsSQS['query'].drop_duplicates()
allQueries = allSessionsSQS.tolist()
setQueries = allQueries


In [3]:
len(allQueries)

1505

# Generate Misspelled List

Generates a list of commonly misspelled words by children from the KidSpell data set which is later used.

In [4]:
# kidsMispelled = []

# count = 0

# with open('KidSpell/Web_Search_Lab_Errors.csv', newline='') as csvfile:
#     spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
#     for row in spamreader:
#         if count == 0:
#             count += 1
#         else:
#             kidsMispelled.append(row[0])

# count = 0
# with open('KidSpell/Web_Search_Informal_Errors.csv', newline='') as csvfile:
#     spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
#     for row in spamreader:
#         if count == 0:
#             count += 1
#         else:
#             kidsMispelled.append(row[0])
    
# count = 0
# with open('KidSpell/Essay_Writing_Errors.csv', newline='') as csvfile:
#     spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
#     for row in spamreader:
#         if count == 0:
#             count += 1
#         else:
#             kidsMispelled.append(row[1])

# kidsMispelled = set(kidsMispelled)

In [5]:
kidsMispelled = []

count = 0

with open('KidSpell/Web_Search_Lab_Errors.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if count == 0:
            count += 1
        else:
            kidsMispelled.append(row[0])

count = 0
with open('KidSpell/Web_Search_Informal_Errors.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if count == 0:
            count += 1
        else:
            kidsMispelled.append(row[0])
    
count = 0
with open('KidSpell/Essay_Writing_Errors.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if count == 0:
            count += 1
        else:
            kidsMispelled.append(row[1])

kidsMispelled = set(kidsMispelled)

In [6]:
len(kidsMispelled)

1134

# Extract Spelling Features

The following block of code extracts features related to spelling errors and stores them in a dataframe. 

In [7]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                found = 0
                oneOff = 0
                kidsMis = 0
                for word in misspelled:
                    misspelledWords += ", " + word # -- concatinating the misspelled words
                    if word in kidsMispelled:
                        kidsMis +=1
                    candid = spell.candidates(word) # candidates() displays the set of words that are close to the word entered
#                     print(candid)
#                     print("-------start edits-----------------")
                    edits = spell.edit_distance_1(word)
#                     print(edits)
#                     print("-----------end edits-------------")
                    
                    for can in candid:
                        if can in edits:
                            oneOff += 1
                        break
#                     print(word +' Out')
                oneOffError.append(oneOff)
                spellingError.append(len(misspelled))
                kidsError.append(kidsMis)
#                 misspelledCol.append(misspelledWords)
                
            except:
#                 print(misspelled)
                oneOffError.append(-1)
                spellingError.append(-1)
                kidsError.append(-1)
#                 misspelledCol.append(misspelledWords)

        else:
            spellingError.append(0)
            oneOffError.append(0)
            kidsError.append(0)
#             misspelledCol.append("none")

        pbar.update()
    
spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
spelling['query'] = allQueries
spelling['offByOne'] = oneOffError
spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


100%|██████████| 1505/1505 [01:29<00:00, 16.88it/s]


In [8]:
# # ---- CHECKING ----

# spell = SpellChecker()

# spellingError = []
# oneOffError = []
# kidsError = []
# misspelledCol = []

# netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

# with tqdm(total = len(allQueries) ) as pbar:
#     for query in allQueries:
#         query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
#         website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
#         if not website:
#             misspelledWords = "";
#             try:
#                 lang = detect(query) # -- detects a language 
#                 misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
#                 found = 0
#                 oneOff = 0
#                 kidsMis = 0
#                 for word in misspelled:
#                     misspelledWords += " " + word # -- concatinating the misspelled words
#                     if word in kidsMispelled:
#                         kidsMis +=1
#                     candid = spell.candidates(word) # --- candidates() displays the set of words that are close to the word entered e.g: 'country', 'count', 'counter' if country is entered
#                     edits = spell.edit_distance_1(word) # -- Compute all strings that are one edit away from `word` 
                    
#                     for can in candid:
#                         if can in edits:
#                             oneOff += 1 # -- one letter off 
#                         break

#                 oneOffError.append(oneOff)
#                 spellingError.append(len(misspelled))
#                 kidsError.append(kidsMis)
#                 misspelledCol.append(misspelledWords)
                
#             except:
# #                 print(misspelled)
#                 oneOffError.append(-1)
#                 spellingError.append(-1)
#                 kidsError.append(-1)
#                 misspelledCol.append(misspelledWords)

#         else:
#             spellingError.append(0)
#             oneOffError.append(0)
#             kidsError.append(0)
#             misspelledCol.append("none")

#         pbar.update()
    
# spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
# spelling['query'] = allQueries
# spelling['offByOne'] = oneOffError
# spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


In [37]:
jj= 'ths is intentioal'

print(detect(jj))

kk=spell.unknown(jj.split(" "))
kk

en


{'intentioal', 'ths'}

In [68]:
candid = spell.candidates('ths') # candidates() displays the set of words that are close to the word entered
#                     print(candid)
#                     print("-------start edits-----------------")
edits = spell.edit_distance_1('ths')
#                     print(edits)
#                     print("-----------end edits-------------")
oneOff=0              
for can in candid:
    if can in edits:
        oneOff += 1
    break
oneOff

1

In [67]:
spell.candidates('ths')

{'tas',
 'tes',
 'tha',
 'thas',
 'the',
 'thes',
 'thi',
 'this',
 'tho',
 'thos',
 'thu',
 'thus',
 'thy',
 'tis',
 'tus'}

In [76]:
for j in range(5):
    print(j)
    if j ==0:
        print(j)
    break

0
0


In [72]:
spell.edit_distance_1('ths')

{"'hs",
 "'ths",
 'ahs',
 'aths',
 'bhs',
 'bths',
 'chs',
 'cths',
 'dhs',
 'dths',
 'ehs',
 'eths',
 'fhs',
 'fths',
 'ghs',
 'gths',
 'hhs',
 'hs',
 'hths',
 'hts',
 'ihs',
 'iths',
 'jhs',
 'jths',
 'khs',
 'kths',
 'lhs',
 'lths',
 'mhs',
 'mths',
 'nhs',
 'nths',
 'ohs',
 'oths',
 'phs',
 'pths',
 'qhs',
 'qths',
 'rhs',
 'rths',
 'shs',
 'sths',
 "t'hs",
 "t's",
 'tahs',
 'tas',
 'tbhs',
 'tbs',
 'tchs',
 'tcs',
 'tdhs',
 'tds',
 'tehs',
 'tes',
 'tfhs',
 'tfs',
 'tghs',
 'tgs',
 'th',
 "th'",
 "th's",
 'tha',
 'thas',
 'thb',
 'thbs',
 'thc',
 'thcs',
 'thd',
 'thds',
 'the',
 'thes',
 'thf',
 'thfs',
 'thg',
 'thgs',
 'thh',
 'thhs',
 'thi',
 'this',
 'thj',
 'thjs',
 'thk',
 'thks',
 'thl',
 'thls',
 'thm',
 'thms',
 'thn',
 'thns',
 'tho',
 'thos',
 'thp',
 'thps',
 'thq',
 'thqs',
 'thr',
 'thrs',
 'ths',
 "ths'",
 'thsa',
 'thsb',
 'thsc',
 'thsd',
 'thse',
 'thsf',
 'thsg',
 'thsh',
 'thsi',
 'thsj',
 'thsk',
 'thsl',
 'thsm',
 'thsn',
 'thso',
 'thsp',
 'thsq',
 'thsr',


In [74]:
'ths' in spell.edit_distance_1('ths')

True

In [10]:
for j in kk:
    print(spell.candidates(j))
#     print(spell.edit_distance_1(j))
    pass

{'thy', 'tus', 'thi', 'this', 'thas', 'tho', 'tes', 'tas', 'the', 'tha', 'thos', 'thes', 'tis', 'thus', 'thu'}


In [49]:
'' in kidsMispelled

True

In [11]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                found = 0
                oneOff = 0
                kidsMis = 0
                for word in misspelled:
                    misspelledWords += ", " + word # -- concatinating the misspelled words
                    if word in kidsMispelled:
                        kidsMis +=1
                    candid = spell.candidates(word) # candidates() displays the set of words that are close to the word entered
#                     print(candid)
#                     print("-------start edits-----------------")
                    edits = spell.edit_distance_1(word)
#                     print(edits)
#                     print("-----------end edits-------------")
                    
                    for can in candid:
                        if can in edits:
                            oneOff += 1
                        break
#                     print(word +' Out')
                oneOffError.append(oneOff)
                spellingError.append(len(misspelled))
                kidsError.append(kidsMis)
#                 misspelledCol.append(misspelledWords)
                
            except:
#                 print(misspelled)
                oneOffError.append(-1)
                spellingError.append(-1)
                kidsError.append(-1)
#                 misspelledCol.append(misspelledWords)

        else:
            spellingError.append(0)
            oneOffError.append(0)
            kidsError.append(0)
#             misspelledCol.append("none")

        pbar.update()
    
spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
spelling['query'] = allQueries
spelling['offByOne'] = oneOffError
spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


['it consulting',
 'JP Morgan Chase data',
 'fun violence control constitutional']

In [41]:
allQueries[7]

'Leading Parhmacutical companyies and lobbying'

In [44]:
' ' in allQueries

False

In [None]:
for can in candid:
                        if can in edits:
                            oneOff += 1
                        break

In [100]:
spell = SpellChecker()

spellingError = [] # number of misspelled words in the query
oneOffError = []
kidsError = []  # number of erros found in kidsMispelled
misspelledCol = []
allErrors = [] # list of concatenated errors per query


netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']
with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                spellingError.append(len(misspelled))
                
                all_error = ', '.join(misspelled)
                allErrors.append(all_error)
                
                kidsMis = 0
                oneoff =0
                for word in misspelled:
                    if word in kidsMispelled:
                        kidsMis +=1
                        
                    candid = spell.candidates(word)
                    edits = spell.edit_distance_1(word)
                    
                    if list(candid)[0] in edits:
                        oneoff+=1
#                     for can in candid:
#                         if can in edits:
#                             oneoff+=1
#                         break
                        
                
                kidsError.append(kidsMis)
                oneOffError.append(oneoff)
            
            except:
                spellingError.append(-1)
                kidsError.append(-1)
                allErrors.append(-1)
                oneOffError.append(-1)
            
        else:
            spellingError.append(0)
            kidsError.append(0)
            allErrors.append(0)
            oneOffError.append(0)

        pbar.update()

100%|██████████| 1505/1505 [01:06<00:00, 22.65it/s]


In [102]:
len(oneOffError)

1505

In [97]:
allErrors[-5:]

['superhrycz', -1, '', 'oppure, supporta', '']

In [84]:
df =pd.DataFrame({
    'Query': allQueries,
    'kidsError': kidsError,
    'numSpellingErrors': spellingError,
    'ms': allErrors,
    'offByOne': oneOffError
})
df

ValueError: All arrays must be of the same length

In [82]:
'jp' in spell.edit_distance_1('jp')

True

In [80]:
df.query('	numSpellingErrors > 1')

Unnamed: 0,Query,kidsError,numSpellingErrors,ms,offByOne
7,Leading Parhmacutical companyies and lobbying,0,2,"parhmacutical, companyies",2
12,Merck lobbists,0,2,"lobbists, merck",2
38,Merck lobby -hpv,0,2,"merck, hpv",2
42,PhD in Business benifits,0,2,"phd, benifits",2
86,jp morgan chase compulational scientist,0,2,"jp, compulational",2
...,...,...,...,...,...
1430,lg 47lg50 review,0,2,"lg, 47lg50",2
1445,maynard cooper & gale p c,1,3,", p, c",2
1478,rave motion pictures ridgmar 13 fort worth tx,0,2,"tx, ridgmar",2
1481,razboi intru cuvant,0,3,"cuvant, razboi, intru",3


In [51]:
len(allQueries)

1505

In [None]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []


netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']
with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                

In [None]:
edits = spell.edit_distance_1(word)

In [12]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

test = ['it consulting',
 'JP Morgan Chase data',
 'fun violence control jp constitutional']
test.append('we add www and http jp intentionally')
netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

for query in test:
    query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
    website = [mod for mod in netModifiers if(mod in query)]
    
    if not website:
        misspelledWords = "";
        
        lang = detect(query) # -- detects a language 
        print(query)
        misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
        print('length of',len(misspelled))
        found = 0
        oneOff = 0
        kidsMis = 0
        for word in misspelled:
            misspelledWords += " " + word  # -- concatinating the misspelled words
            print('word = ', list(word))
            print(list(misspelledWords))
            if word in kidsMispelled:
                kidsMis +=1
            candid = spell.candidates(word) # --- candidates() displays the set of words that are close to the word entered e.g: 'country', 'count', 'counter' if country is entered
            edits = spell.edit_distance_1(word) # -- Compute all strings that are one edit away from `word` 
                    
            for can in candid:
                if can in edits:
                    oneOff += 1 # -- one letter off 
                break

        oneOffError.append(oneOff)
        #print(oneOffError)
        print('final', misspelled)
        spellingError.append(len(misspelled))
        #print(spellingError)
        kidsError.append(kidsMis)
        #print(kidsError)
        misspelledCol.append(misspelledWords)
        print(misspelledCol)

it consulting
length of 0
final set()
['']
JP Morgan Chase data
length of 1
word =  ['j', 'p']
[' ', 'j', 'p']
final {'jp'}
['', ' jp']
fun violence control jp constitutional
length of 1
word =  ['j', 'p']
[' ', 'j', 'p']
final {'jp'}
['', ' jp', ' jp']


In [13]:
'' + ' ' + 'A'

' A'

In [14]:
website

['www', 'http']

In [15]:
# ---- CHECKING ----

spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                found = 0
                oneOff = 0
                kidsMis = 0
                for word in misspelled:
                    misspelledWords += " " + word # -- concatinating the misspelled words
                    
                    misspelledWords = ' '.join(misspelledWords.split())
                    
                    if word in kidsMispelled:
                        kidsMis +=1
                    candid = spell.candidates(word) # --- candidates() displays the set of words that are close to the word entered e.g: 'country', 'count', 'counter' if country is entered
                    edits = spell.edit_distance_1(word) # -- Compute all strings that are one edit away from `word` 
                    
                    for can in candid:
                        if can in edits:
                            oneOff += 1 # -- one letter off 
                        break

                oneOffError.append(oneOff)
                print('mispelled', misspelled)
                spellingError.append(len(misspelled))
                kidsError.append(kidsMis)
                misspelledCol.append(misspelledWords)
                
            except:
#                 print(misspelled)
                oneOffError.append(-1)
                spellingError.append(-1)
                kidsError.append(-1)
                misspelledCol.append(misspelledWords)

        else:
            spellingError.append(0)
            oneOffError.append(0)
            kidsError.append(0)
            misspelledCol.append("none")

        pbar.update()
    
spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
spelling['query'] = allQueries
spelling['offByOne'] = oneOffError
spelling['kidsError'] = kidsError
spelling['ms'] = misspelledCol


  0%|          | 0/1505 [00:00<?, ?it/s]

mispelled set()
mispelled {'jp'}
mispelled set()


  0%|          | 4/1505 [00:00<01:33, 16.03it/s]

mispelled {'swahili'}


  0%|          | 6/1505 [00:00<01:34, 15.89it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()


  1%|          | 8/1505 [00:00<03:41,  6.76it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled {'lobbists', 'merck'}
mispelled set()
mispelled set()


  1%|          | 16/1505 [00:01<01:43, 14.39it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  2%|▏         | 32/1505 [00:01<00:54, 26.87it/s]

mispelled {'swahili'}
mispelled set()
mispelled {''}
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  3%|▎         | 45/1505 [00:01<00:37, 39.25it/s]

mispelled set()
mispelled set()
mispelled {'merck', 'hpv'}
mispelled set()
mispelled {'dehumidifiers'}
mispelled set()
mispelled {'phd', 'benifits'}
mispelled set()
mispelled set()
mispelled set()


  3%|▎         | 50/1505 [00:02<00:51, 28.17it/s]

mispelled {'swahili'}
mispelled set()
mispelled {'kursk'}
mispelled {'airpor'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  4%|▎         | 55/1505 [00:02<01:42, 14.15it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()


  4%|▍         | 67/1505 [00:03<01:14, 19.20it/s]

mispelled set()
mispelled set()
mispelled {'kenyan'}
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  6%|▌         | 84/1505 [00:03<00:46, 30.60it/s]

mispelled {'kiursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp', 'compulational'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  7%|▋         | 100/1505 [00:03<00:31, 45.19it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}


  7%|▋         | 106/1505 [00:04<00:39, 35.18it/s]

mispelled {'voip'}
mispelled set()
mispelled {'keto'}
mispelled set()


  7%|▋         | 111/1505 [00:04<00:47, 29.57it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()


  8%|▊         | 115/1505 [00:04<00:54, 25.35it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  8%|▊         | 127/1505 [00:05<00:55, 24.91it/s]

mispelled {'rheumatology'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'voip'}
mispelled {'sillicon'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  9%|▉         | 141/1505 [00:05<00:40, 33.80it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {''}


 10%|█         | 154/1505 [00:05<00:33, 40.46it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'benifits'}
mispelled set()


 11%|█         | 164/1505 [00:06<00:53, 24.95it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'citites'}


 11%|█▏        | 172/1505 [00:06<00:54, 24.61it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled {'voip'}


 12%|█▏        | 185/1505 [00:07<00:35, 37.10it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'comparaison'}
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 13%|█▎        | 194/1505 [00:07<00:36, 35.62it/s]

mispelled {'dutta'}
mispelled {'merck'}
mispelled {'swahili'}
mispelled set()
mispelled

 13%|█▎        | 200/1505 [00:07<00:39, 33.17it/s]

 set()
mispelled set()
mispelled set()
mispelled {'voip'}
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled {'pagents'}
mispelled set()
mispelled set()
mispelled set()


 14%|█▍        | 210/1505 [00:08<00:59, 21.84it/s]

mispelled {'', 'suny'}


 14%|█▍        | 214/1505 [00:08<01:16, 16.85it/s]

mispelled set()
mispelled set()
mispelled {''}


 14%|█▍        | 217/1505 [00:09<01:53, 11.30it/s]

mispelled set()
mispelled {'arther'}


 15%|█▍        | 220/1505 [00:09<01:56, 11.02it/s]

mispelled {'voip'}
mispelled set()


 15%|█▌        | 228/1505 [00:10<01:26, 14.83it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()


 16%|█▌        | 243/1505 [00:10<00:48, 25.82it/s]

mispelled {'jpmorgan'}
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled set()


 16%|█▋        | 248/1505 [00:10<00:52, 24.09it/s]

mispelled {'vanuatu'}
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {''}
mispelled set()


 17%|█▋        | 258/1505 [00:11<00:55, 22.43it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()


 18%|█▊        | 273/1505 [00:11<00:37, 32.89it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 18%|█▊        | 278/1505 [00:11<00:42, 28.70it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'microsoft', 'jp'}
mispelled {'dehumidifiers'}


 19%|█▊        | 282/1505 [00:12<01:14, 16.37it/s]

mispelled {'swahili'}
mispelled {'swahili'}


 19%|█▉        | 285/1505 [00:12<01:16, 15.89it/s]

mispelled set()
mispelled set()


 19%|█▉        | 288/1505 [00:13<01:36, 12.64it/s]

mispelled {'jpmorgan'}
mispelled {'kursk'}
mispelled set()
mispelled set()


 20%|█▉        | 294/1505 [00:13<01:44, 11.62it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {'kursk'}


 20%|█▉        | 296/1505 [00:14<01:47, 11.26it/s]

mispelled {'', 'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()


 20%|██        | 306/1505 [00:14<00:57, 20.87it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 22%|██▏       | 325/1505 [00:15<00:41, 28.45it/s]

mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled {'', 'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 22%|██▏       | 330/1505 [00:15<00:44, 26.19it/s]

mispelled {'swahili'}
mispelled {'', 'kursk'}


 23%|██▎       | 340/1505 [00:15<00:42, 27.52it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}


 23%|██▎       | 344/1505 [00:15<00:53, 21.89it/s]

mispelled {'microsoft', 'jp'}
mispelled set()
mispelled set()


 23%|██▎       | 348/1505 [00:16<01:13, 15.65it/s]

mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 24%|██▎       | 355/1505 [00:16<01:21, 14.19it/s]

mispelled set()
mispelled set()
mispelled set()


 24%|██▍       | 359/1505 [00:17<01:21, 14.02it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 25%|██▍       | 376/1505 [00:18<01:11, 15.83it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()


 25%|██▌       | 380/1505 [00:19<01:42, 11.02it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 27%|██▋       | 401/1505 [00:20<00:51, 21.50it/s]

mispelled {'geographicals'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'dehumidifiers'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled set()
mispelled set()


 27%|██▋       | 406/1505 [00:20<00:46, 23.43it/s]

mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()


 28%|██▊       | 420/1505 [00:20<00:31, 33.97it/s]

mispelled {'skytop'}
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'voip'}
mispelled set()


 28%|██▊       | 428/1505 [00:20<00:31, 34.05it/s]

mispelled {'kenyan', 'swahili'}
mispelled set()
mispelled {'turnbines'}
mispelled {'swahili'}
mispelled set()


 29%|██▉       | 441/1505 [00:21<00:46, 22.87it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled set()
mispelled {'ncaa'}
mispelled {'kursk'}
mispelled {'pocono'}


 30%|██▉       | 448/1505 [00:21<00:37, 28.18it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'benefti', 'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled

 30%|███       | 453/1505 [00:21<00:34, 30.07it/s]

 {'kursk', 'toepedo'}
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled {'vs'}
mispelled set()
mispelled set()
mispelled {''}
mispelled {'pocono'}


 31%|███       | 464/1505 [00:22<00:46, 22.30it/s]

mispelled {'kenyan'}
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled {'cuo'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 32%|███▏      | 477/1505 [00:22<00:36, 27.80it/s]

mispelled set()
mispelled {'dc'}


 32%|███▏      | 488/1505 [00:23<00:35, 28.87it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()


 33%|███▎      | 492/1505 [00:23<00:57, 17.48it/s]

mispelled {'kursk', 'casualitys'}
mispelled set()


 33%|███▎      | 495/1505 [00:24<01:12, 13.98it/s]

mispelled set()
mispelled {'ingeredients'}


 33%|███▎      | 503/1505 [00:24<00:55, 18.07it/s]

mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled set()
mispelled set()
mispelled set()


 34%|███▍      | 517/1505 [00:25<00:37, 26.27it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 35%|███▍      | 521/1505 [00:25<00:55, 17.62it/s]

mispelled set()
mispelled {'pocono'}
mispelled {''}
mispelled set()


 35%|███▍      | 524/1505 [00:25<01:08, 14.29it/s]

mispelled set()
mispelled set()
mispelled {'merck'}
mispelled {'pocono'}
mispelled set()


 36%|███▌      | 539/1505 [00:26<00:40, 24.03it/s]

mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'exsists'}
mispelled set()
mispelled set()


 36%|███▌      | 545/1505 [00:26<00:41, 22.89it/s]

mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()


 37%|███▋      | 550/1505 [00:26<00:44, 21.55it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()


 37%|███▋      | 555/1505 [00:27<00:45, 21.00it/s]

mispelled {'pocono'}
mispelled set()


 37%|███▋      | 558/1505 [00:27<00:59, 15.99it/s]

mispelled {'diease'}


 37%|███▋      | 561/1505 [00:27<01:03, 14.77it/s]

mispelled set()
mispelled {'pocono'}
mispelled {'suny'}


 37%|███▋      | 564/1505 [00:28<01:19, 11.90it/s]

mispelled {'boolywood'}


 38%|███▊      | 566/1505 [00:28<01:47,  8.74it/s]

mispelled {'suny'}
mispelled {'r'}
mispelled set()


 38%|███▊      | 578/1505 [00:29<00:56, 16.55it/s]

mispelled {'telefonas'}
mispelled {'governent'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()


 39%|███▉      | 589/1505 [00:29<00:37, 24.68it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}


 40%|███▉      | 601/1505 [00:30<00:46, 19.48it/s]

mispelled {'jp'}
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled {'placess', 'toursim'}


 41%|████      | 611/1505 [00:31<00:52, 16.92it/s]

mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'byrant'}
mispelled set()
mispelled set()
mispelled set()
mispelled {''}


 41%|████      | 619/1505 [00:31<00:46, 19.19it/s]

mispelled {'assement'}
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 42%|████▏     | 631/1505 [00:31<00:38, 22.75it/s]

mispelled {'swahili'}
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled {'', 'jp'}
mispelled {''}
mispelled {'merck'}


 42%|████▏     | 639/1505 [00:31<00:32, 26.62it/s]

mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled {'', 'kursk'}


 43%|████▎     | 643/1505 [00:32<00:43, 19.87it/s]

mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()


 43%|████▎     | 646/1505 [00:32<00:58, 14.63it/s]

mispelled set()
mispelled set()
mispelled {'develpmental'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'suny'}


 45%|████▍     | 671/1505 [00:33<00:34, 24.02it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'arther'}
mispelled {'jp'}
mispelled {'ingridients'}
mispelled {'merck'}
mispelled set()


 46%|████▌     | 688/1505 [00:34<00:23, 35.21it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'ct'}
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled {'countrys'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 46%|████▌     | 694/1505 [00:34<00:35, 23.13it/s]

mispelled {'vs'}
mispelled {'swahili'}
mispelled {'merck'}
mispelled

 46%|████▋     | 699/1505 [00:34<00:35, 22.70it/s]

 set()
mispelled {'nyc'}
mispelled {'phd'}
mispelled set()


 47%|████▋     | 703/1505 [00:35<00:36, 22.02it/s]

mispelled set()


 47%|████▋     | 707/1505 [00:35<00:38, 20.48it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 48%|████▊     | 722/1505 [00:35<00:24, 31.86it/s]

mispelled {'hivaid'}
mispelled {'dehumidifiers'}
mispelled set()
mispelled {'', 'exsists'}
mispelled {'dehumidifiers'}
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled set()
mispelled {'somking'}


 48%|████▊     | 727/1505 [00:35<00:27, 28.64it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'hydoelectric'}
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()


 49%|████▊     | 731/1505 [00:36<00:43, 17.96it/s]

mispelled {'pocono'}


 49%|████▉     | 743/1505 [00:37<00:38, 19.63it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled {'dehumidifiers'}
mispelled set()


 50%|████▉     | 749/1505 [00:37<00:31, 23.93it/s]

mispelled set()
mispelled {'jp'}
mispelled set()
mispelled {'hobart'}
mispelled set()
mispelled set()
mispelled

 50%|█████     | 754/1505 [00:37<00:29, 25.42it/s]

 set()
mispelled {'pocono'}
mispelled {'pocono'}
mispelled set()
mispelled set()


 50%|█████     | 759/1505 [00:37<00:31, 24.03it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()


 51%|█████     | 771/1505 [00:38<00:26, 27.98it/s]

mispelled set()
mispelled {'suny'}
mispelled {'phd'}
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 52%|█████▏    | 778/1505 [00:38<00:20, 34.77it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 52%|█████▏    | 788/1505 [00:38<00:26, 26.87it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()


 53%|█████▎    | 792/1505 [00:39<00:41, 17.25it/s]

mispelled set()
mispelled set()


 53%|█████▎    | 797/1505 [00:40<01:13,  9.58it/s]

mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled

 53%|█████▎    | 803/1505 [00:40<00:48, 14.44it/s]

 {'voip'}
mispelled {'jp'}
mispelled {'arther'}


 54%|█████▎    | 806/1505 [00:40<00:49, 14.22it/s]

mispelled {'minotiy'}
mispelled set()
mispelled set()
mispelled {'traiditional', 'swahili'}
mispelled set()


 54%|█████▍    | 812/1505 [00:41<00:48, 14.20it/s]

mispelled {'kursk'}
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()


 54%|█████▍    | 816/1505 [00:41<00:40, 16.99it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled {'dutta'}
mispelled set()
mispelled set()


 55%|█████▍    | 827/1505 [00:41<00:29, 22.66it/s]

mispelled set()


 55%|█████▌    | 830/1505 [00:42<00:51, 13.12it/s]

mispelled {'phd'}


 56%|█████▌    | 836/1505 [00:42<00:56, 11.83it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'voip'}
mispelled {'kursk'}


 56%|█████▌    | 841/1505 [00:43<01:05, 10.17it/s]

mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'kursk'}
mispelled set()
mispelled set()


 57%|█████▋    | 851/1505 [00:44<00:46, 14.07it/s]

mispelled set()
mispelled {''}
mispelled set()
mispelled {'pocono'}
mispelled {'phds'}
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 57%|█████▋    | 856/1505 [00:44<00:42, 15.17it/s]

mispelled {'pagents'}


 58%|█████▊    | 868/1505 [00:45<00:28, 21.98it/s]

mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'vs'}
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 59%|█████▊    | 881/1505 [00:45<00:21, 29.20it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 59%|█████▉    | 886/1505 [00:45<00:29, 20.69it/s]

mispelled {'roadtrip'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()


 59%|█████▉    | 890/1505 [00:45<00:27, 22.17it/s]

mispelled set()
mispelled {'kursk'}
mispelled {'pocono'}
mispelled set()


 60%|██████    | 903/1505 [00:46<00:24, 24.37it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'', 'pagents'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}


 60%|██████    | 908/1505 [00:47<00:38, 15.42it/s]

mispelled set()
mispelled {'swahili'}


 61%|██████    | 915/1505 [00:47<00:29, 20.26it/s]

mispelled {'', 'jp'}
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}


 61%|██████▏   | 923/1505 [00:47<00:29, 19.44it/s]

mispelled {'pocono'}
mispelled {'businesss', 'phd'}
mispelled set()
mispelled set()
mispelled set()


 62%|██████▏   | 929/1505 [00:48<00:29, 19.37it/s]

mispelled set()
mispelled {'phd'}


 62%|██████▏   | 932/1505 [00:48<00:32, 17.85it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()


 63%|██████▎   | 942/1505 [00:48<00:30, 18.74it/s]

mispelled {'kenyan'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'autralia'}
mispelled set()
mispelled {'suny'}
mispelled {'swahili'}


 63%|██████▎   | 946/1505 [00:49<00:47, 11.89it/s]

mispelled set()
mispelled {'pocono'}
mispelled {'coatings'}
mispelled {'phd'}
mispelled set()


 64%|██████▍   | 960/1505 [00:50<00:26, 20.79it/s]

mispelled {'swahili'}
mispelled set()
mispelled {'voip'}
mispelled {'dutta'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}


 64%|██████▍   | 969/1505 [00:50<00:25, 21.40it/s]

mispelled {'dutta'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 65%|██████▍   | 973/1505 [00:51<00:35, 14.88it/s]

mispelled set()


 65%|██████▍   | 976/1505 [00:51<00:37, 14.19it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled {'exsist'}


 65%|██████▌   | 981/1505 [00:51<00:32, 16.33it/s]

mispelled {'kiursk'}


 65%|██████▌   | 984/1505 [00:52<01:07,  7.74it/s]

mispelled {'swahili'}


 66%|██████▌   | 996/1505 [00:53<00:39, 12.97it/s]

mispelled {'voip'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'kenyan'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()


 67%|██████▋   | 1013/1505 [00:53<00:18, 26.30it/s]

mispelled {''}
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'vs'}
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled {'purebreds'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 68%|██████▊   | 1026/1505 [00:54<00:18, 26.08it/s]

mispelled {'filmfare'}
mispelled {'faq'}
mispelled {'kursk'}
mispelled set()
mispelled {'kwh'}
mispelled {'effieiency'}
mispelled

 69%|██████▊   | 1032/1505 [00:54<00:15, 30.95it/s]

 set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}


 69%|██████▉   | 1037/1505 [00:54<00:28, 16.15it/s]

mispelled set()


 69%|██████▉   | 1045/1505 [00:55<00:29, 15.73it/s]

mispelled {'voip'}
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled set()
mispelled {''}
mispelled set()


 70%|██████▉   | 1048/1505 [00:55<00:26, 17.22it/s]

mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs', 'voip'}
mispelled set()
mispelled {'voip'}
mispelled set()


 70%|███████   | 1056/1505 [00:55<00:22, 20.11it/s]

mispelled {'roadtrip', 'v'}
mispelled {'merck'}
mispelled {'merck'}
mispelled set()
mispelled {''}
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()


 71%|███████   | 1068/1505 [00:56<00:25, 17.24it/s]

mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 71%|███████▏  | 1074/1505 [00:56<00:22, 18.84it/s]

mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled

 72%|███████▏  | 1083/1505 [00:57<00:15, 26.85it/s]

 set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled {''}
mispelled {'policys', 'dutta'}
mispelled {'programes'}


 73%|███████▎  | 1097/1505 [00:57<00:10, 38.10it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'attractios'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kenyan'}
mispelled set()


 73%|███████▎  | 1103/1505 [00:57<00:15, 26.54it/s]

mispelled set()
mispelled set()
mispelled {'voip'}
mispelled set()


 74%|███████▎  | 1108/1505 [00:58<00:20, 18.96it/s]

mispelled set()
mispelled set()
mispelled set()


 74%|███████▍  | 1115/1505 [00:58<00:25, 15.26it/s]

mispelled {'i75'}
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()


 74%|███████▍  | 1120/1505 [00:59<00:34, 11.25it/s]

mispelled set()


 75%|███████▍  | 1122/1505 [00:59<00:36, 10.52it/s]

mispelled set()


 75%|███████▌  | 1130/1505 [01:00<00:25, 14.81it/s]

mispelled {'phd'}
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk', 'wordwide'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 76%|███████▌  | 1138/1505 [01:00<00:18, 19.51it/s]

mispelled set()
mispelled {'swahili'}
mispelled {''}


 76%|███████▌  | 1141/1505 [01:00<00:19, 18.47it/s]

mispelled {'swahili'}
mispelled set()


 76%|███████▌  | 1144/1505 [01:00<00:21, 16.69it/s]

mispelled {'dehumidifiers', 'condiser'}
mispelled set()
mispelled {'phd'}
mispelled set()


 76%|███████▌  | 1146/1505 [01:01<00:26, 13.58it/s]

mispelled set()
mispelled set()


 77%|███████▋  | 1154/1505 [01:01<00:21, 16.27it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled {''}


 77%|███████▋  | 1157/1505 [01:02<00:38,  9.13it/s]

mispelled {'adisadvangtages'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'connecticu'}


 78%|███████▊  | 1167/1505 [01:03<00:26, 12.67it/s]

mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()


 78%|███████▊  | 1170/1505 [01:03<00:25, 13.12it/s]

mispelled {'swahili'}
mispelled {'pocono'}
mispelled set()


 78%|███████▊  | 1172/1505 [01:03<00:36,  9.24it/s]

mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'countrys'}


 79%|███████▉  | 1186/1505 [01:04<00:15, 20.12it/s]

mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 79%|███████▉  | 1194/1505 [01:04<00:11, 27.55it/s]

mispelled {''}
mispelled set()
mispelled set()
mispelled {'phd'}


 80%|███████▉  | 1199/1505 [01:04<00:12, 24.62it/s]

mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'kursk'}


 80%|███████▉  | 1203/1505 [01:04<00:13, 21.87it/s]

mispelled {'kursk'}
mispelled set()
mispelled {'nokia', 'n71'}


 80%|████████  | 1207/1505 [01:05<00:23, 12.83it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'i9'}


 81%|████████  | 1216/1505 [01:06<00:21, 13.39it/s]

mispelled set()
mispelled {'onetpl'}
mispelled set()
mispelled set()
mispelled {'largo'}
mispelled set()
mispelled set()


 81%|████████▏ | 1224/1505 [01:06<00:26, 10.66it/s]

mispelled set()


 82%|████████▏ | 1230/1505 [01:07<00:25, 10.72it/s]

mispelled set()
mispelled {'inovations'}
mispelled {'ppmd'}
mispelled set()


 82%|████████▏ | 1241/1505 [01:07<00:13, 20.02it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'penney'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 83%|████████▎ | 1254/1505 [01:09<00:16, 15.24it/s]

mispelled {'suncellular'}
mispelled set()
mispelled {'tmax'}
mispelled {'tube9'}
mispelled set()
mispelled set()
mispelled {'tanay'}
mispelled set()
mispelled set()
mispelled {'testosteron'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 84%|████████▍ | 1269/1505 [01:09<00:10, 22.48it/s]

mispelled {'tougaloo'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'usfsa'}
mispelled set()
mispelled set()
mispelled set()


 85%|████████▍ | 1274/1505 [01:09<00:11, 20.91it/s]

mispelled {'valois'}
mispelled set()


 85%|████████▍ | 1278/1505 [01:11<00:28,  7.99it/s]

mispelled set()
mispelled set()
mispelled set()


 85%|████████▌ | 1282/1505 [01:11<00:26,  8.40it/s]

mispelled set()


 85%|████████▌ | 1285/1505 [01:12<00:33,  6.61it/s]

mispelled {'amarillo'}
mispelled set()


 86%|████████▌ | 1293/1505 [01:13<00:24,  8.80it/s]

mispelled set()
mispelled {'amaz0n'}
mispelled set()
mispelled set()
mispelled {'amc'}
mispelled {'amccom'}
mispelled set()


 86%|████████▌ | 1295/1505 [01:13<00:24,  8.45it/s]

mispelled set()


 86%|████████▌ | 1297/1505 [01:14<00:28,  7.28it/s]

mispelled {'outfitters', 'egael'}


 87%|████████▋ | 1305/1505 [01:14<00:15, 13.06it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'gps'}
mispelled set()
mispelled

 87%|████████▋ | 1311/1505 [01:14<00:11, 17.01it/s]

 set()
mispelled {'rc'}
mispelled {'90s', '80s'}
mispelled set()
mispelled set()
mispelled {'danville'}


 87%|████████▋ | 1314/1505 [01:16<00:32,  5.85it/s]

mispelled {'mennonite'}


 88%|████████▊ | 1322/1505 [01:16<00:17, 10.46it/s]

mispelled {'betley'}
mispelled set()
mispelled set()
mispelled {'bg'}
mispelled {'bgz', 'pl'}
mispelled {'biancas'}


 88%|████████▊ | 1325/1505 [01:16<00:16, 11.14it/s]

mispelled {'bibbero'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 88%|████████▊ | 1329/1505 [01:17<00:17, 10.20it/s]

mispelled {'bietet', 'anderem'}


 88%|████████▊ | 1331/1505 [01:19<00:45,  3.85it/s]

mispelled {'lemoyne'}


 89%|████████▊ | 1333/1505 [01:19<00:47,  3.61it/s]

mispelled set()
mispelled {'tshirts'}
mispelled set()
mispelled set()


 89%|████████▉ | 1338/1505 [01:20<00:33,  5.04it/s]

mispelled {'gta'}
mispelled {'ps2'}
mispelled set()


 89%|████████▉ | 1342/1505 [01:21<00:35,  4.54it/s]

mispelled {'csc'}
mispelled {'ct'}
mispelled set()
mispelled set()


 90%|████████▉ | 1347/1505 [01:21<00:24,  6.34it/s]

mispelled {'cupoons'}


 90%|████████▉ | 1349/1505 [01:21<00:25,  6.13it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 90%|█████████ | 1357/1505 [01:22<00:19,  7.53it/s]

mispelled {'cvs'}


 90%|█████████ | 1359/1505 [01:23<00:22,  6.37it/s]

mispelled {'lummis'}
mispelled {'fanatico'}


 91%|█████████ | 1369/1505 [01:23<00:08, 15.74it/s]

mispelled {'fansnap'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'fco'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 91%|█████████ | 1372/1505 [01:23<00:09, 14.53it/s]

mispelled {'fedricks', 'hollwood'}
mispelled set()


 91%|█████████▏| 1375/1505 [01:24<00:08, 14.45it/s]

mispelled set()
mispelled set()


 92%|█████████▏| 1384/1505 [01:24<00:06, 19.99it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'eten', 'gezond', 'recepten'}
mispelled {'ghi', 'hmo'}


 92%|█████████▏| 1390/1505 [01:25<00:09, 11.56it/s]

mispelled {'warfighter'}
mispelled {'ghs'}
mispelled set()
mispelled {'giausd'}
mispelled set()
mispelled set()
mispelled set()


 93%|█████████▎| 1394/1505 [01:25<00:09, 11.20it/s]

mispelled {'bunkbeds'}
mispelled {'hershy'}
mispelled set()


 93%|█████████▎| 1399/1505 [01:27<00:21,  4.90it/s]

mispelled set()


 93%|█████████▎| 1401/1505 [01:27<00:24,  4.30it/s]

mispelled set()
mispelled set()


 93%|█████████▎| 1403/1505 [01:28<00:21,  4.80it/s]

mispelled set()
mispelled set()
mispelled {'insectors'}


 93%|█████████▎| 1407/1505 [01:29<00:20,  4.77it/s]

mispelled set()
mispelled set()


 94%|█████████▍| 1414/1505 [01:30<00:14,  6.29it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'lalatx'}
mispelled set()
mispelled set()
mispelled set()


 94%|█████████▍| 1418/1505 [01:30<00:11,  7.78it/s]

mispelled {'langham'}
mispelled set()
mispelled set()


 94%|█████████▍| 1422/1505 [01:31<00:16,  5.08it/s]

mispelled {'merritt', 'leshawn'}
mispelled {'leupold'}
mispelled set()
mispelled set()


 95%|█████████▍| 1427/1505 [01:32<00:12,  6.34it/s]

mispelled {'lexibook'}


 95%|█████████▍| 1428/1505 [01:32<00:12,  6.04it/s]

mispelled {'lexmark'}


 95%|█████████▌| 1431/1505 [01:32<00:11,  6.49it/s]

mispelled set()


 96%|█████████▌| 1439/1505 [01:33<00:04, 13.98it/s]

mispelled {'vs'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'mgmt'}
mispelled set()


 96%|█████████▌| 1441/1505 [01:33<00:04, 13.10it/s]

mispelled {'mactan'}
mispelled set()


 96%|█████████▌| 1445/1505 [01:34<00:07,  7.92it/s]

mispelled {'maggiore'}
mispelled set()
mispelled {'', 'p', 'c'}
mispelled set()
mispelled {'mbam'}
mispelled {'mbt'}


 96%|█████████▋| 1450/1505 [01:34<00:04, 11.48it/s]

mispelled {'mcafee'}


 96%|█████████▋| 1452/1505 [01:35<00:07,  6.88it/s]

mispelled {'mcdonalds'}


 97%|█████████▋| 1454/1505 [01:35<00:07,  7.04it/s]

mispelled {'mcgregor'}
mispelled {'mcs'}
mispelled set()


 97%|█████████▋| 1456/1505 [01:35<00:07,  6.54it/s]

mispelled {'odonnell'}
mispelled set()


 97%|█████████▋| 1460/1505 [01:36<00:08,  5.11it/s]

mispelled set()
mispelled set()


 98%|█████████▊| 1471/1505 [01:37<00:03,  9.00it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'ananymous'}
mispelled {'pregancy'}
mispelled set()
mispelled set()
mispelled set()


 98%|█████████▊| 1474/1505 [01:39<00:05,  5.43it/s]

mispelled set()


 98%|█████████▊| 1476/1505 [01:39<00:06,  4.38it/s]

mispelled set()


 98%|█████████▊| 1479/1505 [01:41<00:07,  3.71it/s]

mispelled {'symone'}


 98%|█████████▊| 1482/1505 [01:42<00:07,  2.90it/s]

mispelled {'cuvant', 'razboi', 'intru'}
mispelled {'rca'}


 99%|█████████▊| 1484/1505 [01:43<00:07,  2.96it/s]

mispelled set()
mispelled {'nj'}
mispelled set()


 99%|█████████▉| 1488/1505 [01:43<00:03,  4.60it/s]

mispelled {'saratoga'}


 99%|█████████▉| 1495/1505 [01:44<00:01,  8.07it/s]

mispelled {'satureday'}
mispelled {'savannh'}
mispelled set()
mispelled {'sc'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


100%|█████████▉| 1501/1505 [01:44<00:00, 10.18it/s]

mispelled {'superbowl'}


100%|██████████| 1505/1505 [01:45<00:00, 14.33it/s]

mispelled set()
mispelled {'oppure', 'supporta'}
mispelled set()





In [16]:
spelling.head(15)

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,ms
0,0,it consulting,0,0,
1,1,JP Morgan Chase data,1,0,jp
2,0,fun violence control constitutional,0,0,
3,1,swahili food traditional,0,0,swahili
4,1,Russian nuclear submarine Kursk politics russia,0,0,kursk
5,0,growth in bollywood,0,0,
6,0,junk food tax cons,0,0,
7,-1,Leading Parhmacutical companyies and lobbying,-1,-1,parhmacutical
8,0,What were the names of the victims?,0,0,
9,0,your own technology business,0,0,


In [17]:
cc = 'You hate a ddog'
m = spell.unknown(cc.split(" "))
m

{'ddog'}

In [18]:
dd = 'how god exsists'
m = spell.unknown(dd.split(" "))
m

{'exsists'}

In [19]:
spelling

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,ms
0,0,it consulting,0,0,
1,1,JP Morgan Chase data,1,0,jp
2,0,fun violence control constitutional,0,0,
3,1,swahili food traditional,0,0,swahili
4,1,Russian nuclear submarine Kursk politics russia,0,0,kursk
...,...,...,...,...,...
1500,1,superbowl sunday?,0,0,superbowl
1501,-1,"superhry,cz",-1,-1,superhrycz
1502,0,superior motors,0,0,
1503,2,supporta Java oppure,1,0,oppure supporta


In [20]:
spelling['kidsError'].unique()

array([ 0, -1,  1])

In [21]:
spelling.ms[631]

'jp'

In [22]:
spelling[spelling['kidsError']==1]

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,ms
28,1,mba salaries,1,1,
29,1,Connecticut Fire Academy skills,1,1,
139,1,what is collagen vascular disease,1,1,
145,1,Connecticut Fire Academy,1,1,
150,1,Connecticut Fire Academy training sessions,1,1,
184,1,great road trip ideas,1,1,
210,2,scholarships SUNY,2,1,suny
215,1,Indian Policies + Miss Universe,1,1,
241,1,women and minorities percentage setting,1,1,
251,1,what is collagen vascular,1,1,


In [23]:
# len(s.edit_distance_2('world'))

In [24]:
q = "what is the nme of www"
netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']
w = [mod for mod in netModifiers if(mod in q)]
w

['www']

In [25]:
lan = 'oppure'
detect(lan)

'no'

In [26]:
s = SpellChecker()

mis = s.unknown(['countr', 'hapen'])


for w in mis:
    print(w, s.correction(w))
    
    print(s.candidates(w))
 

hapen happen
{'hagen', 'haben', 'hacen', 'haven', 'happen'}
countr country
{'county', 'country', 'counter', 'count', 'counts'}


In [27]:
[w for w in mis]

['hapen', 'countr']

In [28]:
print(word)

supporta


In [29]:
d = 'Ibra, and Micheal!' 
print(d.translate(str.maketrans('', '', string.punctuation)))

Ibra and Micheal


# Extract Punctuation And Casing Features

The following block of code extracts spelling and casing features before adding them to the dataframe. 

In [30]:
invalidcharacters= set(['!', ',', '.', '?'])
punct = []
casing = []
with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:

        if any(char in invalidcharacters for char in query):
            if any(substring in query for substring in netModifiers):
                punct.append(0)
            else:
                punct.append(1)
        else:
            punct.append(0)

        if query.islower():
            casing.append(0)
        else:
            casing.append(1)
        pbar.update()
        
spelling['punct'] = punct
spelling['casing'] = casing



100%|██████████| 1505/1505 [00:00<00:00, 247323.10it/s]


# Return Feature Set

Returns dataframe with spelling and punctuation features.

In [31]:
pickle.dump(spelling, open( "Pickles/SPFeat.p", "wb" ) )



In [32]:
spelling.head(15)

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,ms,punct,casing
0,0,it consulting,0,0,,0,0
1,1,JP Morgan Chase data,1,0,jp,0,1
2,0,fun violence control constitutional,0,0,,0,0
3,1,swahili food traditional,0,0,swahili,0,0
4,1,Russian nuclear submarine Kursk politics russia,0,0,kursk,0,1
5,0,growth in bollywood,0,0,,0,0
6,0,junk food tax cons,0,0,,0,0
7,-1,Leading Parhmacutical companyies and lobbying,-1,-1,parhmacutical,0,1
8,0,What were the names of the victims?,0,0,,1,1
9,0,your own technology business,0,0,,0,0


In [33]:
pwd

'/Users/assoumerredempta/Documents/Fall_2022/Artifact_UI/FeatureExtraction'

In [34]:

# pip install python-Levenshtein
# from Levenshtein import distance as lev
# lev('companies', 'companyies')

In [35]:
print('done')

done
