This notebook extracts all spelling and puncutation features from the set of queries found in SQS.

# Load Libraries

The following block of code loads all libraries needed for this notebook.

In [1]:
import csv
import pickle
import string
import textstat

import pandas as pd
import numpy as np

from langdetect import detect
from spellchecker import SpellChecker
from tqdm import tqdm

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [2]:
# allSessions = pickle.load( open( "../Data/DataSets/SWC/SWC.p", "rb" ) )
# allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
# allQueries = allSessions['query'].tolist() + allSessionsSQS['query'].tolist()
# allQueries = set(allQueries)

# allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
# allQueries = allSessionsSQS['query'].tolist()
# allQueries = np.array(allQueries)
# allQueries

allSessionsSQS = pickle.load( open( "../Data/DataSets/SQS/SQS.p", "rb" ) )
allQueries = allSessionsSQS['query'].tolist()
setQueries = allQueries


In [3]:
allSessionsSQS

Unnamed: 0,query,class,sID
0,becoming a fireman,0,3199
1,hotel in Pocono Mountains,0,2515
2,wedding traditions buddhism,0,2823
3,diversification in hiring,0,3033
4,traiditional swahili recipes,0,3145
...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,1,3256
1501,What is a fox's favorite kind of food?,1,2859
1502,"Show me the movie called ""The Martian""",1,3208
1503,What is the biggest rock found on Mars?,1,2676


In [4]:
len(setQueries)

1505

# Generate Misspelled List

Generates a list of commonly misspelled words by children from the KidSpell data set which is later used.

In [5]:
kidsMispelled = []

count = 0

with open('KidSpell/Web_Search_Lab_Errors.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if count == 0:
            count += 1
        else:
            kidsMispelled.append(row[0])

count = 0
with open('KidSpell/Web_Search_Informal_Errors.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if count == 0:
            count += 1
        else:
            kidsMispelled.append(row[0])
    
count = 0
with open('KidSpell/Essay_Writing_Errors.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        if count == 0:
            count += 1
        else:
            kidsMispelled.append(row[1])

kidsMispelled = set(kidsMispelled)

In [6]:
len(kidsMispelled)

1134

# Extract Spelling Features

The following block of code extracts features related to spelling errors and stores them in a dataframe. 

In [7]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                found = 0
                oneOff = 0
                kidsMis = 0
                for word in misspelled:
                    misspelledWords += ", " + word # -- concatinating the misspelled words
                    if word in kidsMispelled:
                        kidsMis +=1
                    candid = spell.candidates(word) # candidates() displays the set of words that are close to the word entered
#                     print(candid)
#                     print("-------start edits-----------------")
                    edits = spell.edit_distance_1(word)
#                     print(edits)
#                     print("-----------end edits-------------")
                    
                    for can in candid:
                        if can in edits:
                            oneOff += 1
                        break
#                     print(word +' Out')
                oneOffError.append(oneOff)
                spellingError.append(len(misspelled))
                kidsError.append(kidsMis)
#                 misspelledCol.append(misspelledWords)
                
            except:
#                 print(misspelled)
                oneOffError.append(-1)
                spellingError.append(-1)
                kidsError.append(-1)
#                 misspelledCol.append(misspelledWords)

        else:
            spellingError.append(0)
            oneOffError.append(0)
            kidsError.append(0)
#             misspelledCol.append("none")

        pbar.update()
    
spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
spelling['query'] = allQueries
spelling['offByOne'] = oneOffError
spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


100%|██████████| 1505/1505 [00:46<00:00, 32.43it/s]


In [8]:
# # ---- CHECKING ----

# spell = SpellChecker()

# spellingError = []
# oneOffError = []
# kidsError = []
# misspelledCol = []

# netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

# with tqdm(total = len(allQueries) ) as pbar:
#     for query in allQueries:
#         query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
#         website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
#         if not website:
#             misspelledWords = "";
#             try:
#                 lang = detect(query) # -- detects a language 
#                 misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
#                 found = 0
#                 oneOff = 0
#                 kidsMis = 0
#                 for word in misspelled:
#                     misspelledWords += " " + word # -- concatinating the misspelled words
#                     if word in kidsMispelled:
#                         kidsMis +=1
#                     candid = spell.candidates(word) # --- candidates() displays the set of words that are close to the word entered e.g: 'country', 'count', 'counter' if country is entered
#                     edits = spell.edit_distance_1(word) # -- Compute all strings that are one edit away from `word` 
                    
#                     for can in candid:
#                         if can in edits:
#                             oneOff += 1 # -- one letter off 
#                         break

#                 oneOffError.append(oneOff)
#                 spellingError.append(len(misspelled))
#                 kidsError.append(kidsMis)
#                 misspelledCol.append(misspelledWords)
                
#             except:
# #                 print(misspelled)
#                 oneOffError.append(-1)
#                 spellingError.append(-1)
#                 kidsError.append(-1)
#                 misspelledCol.append(misspelledWords)

#         else:
#             spellingError.append(0)
#             oneOffError.append(0)
#             kidsError.append(0)
#             misspelledCol.append("none")

#         pbar.update()
    
# spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
# spelling['query'] = allQueries
# spelling['offByOne'] = oneOffError
# spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


In [9]:
jj= 'ths is intentioal'

print(detect(jj))

kk=spell.unknown(jj.split(" "))
kk

en


{'intentioal', 'ths'}

In [10]:
candid = spell.candidates('ths') # candidates() displays the set of words that are close to the word entered
#                     print(candid)
#                     print("-------start edits-----------------")
edits = spell.edit_distance_1('ths')
#                     print(edits)
#                     print("-----------end edits-------------")
oneOff=0              
for can in candid:
    if can in edits:
        oneOff += 1
    break
oneOff

1

In [11]:
spell.candidates('ths')

{'tas',
 'tes',
 'tha',
 'thas',
 'the',
 'thes',
 'thi',
 'this',
 'tho',
 'thos',
 'thu',
 'thus',
 'thy',
 'tis',
 'tus'}

In [12]:
for j in range(5):
    print(j)
    if j ==0:
        print(j)
    break

0
0


In [13]:
spell.edit_distance_1('ths')

{"'hs",
 "'ths",
 'ahs',
 'aths',
 'bhs',
 'bths',
 'chs',
 'cths',
 'dhs',
 'dths',
 'ehs',
 'eths',
 'fhs',
 'fths',
 'ghs',
 'gths',
 'hhs',
 'hs',
 'hths',
 'hts',
 'ihs',
 'iths',
 'jhs',
 'jths',
 'khs',
 'kths',
 'lhs',
 'lths',
 'mhs',
 'mths',
 'nhs',
 'nths',
 'ohs',
 'oths',
 'phs',
 'pths',
 'qhs',
 'qths',
 'rhs',
 'rths',
 'shs',
 'sths',
 "t'hs",
 "t's",
 'tahs',
 'tas',
 'tbhs',
 'tbs',
 'tchs',
 'tcs',
 'tdhs',
 'tds',
 'tehs',
 'tes',
 'tfhs',
 'tfs',
 'tghs',
 'tgs',
 'th',
 "th'",
 "th's",
 'tha',
 'thas',
 'thb',
 'thbs',
 'thc',
 'thcs',
 'thd',
 'thds',
 'the',
 'thes',
 'thf',
 'thfs',
 'thg',
 'thgs',
 'thh',
 'thhs',
 'thi',
 'this',
 'thj',
 'thjs',
 'thk',
 'thks',
 'thl',
 'thls',
 'thm',
 'thms',
 'thn',
 'thns',
 'tho',
 'thos',
 'thp',
 'thps',
 'thq',
 'thqs',
 'thr',
 'thrs',
 'ths',
 "ths'",
 'thsa',
 'thsb',
 'thsc',
 'thsd',
 'thse',
 'thsf',
 'thsg',
 'thsh',
 'thsi',
 'thsj',
 'thsk',
 'thsl',
 'thsm',
 'thsn',
 'thso',
 'thsp',
 'thsq',
 'thsr',


In [14]:
'ths' in spell.edit_distance_1('ths')

True

In [15]:
for j in kk:
    print(spell.candidates(j))
#     print(spell.edit_distance_1(j))
    pass

{'intentional'}
{'tha', 'tis', 'thy', 'thi', 'thas', 'thos', 'tus', 'thus', 'thu', 'tes', 'the', 'this', 'tas', 'tho', 'thes'}


In [16]:
'' in kidsMispelled

True

In [17]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                found = 0
                oneOff = 0
                kidsMis = 0
                for word in misspelled:
                    misspelledWords += ", " + word # -- concatinating the misspelled words
                    if word in kidsMispelled:
                        kidsMis +=1
                    candid = spell.candidates(word) # candidates() displays the set of words that are close to the word entered
#                     print(candid)
#                     print("-------start edits-----------------")
                    edits = spell.edit_distance_1(word)
#                     print(edits)
#                     print("-----------end edits-------------")
                    
                    for can in candid:
                        if can in edits:
                            oneOff += 1
                        break
#                     print(word +' Out')
                oneOffError.append(oneOff)
                spellingError.append(len(misspelled))
                kidsError.append(kidsMis)
#                 misspelledCol.append(misspelledWords)
                
            except:
#                 print(misspelled)
                oneOffError.append(-1)
                spellingError.append(-1)
                kidsError.append(-1)
#                 misspelledCol.append(misspelledWords)

        else:
            spellingError.append(0)
            oneOffError.append(0)
            kidsError.append(0)
#             misspelledCol.append("none")

        pbar.update()
    
spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
spelling['query'] = allQueries
spelling['offByOne'] = oneOffError
spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


100%|██████████| 1505/1505 [00:44<00:00, 34.12it/s]


In [18]:
allQueries[7]

'efficiency of solar panels'

In [19]:
' ' in allQueries

False

In [20]:
for can in candid:
                        if can in edits:
                            oneOff += 1
                        break

In [21]:
spell = SpellChecker()

spellingError = [] # number of misspelled words in the query
oneOffError = []
kidsError = []  # number of erros found in kidsMispelled
misspelledCol = []
allErrors = [] # list of concatenated errors per query


netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']
with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                spellingError.append(len(misspelled))
                
                all_error = ', '.join(misspelled)
                allErrors.append(all_error)
                
                kidsMis = 0
                #oneoff =0
                for word in misspelled:
                    if word in kidsMispelled:
                        kidsMis +=1
                        
                    candid = spell.candidates(word)
                    edits = spell.edit_distance_1(word)
                    
#                     if list(candid)[0] in edits:
#                         oneoff+=1
# #                     for can in candid:
#                         if can in edits:
#                             oneoff+=1
#                         break
                        
                
                kidsError.append(kidsMis)
                #oneOffError.append(oneoff)
            
            except:
                spellingError.append(-1)
                kidsError.append(-1)
                allErrors.append(-1)
                #oneOffError.append(-1)
            
        else:
            spellingError.append(0)
            kidsError.append(0)
            allErrors.append(0)
            #oneOffError.append(0)

        pbar.update()

100%|██████████| 1505/1505 [00:44<00:00, 34.18it/s]


In [22]:
len(allErrors)

1505

In [23]:
df =pd.DataFrame({
    'Query': allQueries,
    'kidsError': kidsError,
    'numSpellingErrors': spellingError,
    'ms': allErrors,
    #'offByOne': oneOffError
})
df

Unnamed: 0,Query,kidsError,numSpellingErrors,ms
0,becoming a fireman,0,0,
1,hotel in Pocono Mountains,0,1,pocono
2,wedding traditions buddhism,0,0,
3,diversification in hiring,0,0,
4,traiditional swahili recipes,0,2,"swahili, traiditional"
...,...,...,...,...
1500,Who plays the bad guy in Star Wars the Horde a...,0,1,awakends
1501,What is a fox's favorite kind of food?,0,1,foxs
1502,"Show me the movie called ""The Martian""",0,0,
1503,What is the biggest rock found on Mars?,0,0,


In [24]:
'jp' in spell.edit_distance_1('jp')

True

In [25]:
df.query('numSpellingErrors > 1')

Unnamed: 0,Query,kidsError,numSpellingErrors,ms
4,traiditional swahili recipes,0,2,"swahili, traiditional"
11,regular phones vs internet phones voip experie...,0,2,"vs, voip"
21,Pocono Mountains things to do,1,2,", pocono"
83,bioinformatics degree SUNY,0,2,"suny, bioinformatics"
88,directions from OKC to the pocono mountains,0,2,"pocono, okc"
...,...,...,...,...
1388,Who plays Kylo Ren from StarWars?,0,2,"starwars, kylo"
1400,How old is Johny enlgish?,0,2,"johny, enlgish"
1431,How long can a polr bear swin under water?,1,2,"polr, swin"
1456,Batman vs superman,1,2,", vs"


In [26]:
len(allQueries)

1505

In [27]:
# spell = SpellChecker()

# spellingError = []
# oneOffError = []
# kidsError = []
# misspelledCol = []


# netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']
# with tqdm(total = len(allQueries) ) as pbar:
#     for query in allQueries:
#         query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
#         website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
#         if not website:
#             misspelledWords = "";
            
#             try:
#                 lang = detect(query) # -- detects a language 
#                 misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                

In [28]:
edits = spell.edit_distance_1(word)

In [29]:
spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

test = ['it consulting',
 'JP Morgan Chase data',
 'fun violence control jp constitutional']
test.append('we add www and http jp intentionally')
netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

for query in test:
    query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
    website = [mod for mod in netModifiers if(mod in query)]
    
    if not website:
        misspelledWords = "";
        
        lang = detect(query) # -- detects a language 
        print(query)
        misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
        print('length of',len(misspelled))
        found = 0
        oneOff = 0
        kidsMis = 0
        for word in misspelled:
            misspelledWords += " " + word  # -- concatinating the misspelled words
            print('word = ', list(word))
            print(list(misspelledWords))
            if word in kidsMispelled:
                kidsMis +=1
            candid = spell.candidates(word) # --- candidates() displays the set of words that are close to the word entered e.g: 'country', 'count', 'counter' if country is entered
            edits = spell.edit_distance_1(word) # -- Compute all strings that are one edit away from `word` 
                    
            for can in candid:
                if can in edits:
                    oneOff += 1 # -- one letter off 
                break

        oneOffError.append(oneOff)
        #print(oneOffError)
        print('final', misspelled)
        spellingError.append(len(misspelled))
        #print(spellingError)
        kidsError.append(kidsMis)
        #print(kidsError)
        misspelledCol.append(misspelledWords)
        print(misspelledCol)

it consulting
length of 0
final set()
['']
JP Morgan Chase data
length of 1
word =  ['j', 'p']
[' ', 'j', 'p']
final {'jp'}
['', ' jp']
fun violence control jp constitutional
length of 1
word =  ['j', 'p']
[' ', 'j', 'p']
final {'jp'}
['', ' jp', ' jp']


In [30]:
'' + ' ' + 'A'

' A'

In [31]:
website

['www', 'http']

In [32]:
# ---- CHECKING ----

spell = SpellChecker()

spellingError = []
oneOffError = []
kidsError = []
misspelledCol = []

netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']

with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:
        query =  query.translate(str.maketrans('', '', string.punctuation)) # -- remove all panctuations
        website = [mod for mod in netModifiers if(mod in query)] # --- save the netModifiers in the queries
        if not website:
            misspelledWords = "";
            try:
                lang = detect(query) # -- detects a language 
                misspelled = spell.unknown(query.split(" ")) # -- unknown gives the misspelled words
                found = 0
                oneOff = 0
                kidsMis = 0
                for word in misspelled:
                    misspelledWords += " " + word # -- concatinating the misspelled words
                    
                    misspelledWords = ' '.join(misspelledWords.split())
                    
                    if word in kidsMispelled:
                        kidsMis +=1
                    candid = spell.candidates(word) # --- candidates() displays the set of words that are close to the word entered e.g: 'country', 'count', 'counter' if country is entered
                    edits = spell.edit_distance_1(word) # -- Compute all strings that are one edit away from `word` 
                    
                    for can in candid:
                        if can in edits:
                            oneOff += 1 # -- one letter off 
                        break

                oneOffError.append(oneOff)
                print('mispelled', misspelled)
                spellingError.append(len(misspelled))
                kidsError.append(kidsMis)
                misspelledCol.append(misspelledWords)
                
            except:
#                 print(misspelled)
                oneOffError.append(-1)
                spellingError.append(-1)
                kidsError.append(-1)
                misspelledCol.append(misspelledWords)

        else:
            spellingError.append(0)
            oneOffError.append(0)
            kidsError.append(0)
            misspelledCol.append("none")

        pbar.update()
    
spelling = pd.DataFrame(data=spellingError, columns = ['numSpellingErrors'])
spelling['query'] = allQueries
spelling['offByOne'] = oneOffError
spelling['kidsError'] = kidsError
# spelling['ms'] = misspelledCol


  0%|          | 0/1505 [00:00<?, ?it/s]

mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()


  0%|          | 5/1505 [00:00<00:37, 40.16it/s]

mispelled {'swahili', 'traiditional'}
mispelled {'boolywood'}
mispelled set()
mispelled set()
mispelled {'bollywoods'}
mispelled set()
mispelled set()
mispelled {'vs', 'voip'}
mispelled set()
mispelled set()
mispelled set()


  1%|          | 16/1505 [00:00<00:27, 54.83it/s]

mispelled set()
mispelled {'phd'}


  1%|▏         | 22/1505 [00:00<00:28, 51.32it/s]

mispelled {'swahili'}
mispelled set()
mispelled {'merck'}
mispelled {'', 'pocono'}
mispelled {''}
mispelled set()
mispelled set()


  2%|▏         | 35/1505 [00:00<00:40, 36.72it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}


  3%|▎         | 40/1505 [00:01<00:43, 33.79it/s]

mispelled set()
mispelled {'voip'}
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'whre'}
mispelled set()
mispelled set()
mispelled {'piigs'}
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled {'dutta'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  4%|▍         | 60/1505 [00:01<00:25, 56.04it/s]

mispelled {'swahili'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}


  5%|▍         | 69/1505 [00:01<00:28, 50.35it/s]

mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'sillicon'}
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()


  5%|▌         | 82/1505 [00:01<00:31, 45.34it/s]

mispelled set()


  6%|▌         | 94/1505 [00:02<00:37, 37.26it/s]

mispelled {''}
mispelled {'hydoelectric'}
mispelled {'merck'}
mispelled set()
mispelled {'pocono', 'okc'}
mispelled set()
mispelled {'kursk'}
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  7%|▋         | 99/1505 [00:02<00:58, 24.01it/s]

mispelled {'nyc'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  8%|▊         | 120/1505 [00:03<00:36, 37.72it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


  9%|▉         | 142/1505 [00:03<00:27, 49.20it/s]

mispelled {'rheumatology'}
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}


 10%|▉         | 150/1505 [00:04<00:38, 35.50it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled {'pagents'}
mispelled {'voip'}


 11%|█         | 164/1505 [00:04<00:38, 34.88it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'countrys'}
mispelled {''}


 11%|█         | 169/1505 [00:04<00:46, 28.89it/s]

mispelled set()


 12%|█▏        | 182/1505 [00:05<00:59, 22.18it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'exsists'}
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled

 12%|█▏        | 186/1505 [00:05<00:57, 22.97it/s]

 set()
mispelled set()
mispelled {'', 'swahili'}
mispelled set()
mispelled {'connecticu'}


 13%|█▎        | 194/1505 [00:06<00:52, 25.07it/s]

mispelled set()
mispelled set()
mispelled {'jp'}
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 13%|█▎        | 199/1505 [00:06<00:44, 29.37it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()


 14%|█▍        | 217/1505 [00:06<00:29, 43.37it/s]

mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {''}
mispelled set()


 15%|█▍        | 223/1505 [00:06<00:38, 33.67it/s]

mispelled {'tradions'}
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled {'voip'}


 16%|█▌        | 239/1505 [00:06<00:26, 47.88it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'kursk'}
mispelled {'pocono'}
mispelled {'quiting'}
mispelled set()
mispelled set()


 17%|█▋        | 250/1505 [00:07<00:32, 39.15it/s]

mispelled {'amt'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()


 17%|█▋        | 255/1505 [00:07<00:34, 36.47it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled {'ncaa'}
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()


 18%|█▊        | 270/1505 [00:08<00:41, 29.46it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'jpmorgan'}
mispelled {'attractios'}
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled {'temparature'}
mispelled set()
mispelled set()


 19%|█▉        | 290/1505 [00:08<00:29, 40.90it/s]

mispelled {'suny'}
mispelled {'voip'}
mispelled {'cluture'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'shinto'}
mispelled {'jp'}
mispelled {'voip'}


 20%|█▉        | 298/1505 [00:08<00:26, 45.45it/s]

mispelled {'hobart'}
mispelled set()
mispelled set()


 20%|██        | 304/1505 [00:08<00:34, 34.67it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled {'anc', 'merck'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}


 21%|██        | 319/1505 [00:09<00:32, 36.41it/s]

mispelled set()
mispelled {'swahili', 'traiditional'}
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled

 22%|██▏       | 332/1505 [00:09<00:25, 46.51it/s]

 {'phd'}
mispelled set()
mispelled {'kursk'}
mispelled set()


 23%|██▎       | 350/1505 [00:09<00:20, 56.46it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()


 24%|██▎       | 357/1505 [00:10<00:42, 26.77it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 24%|██▍       | 367/1505 [00:10<00:32, 35.30it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'phd'}
mispelled set()


 25%|██▍       | 375/1505 [00:10<00:36, 31.36it/s]

mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()


 26%|██▌       | 387/1505 [00:11<00:36, 30.83it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'kursk'}
mispelled set()


 26%|██▌       | 392/1505 [00:11<00:35, 31.24it/s]

mispelled {'tradions'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 27%|██▋       | 404/1505 [00:11<00:38, 28.41it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'miracet'}
mispelled set()
mispelled set()
mispelled set()


 28%|██▊       | 415/1505 [00:12<00:30, 35.97it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'healty'}
mispelled {'swahili'}
mispelled set()


 28%|██▊       | 426/1505 [00:12<00:22, 47.56it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'besy'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'chary'}
mispelled set()
mispelled

 29%|██▉       | 433/1505 [00:12<00:23, 45.13it/s]

 set()
mispelled set()
mispelled set()


 29%|██▉       | 439/1505 [00:12<00:29, 35.68it/s]

mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled {''}


 30%|███       | 455/1505 [00:13<00:28, 36.52it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'', 'suny', 'internationsl'}
mispelled {'voip'}
mispelled set()


 31%|███       | 470/1505 [00:13<00:26, 38.81it/s]

mispelled {'swahili'}
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 32%|███▏      | 479/1505 [00:14<00:36, 28.07it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 33%|███▎      | 492/1505 [00:14<00:25, 40.36it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kenyan'}
mispelled set()
mispelled set()
mispelled {'pocono'}


 33%|███▎      | 498/1505 [00:14<00:25, 39.30it/s]

mispelled {'legislations', 'merck'}
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}


 34%|███▍      | 512/1505 [00:14<00:27, 36.65it/s]

mispelled set()
mispelled set()
mispelled {'asain'}
mispelled set()
mispelled set()
mispelled {'', 'jp'}
mispelled {'academyt'}
mispelled set()
mispelled set()
mispelled set()


 34%|███▍      | 517/1505 [00:15<00:25, 38.64it/s]

mispelled {'kursk'}
mispelled set()


 35%|███▍      | 522/1505 [00:15<00:31, 31.01it/s]

mispelled set()
mispelled {'kwh'}
mispelled set()
mispelled set()
mispelled set()


 35%|███▍      | 526/1505 [00:15<00:41, 23.71it/s]

mispelled set()
mispelled {''}
mispelled set()
mispelled {'anc', 'merck'}
mispelled set()
mispelled set()
mispelled {'vs', 'phd'}
mispelled set()
mispelled set()
mispelled {'ncaa'}


 36%|███▌      | 541/1505 [00:16<00:36, 26.41it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled {'swahili'}


 36%|███▌      | 545/1505 [00:16<00:47, 20.26it/s]

mispelled {'philophical'}
mispelled {'suny'}
mispelled set()


 37%|███▋      | 555/1505 [00:16<00:35, 26.50it/s]

mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {'vs', 'jp'}
mispelled {'merck'}
mispelled {'oscas'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()


 38%|███▊      | 567/1505 [00:16<00:22, 42.41it/s]

mispelled {'effieiency'}
mispelled {'pagents'}
mispelled set()
mispelled set()
mispelled {'gs'}
mispelled set()


 38%|███▊      | 579/1505 [00:17<00:27, 33.41it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}


 39%|███▉      | 594/1505 [00:17<00:16, 54.10it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'shinto'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 40%|████      | 608/1505 [00:18<00:25, 34.79it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled {'philedelphia'}
mispelled {'swahili'}
mispelled {'aprport'}
mispelled

 42%|████▏     | 630/1505 [00:18<00:14, 59.57it/s]

 set()
mispelled set()
mispelled {'exsistence'}
mispelled {'kursk'}
mispelled set()
mispelled {'voip'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'b'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 42%|████▏     | 639/1505 [00:18<00:25, 33.90it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 44%|████▍     | 663/1505 [00:19<00:17, 47.95it/s]

mispelled {'assement'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled {'', 'dehumidifiers'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()


 45%|████▍     | 672/1505 [00:19<00:28, 28.96it/s]

mispelled {'hobart'}
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled {'swahili', 'cusine'}
mispelled {'kenyan'}
mispelled {'pocono'}


 46%|████▌     | 690/1505 [00:20<00:18, 42.96it/s]

mispelled {'kursk'}
mispelled {'vs'}
mispelled {''}
mispelled {'legislations', 'merck', 'lobbie'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'foos'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'dutta'}
mispelled {'kenyan'}
mispelled set()


 47%|████▋     | 700/1505 [00:20<00:17, 46.28it/s]

mispelled set()
mispelled {'swahili'}


 47%|████▋     | 714/1505 [00:20<00:18, 41.69it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled {'', 'merck'}
mispelled set()
mispelled {'kiursk'}
mispelled set()


 48%|████▊     | 729/1505 [00:20<00:13, 59.02it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()


 49%|████▉     | 737/1505 [00:21<00:15, 50.86it/s]

mispelled {'suny'}
mispelled {'swahili', 'kenyan'}
mispelled set()


 49%|████▉     | 744/1505 [00:21<00:22, 33.97it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'pocono'}


 50%|█████     | 754/1505 [00:21<00:20, 36.91it/s]

mispelled {'dc'}
mispelled {''}


 50%|█████     | 759/1505 [00:21<00:24, 30.93it/s]

mispelled {'merck'}
mispelled set()
mispelled set()


 51%|█████     | 768/1505 [00:22<00:25, 28.98it/s]

mispelled {'telefonas'}
mispelled {'voip'}
mispelled {'kenyan'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled {'voip'}


 52%|█████▏    | 787/1505 [00:22<00:14, 50.95it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'dc'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 53%|█████▎    | 794/1505 [00:22<00:14, 50.53it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'cluture'}


 53%|█████▎    | 800/1505 [00:22<00:18, 38.45it/s]

mispelled {'swahili'}
mispelled set()
mispelled set()


 53%|█████▎    | 805/1505 [00:23<00:31, 22.11it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'diease'}
mispelled set()


 54%|█████▍    | 815/1505 [00:23<00:29, 23.76it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phds'}
mispelled set()
mispelled set()


 55%|█████▌    | 829/1505 [00:24<00:23, 28.42it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kiursk'}
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled set()
mispelled set()


 57%|█████▋    | 852/1505 [00:24<00:16, 40.02it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 57%|█████▋    | 857/1505 [00:25<00:19, 32.40it/s]

mispelled set()
mispelled set()
mispelled {'swahili'}


 58%|█████▊    | 869/1505 [00:25<00:20, 31.74it/s]

mispelled {'jp', 'microsoft'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled {'dutta'}


 58%|█████▊    | 880/1505 [00:25<00:17, 35.96it/s]

mispelled {'dishe', 'swahili'}
mispelled {'culp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}


 59%|█████▊    | 884/1505 [00:25<00:18, 34.40it/s]

mispelled {'recipies', 'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 60%|█████▉    | 897/1505 [00:26<00:17, 34.19it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'v'}


 60%|██████    | 906/1505 [00:26<00:20, 29.55it/s]

mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 61%|██████    | 916/1505 [00:26<00:14, 41.76it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'arther'}
mispelled set()
mispelled set()
mispelled {'merck'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()


 63%|██████▎   | 941/1505 [00:27<00:10, 53.51it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 63%|██████▎   | 949/1505 [00:27<00:10, 52.89it/s]

mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'pagent'}


 64%|██████▎   | 957/1505 [00:27<00:09, 57.31it/s]

mispelled {'', 'pagents'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'suny'}
mispelled {'vs'}
mispelled set()
mispelled {'jp'}


 64%|██████▍   | 964/1505 [00:27<00:17, 31.47it/s]

mispelled {'suny', 'plattsburgh'}
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 65%|██████▌   | 979/1505 [00:28<00:13, 39.24it/s]

mispelled set()
mispelled {'voip'}
mispelled set()
mispelled {'dehumidifiers'}
mispelled {'suny'}
mispelled {'pocono'}
mispelled {'legistlation', 'merck'}
mispelled set()
mispelled set()
mispelled {'evaulation'}
mispelled set()
mispelled set()


 66%|██████▌   | 995/1505 [00:28<00:11, 45.68it/s]

mispelled {'swahili'}
mispelled {'kursk'}
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled

 67%|██████▋   | 1001/1505 [00:28<00:10, 46.05it/s]

 set()
mispelled {'walmart'}
mispelled {''}
mispelled set()
mispelled set()
mispelled {'ncaa'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()


 68%|██████▊   | 1019/1505 [00:29<00:09, 50.84it/s]

mispelled set()
mispelled set()
mispelled {'voip'}
mispelled {'jp', 'compulational'}
mispelled set()
mispelled {'dutta'}
mispelled {'diease'}
mispelled set()
mispelled set()
mispelled set()


 68%|██████▊   | 1025/1505 [00:29<00:09, 50.78it/s]

mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled {'pocono'}
mispelled {'pocono', 'expedia'}


 69%|██████▉   | 1039/1505 [00:29<00:09, 47.85it/s]

mispelled {''}
mispelled set()
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {'', 'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled set()


 70%|██████▉   | 1050/1505 [00:29<00:07, 59.24it/s]

mispelled set()
mispelled set()
mispelled {'jp'}
mispelled set()
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled {'voip'}
mispelled {'swahili'}
mispelled {'recepies'}


 70%|███████   | 1057/1505 [00:30<00:15, 28.43it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kwh'}
mispelled set()


 71%|███████   | 1062/1505 [00:30<00:16, 27.28it/s]

mispelled {'minortiys'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'ncaa', 'runnerup'}


 71%|███████   | 1067/1505 [00:30<00:20, 21.08it/s]

mispelled {'swahili'}
mispelled set()
mispelled {'vs'}
mispelled

 71%|███████▏  | 1073/1505 [00:31<00:18, 23.48it/s]

 set()
mispelled {'pocono'}
mispelled {'scheduele'}
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 72%|███████▏  | 1088/1505 [00:31<00:11, 35.74it/s]

mispelled {''}
mispelled set()
mispelled {'servies'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'benifits', 'phd'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 73%|███████▎  | 1095/1505 [00:31<00:11, 36.47it/s]

mispelled set()
mispelled {'jpmorgan'}
mispelled set()
mispelled set()


 73%|███████▎  | 1105/1505 [00:31<00:13, 30.09it/s]

mispelled {'roadtrip'}
mispelled {'merck'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'phd'}
mispelled {'', 'jp'}
mispelled set()
mispelled set()


 75%|███████▍  | 1123/1505 [00:32<00:10, 38.01it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'arther'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'swahili'}
mispelled set()
mispelled {'pocono'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'jp'}


 75%|███████▌  | 1135/1505 [00:32<00:10, 33.84it/s]

mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled {''}
mispelled set()
mispelled {'', 'pagents'}
mispelled {'kursk'}
mispelled set()


 77%|███████▋  | 1159/1505 [00:33<00:07, 43.34it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'vs'}
mispelled {'keto'}
mispelled {'suny'}
mispelled {'suny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'cerimonies'}


 78%|███████▊  | 1168/1505 [00:33<00:06, 50.76it/s]

mispelled set()
mispelled set()
mispelled {'suny'}
mispelled set()
mispelled {'kursk'}
mispelled set()
mispelled {'swahili'}
mispelled {'trax'}
mispelled {'autralia'}


 78%|███████▊  | 1180/1505 [00:34<00:11, 29.00it/s]

mispelled {'', 'merck'}
mispelled {'pocono'}
mispelled {'jp'}
mispelled {'swahili'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 79%|███████▉  | 1187/1505 [00:34<00:09, 34.45it/s]

mispelled {'kursk'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'dehumidifiers'}
mispelled {''}
mispelled set()


 79%|███████▉  | 1192/1505 [00:34<00:14, 21.92it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 81%|████████  | 1213/1505 [00:35<00:08, 32.50it/s]

mispelled {'jp'}
mispelled set()
mispelled {'kursk'}
mispelled {'fouris'}
mispelled set()
mispelled set()
mispelled {'fouris'}
mispelled set()
mispelled set()
mispelled {'amozan'}
mispelled {'ferious'}
mispelled {'ferious', 'dominics'}
mispelled {'cheeta'}
mispelled {'ferious'}
mispelled {'ferious'}
mispelled {'ferious'}
mispelled {'2nd'}
mispelled

 81%|████████  | 1220/1505 [00:35<00:07, 36.37it/s]

 {'edwads'}
mispelled set()
mispelled set()
mispelled set()


 81%|████████▏ | 1225/1505 [00:35<00:08, 33.82it/s]

mispelled {'langenges'}
mispelled {'ttm'}
mispelled set()
mispelled {'23rd'}
mispelled set()


 82%|████████▏ | 1237/1505 [00:36<00:07, 35.82it/s]

mispelled {'pintrest'}
mispelled set()
mispelled set()
mispelled {'leamer'}
mispelled {'taler'}
mispelled set()
mispelled {'poler'}
mispelled {'ioin', 'whoh'}
mispelled {'chetta'}


 83%|████████▎ | 1242/1505 [00:36<00:08, 31.73it/s]

mispelled {'florider'}
mispelled {'tranksformers', 'moviey'}
mispelled {'comeing'}
mispelled {'lity'}
mispelled set()
mispelled {'armer'}
mispelled {'esan'}
mispelled set()


 83%|████████▎ | 1251/1505 [00:36<00:07, 32.27it/s]

mispelled {'selelena'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'whitee'}
mispelled set()
mispelled set()
mispelled {'comieng'}


 84%|████████▎ | 1260/1505 [00:36<00:08, 29.01it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'extirced'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'facrs'}
mispelled set()
mispelled set()


 85%|████████▍ | 1275/1505 [00:37<00:09, 23.38it/s]

mispelled {'jurrasic', 'wortl'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'baymax'}
mispelled set()
mispelled {'macdre'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 86%|████████▌ | 1294/1505 [00:38<00:05, 39.48it/s]

mispelled {'feey'}
mispelled {'puth'}
mispelled set()
mispelled set()
mispelled {'skarch'}
mispelled set()
mispelled set()
mispelled {'talor'}
mispelled {'sward'}
mispelled {'30th'}
mispelled set()
mispelled set()
mispelled {'moive', 'huray', 'larax'}
mispelled set()
mispelled set()
mispelled {'furiouse'}
mispelled set()
mispelled {'hallows'}
mispelled set()
mispelled {'giraf'}
mispelled set()


 87%|████████▋ | 1304/1505 [00:38<00:05, 36.31it/s]

mispelled {'knoledg', 'scintest'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 87%|████████▋ | 1316/1505 [00:38<00:05, 35.15it/s]

mispelled set()
mispelled {'seahorces'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'newst', 'gfazzy'}
mispelled set()


 88%|████████▊ | 1325/1505 [00:39<00:05, 31.14it/s]

mispelled {'f', 'jonh', 'kennadys'}
mispelled {'', 'ttm'}
mispelled set()
mispelled {'bb8'}
mispelled {'wgat'}
mispelled {'weeknds'}
mispelled {'starwars'}
mispelled set()
mispelled set()


 89%|████████▊ | 1332/1505 [00:39<00:04, 35.02it/s]

mispelled set()
mispelled set()
mispelled {'draffs'}
mispelled set()
mispelled set()
mispelled {'stanly', 'supperre'}
mispelled set()
mispelled {'explosin'}


 89%|████████▉ | 1340/1505 [00:39<00:05, 29.61it/s]

mispelled {'flote', 'anabell'}
mispelled {'poler'}
mispelled set()
mispelled {'anabell'}
mispelled {'comeing', 'vingis'}
mispelled {'youtub'}
mispelled {'insidous'}
mispelled {'chronicels', 'narnia'}


 90%|████████▉ | 1347/1505 [00:39<00:04, 35.55it/s]

mispelled set()
mispelled {'sissors'}
mispelled {'cs'}
mispelled set()
mispelled {'propums', 'envirormental'}


 90%|█████████ | 1355/1505 [00:40<00:04, 30.92it/s]

mispelled {'buiolt', 'm', 'donlads'}
mispelled {'kylo'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'caculator'}
mispelled

 91%|█████████ | 1365/1505 [00:40<00:03, 38.62it/s]

 set()
mispelled {'terminater'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 91%|█████████ | 1373/1505 [00:40<00:03, 33.14it/s]

mispelled {'starwars', 'chewey'}
mispelled set()
mispelled {'mincraft'}
mispelled set()
mispelled set()
mispelled {'mamoth'}


 92%|█████████▏| 1382/1505 [00:40<00:04, 25.45it/s]

mispelled {'anebelle'}
mispelled set()
mispelled {'beiber'}
mispelled set()
mispelled set()
mispelled {'lukes'}
mispelled set()
mispelled {'starwars'}


 92%|█████████▏| 1388/1505 [00:41<00:04, 24.66it/s]

mispelled {'dj'}
mispelled set()
mispelled {'anibelle'}
mispelled set()
mispelled set()


 92%|█████████▏| 1391/1505 [00:41<00:04, 24.09it/s]

mispelled {'starwars', 'kylo'}
mispelled {'coll'}
mispelled set()
mispelled set()
mispelled {'feathres'}
mispelled set()
mispelled set()
mispelled {'furiouse'}
mispelled set()
mispelled {'furiouse'}
mispelled set()
mispelled set()
mispelled {'johny', 'enlgish'}


 93%|█████████▎| 1402/1505 [00:41<00:03, 31.45it/s]

mispelled {'johny'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'5th'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'incredibles'}


 95%|█████████▍| 1426/1505 [00:42<00:01, 44.41it/s]

mispelled {'mrincredible'}
mispelled set()
mispelled {'incredibles'}
mispelled set()
mispelled {'freddys'}
mispelled {'gorden'}
mispelled set()
mispelled {'kylo'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'sumdog'}
mispelled {'antman'}
mispelled set()


 96%|█████████▌| 1438/1505 [00:42<00:01, 44.19it/s]

mispelled {'antman'}
mispelled {'v'}
mispelled set()
mispelled set()
mispelled {'polr', 'swin'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'dvd'}


 96%|█████████▋| 1451/1505 [00:42<00:01, 51.41it/s]

mispelled {'bautey'}
mispelled set()
mispelled {'cheeta'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'krois'}
mispelled set()
mispelled {'lirycs'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'', 'vs'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()


 97%|█████████▋| 1463/1505 [00:42<00:00, 60.43it/s]

mispelled {'ohohoh'}
mispelled {'starwars'}


 98%|█████████▊| 1470/1505 [00:43<00:00, 37.11it/s]

mispelled {'starwars'}
mispelled set()
mispelled set()
mispelled set()
mispelled {'girrafe'}
mispelled set()


 98%|█████████▊| 1475/1505 [00:43<00:00, 30.67it/s]

mispelled set()
mispelled set()
mispelled set()
mispelled {'starwars'}
mispelled set()
mispelled set()
mispelled set()


100%|██████████| 1505/1505 [00:43<00:00, 34.45it/s]

mispelled set()
mispelled set()
mispelled {'', 'bagan', 'billbow'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'mutan'}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'kylo'}
mispelled set()
mispelled {''}
mispelled set()
mispelled set()
mispelled set()
mispelled set()
mispelled {'awakends'}
mispelled {'foxs'}
mispelled set()
mispelled set()
mispelled set()





In [33]:
spelling.head(15)

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError
0,0,becoming a fireman,0,0
1,1,hotel in Pocono Mountains,1,0
2,0,wedding traditions buddhism,0,0
3,0,diversification in hiring,0,0
4,2,traiditional swahili recipes,1,0
5,1,Boolywood in hollywood,1,0
6,0,week long vacation idea,0,0
7,0,efficiency of solar panels,0,0
8,1,bollywood's increasing popularity,1,0
9,0,New York City,0,0


In [34]:
cc = 'You hate a ddog'
m = spell.unknown(cc.split(" "))
m

{'ddog'}

In [35]:
dd = 'how god exsists'
m = spell.unknown(dd.split(" "))
m

{'exsists'}

In [36]:
spelling

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError
0,0,becoming a fireman,0,0
1,1,hotel in Pocono Mountains,1,0
2,0,wedding traditions buddhism,0,0
3,0,diversification in hiring,0,0
4,2,traiditional swahili recipes,1,0
...,...,...,...,...
1500,1,Who plays the bad guy in Star Wars the Horde a...,1,0
1501,1,What is a fox's favorite kind of food?,1,0
1502,0,"Show me the movie called ""The Martian""",0,0
1503,0,What is the biggest rock found on Mars?,0,0


In [37]:
spelling['kidsError'].unique()

array([ 0, -1,  1])

In [38]:
# spelling.ms[631]

In [39]:
spelling[spelling['kidsError']==1]

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError
21,2,Pocono Mountains things to do,2,1
22,1,Connecticut Fire Academy,1,1
76,1,can a person continue sport after getting col...,1,1
84,1,what is collagen disease,1,1
110,1,"homicide "" martin bryant""",1,1
167,1,US government spending +space exploration,1,1
179,1,chateau resort getting to,1,1
184,2,Kenya Swahili dishes,1,1
217,1,employee evaluation tips,1,1
223,1,road trip + vacation,1,1


In [40]:
# len(s.edit_distance_2('world'))

In [41]:
q = "what is the nme of www"
netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil', '.com']
w = [mod for mod in netModifiers if(mod in q)]
w

['www']

In [42]:
lan = 'oppure'
detect(lan)

'no'

In [43]:
s = SpellChecker()

mis = s.unknown(['countr', 'hapen'])


for w in mis:
    print(w, s.correction(w))
    
    print(s.candidates(w))
 

hapen happen
{'hagen', 'hacen', 'haven', 'happen', 'haben'}
countr country
{'count', 'counts', 'country', 'county', 'counter'}


In [44]:
[w for w in mis]

['hapen', 'countr']

In [45]:
print(word)

foxs


In [46]:
d = 'Ibra, and Micheal!' 
print(d.translate(str.maketrans('', '', string.punctuation)))

Ibra and Micheal


# Extract Punctuation And Casing Features

The following block of code extracts spelling and casing features before adding them to the dataframe. 

In [47]:
invalidcharacters= set(['!', ',', '.', '?'])
punct = []
casing = []
with tqdm(total = len(allQueries) ) as pbar:
    for query in allQueries:

        if any(char in invalidcharacters for char in query):
            if any(substring in query for substring in netModifiers):
                punct.append(0)
            else:
                punct.append(1)
        else:
            punct.append(0)

        if query.islower():
            casing.append(0)
        else:
            casing.append(1)
        pbar.update()
        
spelling['punct'] = punct
spelling['casing'] = casing



100%|██████████| 1505/1505 [00:00<00:00, 475512.43it/s]


In [48]:
spelling

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,punct,casing
0,0,becoming a fireman,0,0,0,0
1,1,hotel in Pocono Mountains,1,0,0,1
2,0,wedding traditions buddhism,0,0,0,0
3,0,diversification in hiring,0,0,0,0
4,2,traiditional swahili recipes,1,0,0,0
...,...,...,...,...,...,...
1500,1,Who plays the bad guy in Star Wars the Horde a...,1,0,1,1
1501,1,What is a fox's favorite kind of food?,1,0,1,1
1502,0,"Show me the movie called ""The Martian""",0,0,0,1
1503,0,What is the biggest rock found on Mars?,0,0,1,1


# Return Feature Set

Returns dataframe with spelling and punctuation features.

In [49]:
pickle.dump(spelling, open( "Pickles/SPFeat.p", "wb" ) )

In [50]:
spelling.head(30)

Unnamed: 0,numSpellingErrors,query,offByOne,kidsError,punct,casing
0,0,becoming a fireman,0,0,0,0
1,1,hotel in Pocono Mountains,1,0,0,1
2,0,wedding traditions buddhism,0,0,0,0
3,0,diversification in hiring,0,0,0,0
4,2,traiditional swahili recipes,1,0,0,0
5,1,Boolywood in hollywood,1,0,0,1
6,0,week long vacation idea,0,0,0,0
7,0,efficiency of solar panels,0,0,0,0
8,1,bollywood's increasing popularity,1,0,0,0
9,0,New York City,0,0,0,1


In [51]:
spelling.shape

(1505, 6)

In [52]:
spelling.columns

Index(['numSpellingErrors', 'query', 'offByOne', 'kidsError', 'punct',
       'casing'],
      dtype='object')

In [53]:
print('done')

done


In [54]:
# expected shape: (1505, 6)