# Loading Dataset

In [1]:
import pandas as pd

In [2]:
# Load the raw email table from email.csv (path is relative to the working directory)
# and preview the first rows. In the preview, primary_email / secondary_email
# hold the text form of a list of addresses, e.g. '["a@x.com"]' or '[]'.
email_df = pd.read_csv("email.csv")
email_df.head()

Unnamed: 0,primary_email,secondary_email
0,"[""info@exitbeveiliging.nl""]",[]
1,"[""meister@klima-lange.de""]",[]
2,"[""jules@autourduparquet.fr""]",[]
3,"[""hedylapkin@gmail.com""]",[]
4,"[""bucuresti@oltextur.ro""]",[]


# Unpacking Email

In [3]:
def unpack_email(email_data, columns=("primary_email", "secondary_email")):
    """Flatten list-like email columns into one flat list of addresses.

    Each cell is expected to hold the text form of a list, for example
    ``'["a@x.com", "b@x.com"]'`` or ``'[]'``.

    Parameters
    ----------
    email_data : mapping-like
        Anything where ``email_data[col]`` yields the cells of a column
        (a pandas DataFrame, or a plain dict of lists).
    columns : iterable of str
        Names of the columns to unpack, processed in the given order.

    Returns
    -------
    list of str
        Every non-empty address, column by column and row by row.
        Duplicates are kept so that later frequency counts stay meaningful.
    """
    import json  # local import so the cell does not depend on another cell's imports

    email_list = []
    for col in columns:
        for item in email_data[col]:
            # Missing cells arrive as NaN/None rather than str; they carry no addresses.
            if not isinstance(item, str):
                continue
            try:
                parsed = json.loads(item)
            except ValueError:
                parsed = None
            if isinstance(parsed, list):
                candidates = [entry for entry in parsed if isinstance(entry, str)]
            else:
                # Not a JSON list: fall back to manual splitting. Splitting on ","
                # (not ", ") also handles cells written without a space after the comma.
                candidates = item.strip("][").replace('"', "").split(",")
            for candidate in candidates:
                address = candidate.strip()
                if address:
                    email_list.append(address)
    return email_list

In [4]:
# Flatten both email columns of email_df into a single list of address strings.
email_ls = unpack_email(email_df)

In [5]:
# Display the flattened address list.
# NOTE(review): this renders the entire list; a slice such as email_ls[:10] would keep the output short.
email_ls

['info@exitbeveiliging.nl',
 'meister@klima-lange.de',
 'jules@autourduparquet.fr',
 'hedylapkin@gmail.com',
 'bucuresti@oltextur.ro',
 'dennisscheunhage@me.com',
 'info@korthcapital.com',
 'popupmediaoy.fi.p.111616-9@docinbound.com',
 'toimisto@popupmedia.fi',
 'info@susanne-muehlum.de',
 'contato@midiasregionais.com.br',
 'contato@montichelassessoria.com.br',
 'montichel@montichelassessoria.com.br',
 'amy.tx.transcare@gmail.com',
 'info.txmeddevice@gmail.com',
 'mike.transcare@gmail.com',
 'support@adsplm.com',
 'anniekoh.realestate@gmail.com',
 'dpo@propnex.com',
 'enquiry@propnex.com',
 'contact@tamudacultures.org',
 'contact@tamudaculutres.org',
 'romain@yachtingexperience.net',
 'info@enoteca.co.th',
 'support@adsplm.com',
 'va@viktorlissy.com',
 'ad.pearland@isgh.org',
 'mo@brandparents.com',
 'contact@tintrel.com',
 'shanicej10@yahoo.com',
 'careers@edgecloudworks.com',
 'paytocert@edgecloudworks.com',
 'nsunaga@gunma-u.ac.jp',
 'hello@shouldwego.co',
 'townofsterlingclerk@gmai

# Fetch email prefix

In [6]:
import re

In [7]:
def fetch_email_prefix(email_list):
    """Extract the local part (the text before ``@``) of each address.

    Parameters
    ----------
    email_list : iterable
        Candidate email address strings. Non-string entries are skipped.

    Returns
    -------
    list of str
        One prefix per entry that contains a recognisable local part followed
        by ``@``, in input order. Entries without a match are dropped, so the
        result may be shorter than the input.
    """
    # Raw string: sequences such as \w and \- are not valid Python string escapes
    # and trigger warnings when written in a plain literal.
    # The class is \w plus "." and the punctuation allowed in an unquoted local
    # part. The characters are written without separating spaces, because inside
    # [...] a space is itself a member and would let "john doe@x" yield "john doe".
    email_pattern = re.compile(r"([\w!#$%&'*+\-/=?^`.{|}~]+)@")

    email_l = []
    for email in email_list:
        if not isinstance(email, str):
            continue
        # search (not match) so leading junk such as "<" before the address is tolerated.
        prefix = email_pattern.search(email)
        if prefix:
            email_l.append(prefix.group(1))
    return email_l

In [8]:
# Extract the part before "@" from every unpacked address (entries without a match are dropped).
email_prefix_list = fetch_email_prefix(email_ls)

In [9]:
# Display the extracted prefixes.
# NOTE(review): this renders the entire list; a slice such as email_prefix_list[:10] would keep the output short.
email_prefix_list

['info',
 'meister',
 'jules',
 'hedylapkin',
 'bucuresti',
 'dennisscheunhage',
 'info',
 'popupmediaoy.fi.p.111616-9',
 'toimisto',
 'info',
 'contato',
 'contato',
 'montichel',
 'amy.tx.transcare',
 'info.txmeddevice',
 'mike.transcare',
 'support',
 'anniekoh.realestate',
 'dpo',
 'enquiry',
 'contact',
 'contact',
 'romain',
 'info',
 'support',
 'va',
 'ad.pearland',
 'mo',
 'contact',
 'shanicej10',
 'careers',
 'paytocert',
 'nsunaga',
 'hello',
 'townofsterlingclerk',
 'enquiries',
 'info',
 'hpf',
 'post',
 'dpo',
 'enquiry',
 'iamcatong',
 'support',
 'financeiro',
 'contact',
 'juiceduplashes',
 'katie',
 'dpo',
 'enquiry',
 'luqmanhakimproperties',
 'hi',
 'sales',
 'craftcircus.supplies',
 'dpo',
 'enquiry',
 'maxhuang.projects',
 'info',
 'chris',
 'info',
 'info',
 'info',
 'kadlecek',
 'dpo',
 'enquiry',
 'ykk79000',
 'contact',
 'matthew',
 'thomas',
 'contacto',
 'info',
 'mccullough.brandt',
 'poundsand',
 'info',
 'info',
 'info',
 'pptaf87',
 'help',
 'info',
 'i

In [10]:
# Wrap the prefixes in a single-column DataFrame ("email prefix") so pandas can count them.
email_prefix_df = pd.DataFrame(email_prefix_list,columns=["email prefix"])

In [11]:
# Occurrence count per prefix, limited to the first 50 entries of the (descending) count table.
email_prefix_df.value_counts()[:50]

email prefix   
info               70353
support             9906
contact             9495
hello               6302
sales               5402
admin               3994
office              2446
contato             2300
kontakt             1876
privacy             1846
mail                1505
contacto            1406
biuro               1023
service              999
enquiries            843
help                 816
customerservice      769
webmaster            749
marketing            708
hola                 704
enteryour            665
dpo                  590
john                 586
shop                 584
post                 573
ventas               566
hi                   542
orders               539
email                523
20info               414
press                391
customercare         361
cs                   351
enquiry              349
mllegeorgesand       347
repairs              330
comercial            300
example              295
team                 278
returns  

In [12]:
# (rows, columns) of the prefix frame: one row per extracted prefix.
email_prefix_df.shape

(302666, 1)

# Genuine Email prefix count

In [13]:
# Turn the per-prefix counts into a two-column frame: "email prefix" and "counts".
# rename_axis labels the index level; reset_index(name='counts') moves it into a regular column.
email_prefix_count = email_prefix_df.value_counts().rename_axis('email prefix').reset_index(name='counts')

In [14]:
# Preview the prefix/count table (pandas truncates the display for long frames).
email_prefix_count

Unnamed: 0,email prefix,counts
0,info,70353
1,support,9906
2,contact,9495
3,hello,6302
4,sales,5402
...,...,...
102948,harjinderkarir,1
102949,haristsoukkas,1
102950,harish.sikka,1
102951,harish.p,1


In [15]:
# Keep prefixes that occur at most 5 times.
# NOTE(review): presumably rare prefixes are treated as "genuine" (personal) as opposed to
# shared role mailboxes like info/support; the threshold 5 is not explained in the notebook.
genuine_email_prefix = email_prefix_count[email_prefix_count['counts']<= 5]

In [16]:
# Preview the first rows of the rare-prefix subset (original row labels are kept).
genuine_email_prefix.head()

Unnamed: 0,email prefix,counts
2253,zakirova.alina2011,5
2254,misty,5
2255,utrecht,5
2256,sonja,5
2257,valerie.ridgeway,5


In [25]:
!pip install nltk -q

# Clustering 
 - Extend each email prefix with its synonyms so that cluster analysis can be performed on the extended text dataset.

In [41]:
# Keep prefixes that occur at least 10 times (the "common" prefixes used for clustering).
# Prefixes with 6-9 occurrences fall into neither this subset nor genuine_email_prefix (<= 5).
common_email_prefix_df = email_prefix_count[email_prefix_count['counts']>= 10]

In [42]:
# Distinct prefix values of the common subset, as an array.
emails_prefix = common_email_prefix_df['email prefix'].unique()

In [43]:
# Number of distinct common prefixes.
len(emails_prefix)

1292

In [44]:
# Display the array of common prefixes (the array repr is abbreviated for long arrays).
emails_prefix

array(['info', 'support', 'contact', ..., 'wspolpraca', 'tj', 'deborah'],
      dtype=object)

In [None]:
# Set up NLTK: fetch its data packages and import the WordNet corpus reader and lemmatizer.
import nltk
# NOTE(review): "all" downloads every NLTK data package (see the log output of this cell).
# Only WordNet-related imports are visible here, so nltk.download("wordnet") may be
# sufficient - confirm against the cells that follow before narrowing it.
nltk.download("all")
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/suman/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/suman/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/suman/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/suman/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/suman/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]

In [None]:
# Create one WordNet lemmatizer instance to reuse in the following cells.
lemmatizer = WordNetLemmatizer()

In [None]:
# Quick sanity check of the lemmatizer on a plural noun.
lemmatizer.lemmatize("books")