## Imports

In [1]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import regex as re
import pandas as pd
import numpy as np
import random 
import pycountry
np.random.seed = 42
random.seed(42)

pd.set_option('display.max_columns', None)
pd.set_option('display.min_rows', 10)

## Data Format

In [2]:
mailchimp_data = pd.read_csv('./data/MailChimp cleaned records headers.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.min_rows', 10)
mailchimp_data

Unnamed: 0,Email Address,First Name,Last Name,Board Member,Gender,Chapter,Reunion Year,Country,Degree,MEMBER_RATING,OPTIN_TIME,OPTIN_IP,CONFIRM_TIME,CONFIRM_IP,LATITUDE,LONGITUDE,GMTOFF,DSTOFF,TIMEZONE,CC,REGION,CLEAN_TIME,CLEAN_CAMPAIGN_TITLE,CLEAN_CAMPAIGN_ID,LEID,EUID,NOTES,TAGS


The mailchimp dataset comes from the salesforce dataset. This Cleaned dataset would only include emails that have bounced for one reason or another. It will not include any members who have opted out of the email service or those that are receiving emails without any complications. It is important to note that updating information within salesforce for those who have opted out may re-instantiate the email services so we are specifically working with cleaned/bounced records. Additionally, the datasets created for those who are Subscribed and Unsubscribed will have different column names than the Cleaned dataset here (ie Clean_Time, Clean_campaign_title, ect). Therefore any function created here may not work for those datasets. 

In [3]:
saa_pride_data = pd.read_excel('./data/SAA Pride member reports headings.xlsx')
saa_pride_data.reset_index(inplace = True)
saa_pride_data

Unnamed: 0,index,pref_mail_name,pref_class_year,home_city,home_state_code,home_country,home_phone_area_code,home_phone_number,home_email_address,bus_city,bus_state_code,bus_country,bus_phone_area_code,bus_phone_number,bus_email_address,first_name,last_name,pref_name_sort,email_switch,saa_email_address,gsb_email_address,other_email_address,pref_phone_area_code,pref_phone_number,pref_phone_addr_type,memb_status_desc,short_degree_string,parent_degree_string,short_degree_string_spouse,parent_degree_string_spouse,primary_sort_name,plan_name,primary_ind


The Stanford Alumni Association has it's own dataset that may or may not have additional or more recent data on some of the members. It may also have outdated data. Students are given an email but when they become alumni the email needs to be updated. Whether it is updated to specifically an 'alumni.stanford.edu' address or to another would be at the students discrepancy and isn't always updated.

## Helper Functions

In [5]:
def mapCountry(countryStr):
    if countryStr not in [np.nan, None, 'nan','*', '','N/A','n/a']:
        return pycountry.countries.search_fuzzy(countryStr)[0].alpha_3
    return 'n/a'

In [6]:
def ohe(df, column):    
    for col in column:
        train = df[[col]]
        ohe = OneHotEncoder(sparse=False, handle_unknown="error")
        ohe.fit(train)
        encoded_train = ohe.transform(train)
        col_names = [f"{col}_{f}" for f in ohe.get_feature_names()]
        encoded_train = pd.DataFrame(encoded_train,
                                     columns=col_names, index=df.index)
        df = pd.concat([df, encoded_train], axis=1)
        
    return df

In [7]:
def parseDegreeCol(df,deg_colName='short_degree_string'):
    vectorizer = CountVectorizer()
    
    t = vectorizer.fit_transform(df[deg_colName])
    
    # Remove original degree column from original dataframe
    df = df.drop(columns=[deg_colName])
    
    # Create new vectorized degree columns and concatenate
    deg_df =  pd.DataFrame(t.toarray(),columns=vectorizer.get_feature_names(),index = df.index)
    if 'nan' in deg_df.columns:
        deg_df.drop(columns = 'nan')
    return pd.concat([df,deg_df],axis = 1).fillna(0)

### Pokemon Data

In [10]:
df_saa_original = pd.read_excel('./data/SAA_Pokemon_FakeDB.xlsx')
df_saa_original

Unnamed: 0,pref_mail_name,pref_class_year,home_city,home_state_code,home_country,home_phone_area_code,home_phone_number,home_email_address,bus_city,bus_state_code,bus_country,bus_phone_area_code,bus_phone_number,bus_email_address,first_name,last_name,pref_name_sort,email_switch,saa_email_address,gsb_email_address,other_email_address,pref_phone_area_code,pref_phone_number,pref_phone_addr_type,memb_status_desc,short_degree_string,parent_degree_string,short_degree_string_spouse,parent_degree_string_spouse,primary_sort_name,plan_name,primary_ind
0,,2004.0,Shanghai,,*,,*,,,,,,,,Growlithe,Ice,,,*,,,,,,,,,,,,,
1,,,Madrid,,China,,,weedleg4046@stanfordalumni.org,,,China,,,,Weedle,Grass,,,w.grass5053@alumni.stanford.edu,,,,,,,'82,,,,,,
2,,,Seoul,,Kuwait,,,aerodactyl.electric2974@alumni.stanford.edu,,,Kuwait,,,,Aerodactyl,Electric,,,*,,,,,,,,,,,,,
3,,,London,,,,*,*,,,Japan,,,,Pinsir,Fire,,pinsirfire4582@gmail.com,*,,,,,,,"JD '94, PhD '97",,,,,,
4,,,London,,USA,,775 0678-214,*,,,,,,,Horsea,Ice,,hice7313@stanfordalumni.org,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,,,Boston,MA,Japan,,,*,,,,,,,Smeargle,Electric,,smeargleelectric9444@gmail.com,*,,,,,,,"PhD '96, MBA '87",,,,,,
3996,,2000.0,Shanghai,,,,*,kabutops.steel1285@stanfordalumni.org,,,China,,,,Kabutops,Steel,,k.steel5317@alumni.stanford.edu,,,,,,,,BS '93,,,,,,
3997,,,,,Kuwait,,,*,,,,,,,Slowking,Dragon,,,slowkingd5563@stanfordalumni.org,,,,,,,"PhD '84, JD '93",,,,,,
3998,,,,,Japan,,,larvitar.electric9778@stanfordalumni.org,,,United States,,,,Larvitar,Electric,,l.electric7920@stanfordalumni.org,,,,,,,,"MBA '92, '98, JD '86",,,,,,


# Preprocess SAA dataframe

Preprocessing steps:
1. Filter out columns
2. Impute nans
3. Strip and lower case
4. Retain handles in emails, remove domain.
5. Clean Degree string

## Preprocessing functions

We will be using pipelines and column transformers to streamline our preprocessing workflow on the SAA database. We will be defining preprocessing functions that will be used in our pipeline and Column Transformers.

In [144]:
# Filter necessary columns
def filter_columns(df,columns):
    return df.filter(columns)
filter_cols = FunctionTransformer(filter_columns,kw_args={'columns':main_cols})

#Strip and lower case
def strip_lower(arr):    
    arr = np.char.lower(np.char.strip(arr.astype(str)))
    return arr
standardize_str = FunctionTransformer(strip_lower)

#Remove email domains
def remove_domains(df,columns):
#     email_cols = ['home_email_address', 'bus_email_address', 'email_switch',
#                   'saa_email_address', 'gsb_email_address', 'other_email_address']
    df = df.filter(columns)
    for email in columns:
        df[email] = df[email].apply(removeEmailDomain)
    return df
email_domain = FunctionTransformer(remove_domains,kw_args={'columns':email_cols})

#Clean degree string
def cleanDegreeStr(degree):
    no_nums_puncs = re.sub('[;,\'*0-9]', ' ', degree).strip()
    return re.sub('[ ]+', ' ', no_nums_puncs).strip()

def clean_degrees(arr):
    cleanDegreeStr_vec = np.vectorize(cleanDegreeStr)
    arr = cleanDegreeStr_vec(arr)
    return arr
cleaned_degrees = FunctionTransformer(clean_degrees)



## Subpipes and Column Transformers

In [135]:
# We'll throw these mini-pipelines into our ColumnTransformer

main_cols = ['first_name', 'last_name', 'home_country', 'bus_country']
email_cols = ['home_email_address', 'bus_email_address', 'email_switch',\
              'saa_email_address', 'gsb_email_address', 'other_email_address']

#subpipe that handles our the majority of our columns
subpipe_main= Pipeline(steps=[('get_cols', filter_cols),
                         ('simple_impute', SimpleImputer(strategy='constant', fill_value='n/a')),
                         ('string_out', standardize_str),
#                          ('emails',email_domain)
                         ])

#subpipe that handles our our emails
subpipe_email = Pipeline(steps=[('emails',email_domain),
                         ('simple_impute', SimpleImputer(strategy='constant', fill_value='n/a')),
                         ('string_out', standardize_str)
                         ])

#subpipe that handles degrees owned columns
subpipe_degree = Pipeline(steps=[('simple_impute', SimpleImputer(strategy='constant', fill_value='n/a')),
                                 ('degree',cleaned_degrees),
                                 ('string_out', standardize_str)
                         ])


In [138]:
#Combine subpipes into CT
ct = ColumnTransformer(transformers = [
    ('subpipe_main', subpipe_main, df_saa_original.columns),
    ('subpipe_emails', subpipe_email, df_saa_original.columns),
    ('subpipe_degrees', subpipe_degree, [25] )
])


In [139]:
#Test out our first column transformer
columns = ['first_name', 'last_name','home_country', 'bus_country',\
           'home_email_address', 'bus_email_address', 'email_switch',\
                  'saa_email_address', 'gsb_email_address', 'other_email_address',\
          'short_degree_string']
ct_test_df = pd.DataFrame(ct.fit_transform(df_saa_original),columns=columns)
ct_test_df

Unnamed: 0,first_name,last_name,home_country,bus_country,home_email_address,bus_email_address,email_switch,saa_email_address,gsb_email_address,other_email_address,short_degree_string
0,growlithe,ice,*,,,,,,,,
1,weedle,grass,china,china,weedleg4046,,,w.grass5053,,,
2,aerodactyl,electric,kuwait,kuwait,aerodactyl.electric2974,,,,,,
3,pinsir,fire,,japan,,,pinsirfire4582,,,,jd phd
4,horsea,ice,usa,,,,hice7313,,,,
...,...,...,...,...,...,...,...,...,...,...,...
3995,smeargle,electric,japan,,,,smeargleelectric9444,,,,phd mba
3996,kabutops,steel,,china,kabutops.steel1285,,k.steel5317,,,,bs
3997,slowking,dragon,kuwait,,,,,slowkingd5563,,,phd jd
3998,larvitar,electric,japan,united states,larvitar.electric9778,,l.electric7920,,,,mba jd


# Mail Chimp Dataframe

In [124]:
df_mailchimp_original = pd.read_csv('./data/Fake_MailChimp_cleaned_Pokemon.csv')
df_mailchimp_original

Unnamed: 0,Email Address,First Name,Last Name,Board Member,Gender,Chapter,Reunion Year,Country,Degree,MEMBER_RATING,OPTIN_TIME,OPTIN_IP,CONFIRM_TIME,CONFIRM_IP,LATITUDE,LONGITUDE,GMTOFF,DSTOFF,TIMEZONE,CC,REGION,CLEAN_TIME,CLEAN_CAMPAIGN_TITLE,CLEAN_CAMPAIGN_ID,LEID,EUID,NOTES,TAGS
0,slakoth.normal3945@gmail.com,Slakoth,Normal,False,,Texas,,USA,,,,,,,,,,,,,,,,,,,,
1,e.rock7454@gmail.com,Espeon,Rock,True,F,DC Area,,United States,,,,,,,,,,,,,,,,,,,,
2,rhydonghost7966@alumni.stanford.edu,Rhydon,Ghost,False,M,Bay Area,,USA,MBA,,,,,,,,,,,,,,,,,,,
3,porygong9247@stanfordalumni.org,Porygon,Grass,False,M,Bay Area,,Japan,MS,,,,,,,,,,,,,,,,,,,
4,tangelagrass1376@gmail.com,Tangela,Grass,False,,New England,,United States,,,,,,,,,,,,,,,,,,,,
5,c.electric7518@gmail.com,Chansey,Steel,True,F,Other US,,USA,,,,,,,,,,,,,,,,,,,,
6,blissey.ghost4154@gmail.com,Blissey,Ghost,False,M,New England,,Macao Special Administrative Region of China,,,,,,,,,,,,,,,,,,,,


# Preprocess mailchimp dataframe

In [152]:
# Filter necessary columns
df_mailchimp = df_mailchimp_original.filter(['First Name', 'Last Name', 'Email Address',
                                         'Degree', 'Country'])
df_mailchimp

Unnamed: 0,First Name,Last Name,Email Address,Degree,Country
0,Slakoth,Normal,slakoth.normal3945@gmail.com,,USA
1,Espeon,Rock,e.rock7454@gmail.com,,United States
2,Rhydon,Ghost,rhydonghost7966@alumni.stanford.edu,MBA,USA
3,Porygon,Grass,porygong9247@stanfordalumni.org,MS,Japan
4,Tangela,Grass,tangelagrass1376@gmail.com,,United States
5,Chansey,Steel,c.electric7518@gmail.com,,USA
6,Blissey,Ghost,blissey.ghost4154@gmail.com,,Macao Special Administrative Region of China


## Apply Preprocessing subpipes and CT to mailchimp dataframe

We need to change the columns specified for some of the functions in the subpipe. There might be a less redundant way to do this but for now, we'll proceed.

In [153]:
# We'll throw these mini-pipelines into our ColumnTransformer

main_cols = ['First Name', 'Last Name']
email_cols = ['Email Address']

#subpipe that handles our the majority of our columns
subpipe_main= Pipeline(steps=[('get_cols', FunctionTransformer(filter_columns,kw_args={'columns':main_cols})),
                         ('simple_impute', SimpleImputer(strategy='constant', fill_value='n/a')),
                         ('string_out', standardize_str),
#                          ('emails',email_domain)
                         ])

#subpipe that handles our our emails
subpipe_email = Pipeline(steps=[('emails',FunctionTransformer(remove_domains,kw_args={'columns':email_cols})),
                         ('simple_impute', SimpleImputer(strategy='constant', fill_value='n/a')),
                         ('string_out', standardize_str)
                         ])

In [154]:
ct_mailchimp = ColumnTransformer(transformers = [
    ('subpipe_main', subpipe_main, df_mailchimp.columns),
    ('subpipe_emails', subpipe_email, df_mailchimp.columns),
    ('subpipe_degrees', subpipe_degree, [3] )
])

In [156]:
#Test out our first column transformer
columns = ['First Name', 'Last Name', 'Email Address','Degree']
ct_test_df = pd.DataFrame(ct.fit_transform(df_mailchimp),columns=columns)
ct_test_df

ValueError: A given column is not a column of the dataframe

## Filter necessary columns

In [140]:
# Filter necessary columns
df_mailchimp = df_mailchimp_original.filter(['First Name', 'Last Name', 'Email Address',
                                         'Degree', 'Country'])
df_mailchimp

Unnamed: 0,First Name,Last Name,Email Address,Degree,Country
0,Slakoth,Normal,slakoth.normal3945@gmail.com,,USA
1,Espeon,Rock,e.rock7454@gmail.com,,United States
2,Rhydon,Ghost,rhydonghost7966@alumni.stanford.edu,MBA,USA
3,Porygon,Grass,porygong9247@stanfordalumni.org,MS,Japan
4,Tangela,Grass,tangelagrass1376@gmail.com,,United States
5,Chansey,Steel,c.electric7518@gmail.com,,USA
6,Blissey,Ghost,blissey.ghost4154@gmail.com,,Macao Special Administrative Region of China


## Replace nulls with 'n/a'

In [29]:
df_mailchimp.fillna('n/a',inplace=True)
df_mailchimp

Unnamed: 0,First Name,Last Name,Email Address,Degree,Country
0,Slakoth,Normal,slakoth.normal3945@gmail.com,,USA
1,Espeon,Rock,e.rock7454@gmail.com,,United States
2,Rhydon,Ghost,rhydonghost7966@alumni.stanford.edu,MBA,USA
3,Porygon,Grass,porygong9247@stanfordalumni.org,MS,Japan
4,Tangela,Grass,tangelagrass1376@gmail.com,,United States
5,Chansey,Steel,c.electric7518@gmail.com,,USA
6,Blissey,Ghost,blissey.ghost4154@gmail.com,,Macao Special Administrative Region of China


## Strip and lowercase all names and emails

In [30]:
for col in ['First Name','Last Name','Email Address','Degree']:
    df_mailchimp[col] = df_mailchimp[col].str.lower().str.strip()
df_mailchimp

Unnamed: 0,First Name,Last Name,Email Address,Degree,Country
0,slakoth,normal,slakoth.normal3945@gmail.com,,USA
1,espeon,rock,e.rock7454@gmail.com,,United States
2,rhydon,ghost,rhydonghost7966@alumni.stanford.edu,mba,USA
3,porygon,grass,porygong9247@stanfordalumni.org,ms,Japan
4,tangela,grass,tangelagrass1376@gmail.com,,United States
5,chansey,steel,c.electric7518@gmail.com,,USA
6,blissey,ghost,blissey.ghost4154@gmail.com,,Macao Special Administrative Region of China


## Remove email domain

In [31]:
df_mailchimp['Email Address'] = df_mailchimp['Email Address'].apply(removeEmailDomain)
df_mailchimp

Unnamed: 0,First Name,Last Name,Email Address,Degree,Country
0,slakoth,normal,slakoth.normal3945,,USA
1,espeon,rock,e.rock7454,,United States
2,rhydon,ghost,rhydonghost7966,mba,USA
3,porygon,grass,porygong9247,ms,Japan
4,tangela,grass,tangelagrass1376,,United States
5,chansey,steel,c.electric7518,,USA
6,blissey,ghost,blissey.ghost4154,,Macao Special Administrative Region of China


## Convert Country to 3 digit code

In [32]:
# df_mailchimp.Country = df_mailchimp.Country.apply(mapCountry)
# df_mailchimp

## Clean Degree String

In [33]:
df_mailchimp.Degree = df_mailchimp.Degree.apply(cleanDegreeStr)
df_mailchimp

Unnamed: 0,First Name,Last Name,Email Address,Degree,Country
0,slakoth,normal,slakoth.normal3945,,USA
1,espeon,rock,e.rock7454,,United States
2,rhydon,ghost,rhydonghost7966,mba,USA
3,porygon,grass,porygong9247,ms,Japan
4,tangela,grass,tangelagrass1376,,United States
5,chansey,steel,c.electric7518,,USA
6,blissey,ghost,blissey.ghost4154,,Macao Special Administrative Region of China


Converting to 3-char country code is time consuming. Reserving that conversion after subsetting

# Find Best Match for each MailChimp record

Finding the best match for each mail chimp record is a 3 step process
- Map the MailChimp record to the Stanford Alumni dataframe
- Subset the entire dataframe by the first name
- Calculate the cosine similarity for the resulting subset

Few edge cases to note:
- Subset by just the first name, in case a user has changed their last name but change their emails or contact info
- We check for similarity of the 'bounced' MailChimp email handle against *all* possible email fields
- We check for similarity of the 'bounced' MailChimp country field against *all* possible country fields

The result returned is a dictionary of 
- Dataframe of data in order of score (key: `df`)
- List of index of records in stanford alumni dataframe in order of score (key: `idx`)
- List of cosin similarity scores (key: `score`)

In [34]:
def map_MailChimpData_to_SAA_DF(mc_rec_idx,df_mc=df_mailchimp):
    mc_rec = df_mc.iloc[mc_rec_idx]
    target_dict = {'first_name': mc_rec['First Name'], 
                   'last_name': mc_rec['Last Name'],
                   
                   'home_email_address': mc_rec['Email Address'],
                   'bus_email_address': mc_rec['Email Address'],
                   'email_switch': mc_rec['Email Address'],
                   'saa_email_address': mc_rec['Email Address'],
                   'gsb_email_address': mc_rec['Email Address'],
                   'other_email_address': mc_rec['Email Address'],
                   
                    'home_country': mc_rec['Country'],                   
                    'bus_country': mc_rec['Country'],          
                   
                    'short_degree_string': mc_rec['Degree']
                  
                  }
    df =  pd.DataFrame(target_dict,index=['mc_'+str(mc_rec_idx)])
    return df

In [35]:
map_MailChimpData_to_SAA_DF(0).iloc[0]

first_name                        slakoth
last_name                          normal
home_email_address     slakoth.normal3945
bus_email_address      slakoth.normal3945
email_switch           slakoth.normal3945
saa_email_address      slakoth.normal3945
gsb_email_address      slakoth.normal3945
other_email_address    slakoth.normal3945
home_country                          USA
bus_country                           USA
short_degree_string                   n/a
Name: mc_0, dtype: object

In [156]:
print(df_mailchimp.shape)

(7, 5)


In [47]:
results_dict = {}

for i in range(0,df_mailchimp.shape[0]):
    user_dict={}
    #map the data of a mailchimp record to the format in SAA df
    df_mapped_mc_rec = map_MailChimpData_to_SAA_DF(i)
    
    #subsetting the original SAA df by the first name
    df_saa_subset = df_saa[df_saa['first_name'] == df_mapped_mc_rec.iloc[0]['first_name']]
    
    #Concattenating the 2 dataframes together
    df_mc_and_saa_subset = pd.concat([df_mapped_mc_rec,df_saa_subset], axis = 0)
    
    #Changing degree column to vectorized columns
    df_mc_and_saa_subset = parseDegreeCol(df=df_mc_and_saa_subset,deg_colName='short_degree_string')
    
    #Map the country to their 3 digit codes
    df_mc_and_saa_subset.home_country = df_mc_and_saa_subset.home_country.apply(mapCountry)
    df_mc_and_saa_subset.bus_country = df_mc_and_saa_subset.bus_country.apply(mapCountry)
    
    
#     ohe_df = ohe(df_mc_and_saa_subset, df_mc_and_saa_subset.columns)
#     ohe_df.drop(columns = df_mc_and_saa_subset.columns, inplace = True)
    ohe_df = ohe(df_mc_and_saa_subset, df_mc_and_saa_subset.columns)
    ohe_df.drop(columns = df_mc_and_saa_subset.columns, inplace = True)
    y = np.array(ohe_df.iloc[0])
    y = y.reshape(1,-1)
    cos_sim = cosine_similarity(ohe_df, y)
    cos_sim = pd.DataFrame(data=cos_sim, index=ohe_df.index).sort_values(by=0, ascending=False) #[1:]
    results = list(cos_sim.index)
    results_df = df_mc_and_saa_subset.loc[results]
    username = df_mapped_mc_rec.iloc[0]['first_name'] + df_mapped_mc_rec.iloc[0]['last_name']
    user_dict['idx'] = results
    user_dict['cosine_sim_result'] = results_df
    user_dict['SAA_query_result'] = df_saa_original.iloc[results[1:]]
    results_dict[username] = user_dict
    


In [48]:
results_dict

{'slakothnormal': {'idx': ['mc_0',
   1290,
   1967,
   74,
   2052,
   2353,
   2709,
   3789,
   503,
   776,
   1095,
   1485,
   2326,
   2587],
  'cosine_sim_result':      first_name last_name  home_email_address   bus_email_address  \
  mc_0    slakoth    normal  slakoth.normal3945  slakoth.normal3945   
  1290    slakoth    normal                 n/a                 n/a   
  1967    slakoth     ghost   slakoth.ghost5782                 n/a   
  74      slakoth    normal  slakoth.normal3945                 n/a   
  2052    slakoth     water        slakothw1017                 n/a   
  2353    slakoth    poison                 n/a        s.poison6761   
  2709    slakoth    dragon         s.dragon440                 n/a   
  3789    slakoth     ghost                 n/a                 n/a   
  503     slakoth      rock                 n/a                 n/a   
  776     slakoth    ground                 n/a                 n/a   
  1095    slakoth  fighting                 n/a  

In [219]:
df_saa_original = pd.read_excel('./data/SAA_Pokemon_FakeDB.xlsx')
df_saa_original

Unnamed: 0,pref_mail_name,pref_class_year,home_city,home_state_code,home_country,home_phone_area_code,home_phone_number,home_email_address,bus_city,bus_state_code,bus_country,bus_phone_area_code,bus_phone_number,bus_email_address,first_name,last_name,pref_name_sort,email_switch,saa_email_address,gsb_email_address,other_email_address,pref_phone_area_code,pref_phone_number,pref_phone_addr_type,memb_status_desc,short_degree_string,parent_degree_string,short_degree_string_spouse,parent_degree_string_spouse,primary_sort_name,plan_name,primary_ind
0,,2004.0,Shanghai,,*,,*,,,,,,,,Growlithe,Ice,,,*,,,,,,,,,,,,,
1,,,Madrid,,China,,,weedleg4046@stanfordalumni.org,,,China,,,,Weedle,Grass,,,w.grass5053@alumni.stanford.edu,,,,,,,'82,,,,,,
2,,,Seoul,,Kuwait,,,aerodactyl.electric2974@alumni.stanford.edu,,,Kuwait,,,,Aerodactyl,Electric,,,*,,,,,,,,,,,,,
3,,,London,,,,*,*,,,Japan,,,,Pinsir,Fire,,pinsirfire4582@gmail.com,*,,,,,,,"JD '94, PhD '97",,,,,,
4,,,London,,USA,,775 0678-214,*,,,,,,,Horsea,Ice,,hice7313@stanfordalumni.org,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,,,Boston,MA,Japan,,,*,,,,,,,Smeargle,Electric,,smeargleelectric9444@gmail.com,*,,,,,,,"PhD '96, MBA '87",,,,,,
3996,,2000.0,Shanghai,,,,*,kabutops.steel1285@stanfordalumni.org,,,China,,,,Kabutops,Steel,,k.steel5317@alumni.stanford.edu,,,,,,,,BS '93,,,,,,
3997,,,,,Kuwait,,,*,,,,,,,Slowking,Dragon,,,slowkingd5563@stanfordalumni.org,,,,,,,"PhD '84, JD '93",,,,,,
3998,,,,,Japan,,,larvitar.electric9778@stanfordalumni.org,,,United States,,,,Larvitar,Electric,,l.electric7920@stanfordalumni.org,,,,,,,,"MBA '92, '98, JD '86",,,,,,


In [None]:
results_dict = {}
for i in range(0,df_mailchimp.shape[1]+1):
#     target = mailchimp_poke2.iloc[i]
#     target_dict = {'first_name': [target[1]], 'last_name': [target[2]],\
#                emails[0]: [target[0]],\
#                emails[1]: [target[0]],\
#                emails[2]: [target[0]],\
#                emails[3]: [target[0]],\
#                emails[4]: [target[0]],\
#                emails[5]: [target[0]],\
#                    handles[0]: [target[4]],\
#                    handles[1]: [target[4]],\
#                    handles[2]: [target[4]],\
#                    handles[3]: [target[4]],\
#                    handles[4]: [target[4]],\
#                    handles[5]: [target[4]],\
#                'home_country': [target[3]]}
#     df = pd.DataFrame.from_dict(target_dict)
#     subset_saa = saa_poke2[saa_poke2['first_name'] == df.loc[0,'first_name']] 
    #the 0 is calling for row, so it does not return a series
#     subset_saa_new = pd.concat([df,subset_saa], axis = 0)
    ohe_df = ohe(subset_saa_new, subset_saa_new.columns)
    ohe_df.drop(columns = subset_saa_new.columns, inplace = True)
    y = np.array(ohe_df.iloc[0])
    y = y.reshape(1,-1)
    cos_sim = cosine_similarity(ohe_df, y)
    cos_sim = pd.DataFrame(data=cos_sim, index=ohe_df.index).sort_values(by=0, ascending=False) #[1:]
    results = list(cos_sim.index)
    results_df = subset_saa_new.loc[results]
    results_dict[i] = results_df

In [None]:
# saa_poke2.fillna(value='Not Available', inplace=True)
# emails = ['home_email_address', 'bus_email_address', 'email_switch', 'saa_email_address',\
#          'gsb_email_address', 'other_email_address']
# handles = []
# for x in emails:
#     for i in range(0,saa_poke2.shape[0]):
#         if '@' in saa_poke2[x][i]:
#             saa_poke2[x+'_handle'] = saa_poke2[x].str.split('@').str[0]
#         else:
#             saa_poke2[x+'_handle'] = saa_poke2[x]


## Function

In [None]:
results_dict = {}
for i in range(0,df_mailchimp.shape[1]+1):
    target = mailchimp_poke2.iloc[i]
    target_dict = {'first_name': [target[1]], 'last_name': [target[2]],\
               emails[0]: [target[0]],\
               emails[1]: [target[0]],\
               emails[2]: [target[0]],\
               emails[3]: [target[0]],\
               emails[4]: [target[0]],\
               emails[5]: [target[0]],\
                   handles[0]: [target[4]],\
                   handles[1]: [target[4]],\
                   handles[2]: [target[4]],\
                   handles[3]: [target[4]],\
                   handles[4]: [target[4]],\
                   handles[5]: [target[4]],\
               'home_country': [target[3]]}
    df = pd.DataFrame.from_dict(target_dict)
    subset_saa = saa_poke2[saa_poke2['first_name'] == df.loc[0,'first_name']] 
    #the 0 is calling for row, so it does not return a series
    subset_saa_new = pd.concat([df,subset_saa], axis = 0)
    ohe_df = ohe(subset_saa_new, subset_saa_new.columns)
    ohe_df.drop(columns = subset_saa_new.columns, inplace = True)
    y = np.array(ohe_df.iloc[0])
    y = y.reshape(1,-1)
    cos_sim = cosine_similarity(ohe_df, y)
    cos_sim = pd.DataFrame(data=cos_sim, index=ohe_df.index).sort_values(by=0, ascending=False) #[1:]
    results = list(cos_sim.index)
    results_df = subset_saa_new.loc[results]
    results_dict[i] = results_df

In [None]:
target[4]

In [None]:
subset_saa_new

In [None]:
mailchimp_poke2

In [None]:
name_first = input('First Name: ')

results_dict[mailchimp_poke2[mailchimp_poke2['First Name']
                             == name_first].index[0]].head(5)

These are the 5 most likely matches.

## Next Steps

Next step would be to take a proactive approach to mitigate the number of emails that bounce in the future. We would suggest using the salesforce dataset to check for recent grads to reach out before they lose their student emails and ask for updated contact information and their subsequent plans after graduation. It would be easier to update records proactively when we still have accurate contact information. For subsequent plans after graduation, this would be to keeping location information of members to more accurately send regional events and functions. 