### Code to connect to Oracle from our local python notebook. 
####This was used in our initial attempt to make Oracle work with pandas before we ran into the memory issues:

import os

import platform

# Directory holding the Oracle client libraries on the local machine.
LOCATION = r"C:\Oracle"

# Put the Oracle client directory first on PATH so that cx_Oracle can load it.
os.environ["PATH"] = LOCATION + os.pathsep + os.environ["PATH"]

import cx_Oracle

# If needed, place an 'r' before any parameter to cope with special characters
# such as '\' (for example user=r'User Name').
dsn_tns = cx_Oracle.makedsn('msb-MIS381N.austin.utexas.edu', '1521', sid='orcl')
conn = cx_Oracle.connect(user=r'JAR8892', password='<password to be entered here>', dsn=dsn_tns)

# Always release the connection, even if the query or the iteration fails.
try:
    c = conn.cursor()

    # Use triple quotes if you want to spread your query across multiple lines.
    c.execute('select * from JAR8892.CATEGORIES')

    for row in c:
        # Only the first two columns are shown; append , '-', row[2], etc. for more.
        print (row[0], '-', row[1])
finally:
    conn.close()

### Code for our Recommendation engine (Existing User)

In [3]:
%sh pip install implicit

In [4]:
# Remove every existing notebook widget before the input widgets are (re)created below.
dbutils.widgets.removeAll()

In [5]:
# Read the user id and the city from the notebook's text widgets.
# NOTE(review): the second argument is presumably the value returned when the
# widget has not been set -- confirm against the Databricks getArgument docs.
# Both values are strings here; they are normalised later (lower-casing / int()).
user_id = getArgument("Enter your User ID ", "TEXTBOX_STRING")
city = getArgument("Enter your City (San Francisco, New York or Chicago)","TEXTBOX_STRING")

In [6]:
import os
import platform

import implicit
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

Reading in members file to a spark dataframe

In [8]:
# Load the members CSV (header row, inferred column types) into a Spark dataframe.
df_members = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("/FileStore/tables/members.csv")
)

In [9]:
# Keep only the columns that are needed and convert to a pandas dataframe.
df_members = (
    df_members
    .select('member_id', 'city', 'group_id')
    .toPandas()
)
df_members.head()

Unnamed: 0,member_id,city,group_id
0,3,New York,490552
1,3,New York,1474611
2,3,New York,1490492
3,3,New York,1515830
4,3,New York,1574965


Reading in groups file to a spark dataframe

In [11]:
# Load the groups CSV (header row, inferred column types) into a Spark dataframe.
df_groups_temp = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("/FileStore/tables/groups.csv")
)

df_groups_temp.head()

In [12]:
# Keep only the columns that are needed and convert to a pandas dataframe.
df_groups_temp = (
    df_groups_temp
    .select('category_id', 'group_id', 'city', 'group_name', 'members', 'rating')
    .toPandas()
)
df_groups_temp.head()

Unnamed: 0,category_id,group_id,city,group_name,members,rating
0,14,6388,New York,Alternative Health NYC,1440,4.39
1,4,6510,New York,Alternative Energy Meetup,969,4.31
2,26,8458,New York,NYC Animal Rights,2930,4.84
3,29,8940,New York,The New York City Anime Group,5080,4.46
4,26,10104,New York,NYC Pit Bull Group,2097,4.09


Loading categories list to a spark dataframe.

In [14]:
# Load the categories CSV (header row, inferred column types) into a Spark dataframe.
df_categories = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("/FileStore/tables/categories.csv")
)

df_categories.head()

In [15]:
# Convert the category lookup to pandas, then attach the category name to every group.
df_categories = (
    df_categories
    .select('category_id', 'category_name')
    .toPandas()
)
df_groups = df_groups_temp.merge(df_categories, on='category_id')
df_groups.head()

Unnamed: 0,category_id,group_id,city,group_name,members,rating,category_name
0,14,6388,New York,Alternative Health NYC,1440,4.39,Health & Wellbeing
1,14,54126,New York,Energy Healers NYC,2027,4.83,Health & Wellbeing
2,14,67776,New York,Flourishing Life Meetup,839,4.56,Health & Wellbeing
3,14,111855,New York,Hypnosis & NLP NYC - Update Your Brain,1403,4.69,Health & Wellbeing
4,14,129277,Chicago,The Live Food Chicago Community,1997,4.77,Health & Wellbeing


Function that provides a subset of the members and groups dataframe pertaining to that city

In [17]:
def get_city(city, members=None, groups=None):
    """Return the member/group rows that belong to one city.

    Members and groups are each restricted to the city-name spellings that
    count as the requested city, then inner-joined on ``group_id``.

    Args:
        city: City acronym, one of ``'SF'``, ``'NYC'`` or ``'CHI'``.
        members: Optional members dataframe with ``city`` and ``group_id``
            columns. Defaults to the notebook-level ``df_members``.
        groups: Optional groups dataframe with ``city`` and ``group_id``
            columns. Defaults to the notebook-level ``df_groups``.

    Returns:
        The merged dataframe of members and groups for that city.

    Raises:
        ValueError: If ``city`` is not one of the supported acronyms.
    """
    # City-name spellings that are treated as belonging to each acronym.
    city_names = {
        'SF': ['San Francisco', 'san francisco', 'South San Francisco'],
        'NYC': ['New York', 'West New York', 'New York Mills'],
        'CHI': ['Chicago', 'Chicago Heights', 'West Chicago', 'Chicago Ridge',
                'East Chicago', 'North Chicago', 'Chicago Park'],
    }

    # Defensive check: reject anything that is not a known acronym.
    try:
        names = city_names[city]
    except (KeyError, TypeError):
        raise ValueError('Invalid City') from None

    if members is None:
        members = df_members
    if groups is None:
        groups = df_groups

    # Only the requested city is filtered; the other cities are never touched.
    city_members = members[members.city.isin(names)]
    city_groups = groups[groups.city.isin(names)]
    return city_members.merge(city_groups, on='group_id')

ALS Model training for SF

In [19]:
  # Member/group rows for San Francisco.
  df1 = get_city('SF')
  # Consecutive integer codes so the ids can serve as sparse-matrix coordinates.
  df1['m_code'] = df1.member_id.astype('category').cat.codes
  df1['g_code'] = df1.group_id.astype('category').cat.codes
  # Each membership row counts as one implicit interaction.
  df1['dummy'] = 1

  print(df1.head())

  # Scaling factor applied to the interactions before fitting (confidence weight).
  alpha = 15

  # ALS model for San Francisco.
  model1 = implicit.als.AlternatingLeastSquares(
      iterations = 20, regularization = .01, factors = 20)

  # The same interactions in both orientations:
  # user x item is passed to recommend, item x user is what the model is fitted on.
  user_item1 = csr_matrix((df1['dummy'], (df1['m_code'], df1['g_code'])))
  item_user1 = csr_matrix((df1['dummy'], (df1['g_code'], df1['m_code'])))

  # Original authors' note: confidence is modelled as 1 + alpha * interaction;
  # the "+ 1" is presumably supplied by the library -- verify against the implicit docs.
  c_item_user1 = alpha * item_user1

  # Fit on the alpha-scaled item x user matrix.
  model1.fit(c_item_user1)

ALS Model training for New York

In [21]:
  # Member/group rows for New York.
  df2 = get_city('NYC')
  # Consecutive integer codes so the ids can serve as sparse-matrix coordinates.
  df2['m_code'] = df2.member_id.astype('category').cat.codes
  df2['g_code'] = df2.group_id.astype('category').cat.codes
  # Each membership row counts as one implicit interaction.
  df2['dummy'] = 1

  print(df2.head())

  # Scaling factor applied to the interactions before fitting (confidence weight).
  alpha = 15

  # ALS model for New York.
  model2 = implicit.als.AlternatingLeastSquares(
      iterations = 20, regularization = .01, factors = 20)

  # The same interactions in both orientations:
  # user x item is passed to recommend, item x user is what the model is fitted on.
  user_item2 = csr_matrix((df2['dummy'], (df2['m_code'], df2['g_code'])))
  item_user2 = csr_matrix((df2['dummy'], (df2['g_code'], df2['m_code'])))

  # Original authors' note: confidence is modelled as 1 + alpha * interaction;
  # the "+ 1" is presumably supplied by the library -- verify against the implicit docs.
  c_item_user2 = alpha * item_user2

  # Fit on the alpha-scaled item x user matrix.
  model2.fit(c_item_user2)

ALS Model training for Chicago

In [23]:
  # Member/group rows for Chicago.
  df3 = get_city('CHI')
  # Consecutive integer codes so the ids can serve as sparse-matrix coordinates.
  df3['m_code'] = df3.member_id.astype('category').cat.codes
  df3['g_code'] = df3.group_id.astype('category').cat.codes
  # Each membership row counts as one implicit interaction.
  df3['dummy'] = 1

  print(df3.head())

  # Scaling factor applied to the interactions before fitting (confidence weight).
  alpha = 15

  # ALS model for Chicago.
  model3 = implicit.als.AlternatingLeastSquares(
      iterations = 20, regularization = .01, factors = 20)

  # The same interactions in both orientations:
  # user x item is passed to recommend, item x user is what the model is fitted on.
  user_item3 = csr_matrix((df3['dummy'], (df3['m_code'], df3['g_code'])))
  item_user3 = csr_matrix((df3['dummy'], (df3['g_code'], df3['m_code'])))

  # Original authors' note: confidence is modelled as 1 + alpha * interaction;
  # the "+ 1" is presumably supplied by the library -- verify against the implicit docs.
  c_item_user3 = alpha * item_user3

  # Fit on the alpha-scaled item x user matrix.
  model3.fit(c_item_user3)

In [24]:
# Sanity check: print how many rows of each city frame have a non-null group_id
# (San Francisco, New York, Chicago, in that order).
for city_frame in (df1, df2, df3):
    print(city_frame['group_id'].dropna().shape)

Function to recommend.  Gets the dummy_code of the user and city as inputs

In [26]:
def recommend(code_user_id, user_item, city):
    """Print and return the groups recommended to one user in one city.

    Args:
        code_user_id: The user's ``m_code`` in the chosen city's frame.
        user_item: The user x item sparse matrix of that city.
        city: Lower-case city name: ``'san francisco'``, ``'new york'`` or
            ``'chicago'``.

    Returns:
        A list of the unique recommended group names, or ``None`` (after
        printing ``"Invalid City Name"``) when the city is not recognised.
    """
    # Pick the trained model and the matching member/group frame for the city.
    if city == 'san francisco':
        model, frame = model1, df1
    elif city == 'new york':
        model, frame = model2, df2
    elif city == 'chicago':
        model, frame = model3, df3
    else:
        print("Invalid City Name")
        return None

    # The model returns (item code, score) pairs; only the item codes are needed.
    top_items = model.recommend(code_user_id, user_item, N = 10)
    idx = [item_id for item_id, _score in top_items]

    # Rows of the city frame that belong to the recommended group codes.
    matches = frame[frame['g_code'].isin(idx)]

    # Show the recommended group names and their categories.
    print(matches.loc[:, ['group_name', 'category_name']].drop_duplicates(subset=['group_name', 'category_name']))

    return list(matches.loc[:, 'group_name'].unique())
  

Checks which city the user entered, calls the recommend function for that city, and displays the top 10 recommendations.

In [28]:
city = city.lower() # converts city to lower case
user_id = int(user_id) # converts the user id input to integer type

# Stays empty when the city is invalid or the user is unknown, so that later
# cells can still iterate over it instead of failing on an undefined name.
rec = []

# Pick the member/group frame and the user x item matrix of the chosen city.
if city == 'san francisco':
  city_df, city_user_item = df1, user_item1
elif city == 'new york':
  city_df, city_user_item = df2, user_item2
elif city == 'chicago':
  city_df, city_user_item = df3, user_item3
else:
  city_df = city_user_item = None
  print("Invalid City Name")

if city_df is not None:
  user_rows = city_df[city_df['member_id'] == user_id]
  if user_rows.empty:
    # Previously this case printed nothing and left `rec` undefined.
    print("User ID not found in this city")
  else:
    print("Existing User\n")
    print("\n Top 10 Recommendations\n")
    # Every row of this user carries the same m_code, so the first one is enough.
    code_user_id = user_rows['m_code'].iloc[0]
    rec = recommend(code_user_id, city_user_item, city)
  

The following cell prints the groups the user already belongs to.

In [30]:
# Pick the member/group frame of the chosen city.
if city == 'san francisco':
  city_df = df1
elif city == 'new york':
  city_df = df2
elif city == 'chicago':
  city_df = df3
else:
  city_df = None
  print("Invalid City Name")

if city_df is not None:
  # Rows of this user in the chosen city (one row per group membership).
  user_groups = city_df[city_df['member_id'] == user_id]

  # Compare against the group-name VALUES: `name in series` would test the
  # index labels of the Series, not its contents.
  already_joined = set(user_groups['group_name'])

  # Confirms the recommended groups are not groups the user is already in:
  # anything printed here is an overlap.
  for i in rec:
    if i in already_joined:
      print(i)

  # Groups that this user is in.
  print(user_groups[['group_name','category_name']])


### Code for New Users: 
###### Part 1: Recommend 10 groups based on the top 3 category choices.
###### The top 5 groups are the 5 groups from category 1 with the most members, next 3 are top 3 groups from category 2 with the most members and the last 2 groups are the top 2 groups from category 3 with the most members in that category

In [32]:
# For a different cold-start scenario, change `city` here and the category
# choices in the next cell by hand. Sample values are filled in.

# Enter your city below (a sample value is shown).
city = "New York"
# Boolean mask selecting the groups located in that city.
city_mask = df_groups['city'].eq(city)
city1 = df_groups.loc[city_mask]
print("Please select your top three choices from the below list of category options in your city: \n")
# Numbered list of the categories available in the chosen city.
for position, category in enumerate(city1['category_name'].unique(), start=1):
  print (position, category)

In [33]:
# Paste your top 3 category choices from the list above in place of the samples.
# Each variable is a boolean mask over df_groups for one chosen category.
cat1 = df_groups['category_name'].eq('Tech')
cat2 = df_groups['category_name'].eq('Career & Business')
cat3 = df_groups['category_name'].eq('Socializing')

In [34]:
print("Your top 10 recommendations based on the above chosen categories are:")
# (category mask, number of groups to take, rank of the first group printed):
# 5 groups from the first choice, 3 from the second, 2 from the third.
for cat_mask, how_many, first_rank in ((cat1, 5, 1), (cat2, 3, 6), (cat3, 2, 9)):
  # Largest groups of this category in the chosen city come first.
  ranked = df_groups.loc[city_mask & cat_mask].sort_values(by='members', ascending=False)
  for rank, name in enumerate(ranked['group_name'].head(how_many).values, start=first_rank):
    print (rank, name)

###### Part 2: The below code from this point is our take on the KNN model to recommend similar categories to the user from the ones that he already has.
###### This is not something that meetup has implemented as a straight up option and thus we feel this can be a huge value add.

In [36]:
# Attach group and category details to every membership row; memberships
# without a matching group in the same city are kept (left join).
df_category = df_members.merge(df_groups, how='left', on=('group_id','city'))
df_category.head()

Unnamed: 0,member_id,city,group_id,category_id,group_name,members,rating,category_name
0,3,New York,490552,2.0,NYC Customer Makers & Marketers Group,1711.0,4.67,Career & Business
1,3,New York,1474611,34.0,The NYC Gotham Developers Group,5846.0,4.54,Tech
2,3,New York,1490492,34.0,NY Tech Mixer,4628.0,4.28,Tech
3,3,New York,1515830,34.0,The Product Group,15635.0,4.71,Tech
4,3,New York,1574965,34.0,NYC Tech Talks,3207.0,4.57,Tech


In [37]:
# One row per (member, category) membership, flagged with a constant 1.
# An explicit copy avoids assigning into a slice of df_category.
# NOTE: this rebinds `df2`, which earlier held the New York frame used by
# recommend(); re-run the New York training cell before recommending again.
df2 = df_category[['member_id','category_name']].copy()
df2['Dummy'] = 1

In [38]:
# Member x category table: the aggregated Dummy flag where the member belongs
# to a group of that category, 0 where there is no such membership.
df = pd.pivot_table(
    df2,
    values='Dummy',
    index=['member_id'],
    columns=['category_name'],
)
df = df.fillna(0)
df.head()

category_name,Arts & Culture,Book Clubs,Career & Business,Cars & Motorcycles,Community & Environment,Dancing,Education & Learning,Fashion & Beauty,Fitness,Food & Drink,Games,Health & Wellbeing,Hobbies & Crafts,LGBT,Language & Ethnic Identity,Lifestyle,Movements & Politics,Movies & Film,Music,New Age & Spirituality,Outdoors & Adventure,Paranormal,Parents & Family,Pets & Animals,Photography,Religion & Beliefs,Sci-Fi & Fantasy,Singles,Socializing,Sports & Recreation,Support,Tech,Writing
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
82,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [39]:
# `.ix` has been removed from pandas; `.loc` performs the same label-based
# selection on the member_id index.
interest = df.loc[6] # this is the member_id for which we are identifying the k closest neighbors to match preference
Frame = df.loc[6:] # the member rows from label 6 onwards, used to build the knn model

#### We have attached the code for our KNN model below, this model will help recommend 5 new categories to new users based on their initial choice of 3 categories. The reason this is commented is that this takes 3-4 hours to compile on the 0.88 cores offered by the databricks community edition. This has been tested on jupyter notebooks and it works perfectly.

In [41]:
# Fit the nearest-neighbour model on the other members' category vectors.
# This could be refreshed daily or weekly by the Meetup dev team.

import sklearn
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=5)
nbrs = nbrs.fit(Frame)

In [42]:
# Get the k nearest neighbours of the member held in `interest`.

Near=(nbrs.kneighbors([interest]))
Near

# Near is (distances, indices). Take the indices for the single query row and
# skip the first hit, presumably the queried member itself (distance 0).
x = Near[1][0][1:5]

# The indices are row positions in Frame (the data the model was fitted on),
# so they must be translated through Frame's index, not df's.
a = [Frame.index.values[y] for y in x]
a

The code below prints the final recommendations, based on the groups of the closest members found above.

# Collect the group names of every neighbouring member.
# df_category holds the member_id / group_name columns; the pivoted `df` has
# member_id as its index and only category columns.
Suggest = list()
for z in a:
  neighbour_groups = df_category.loc[df_category['member_id'] == z, 'group_name']
  print(neighbour_groups)
  # Memberships without a matching group have no name; leave those out.
  Suggest.extend(neighbour_groups.dropna().tolist())