### Meetup.com Recommendation Systems
### Data Pre-processing 

In [1]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import math
from datetime import datetime

#### Load the Datasets

In [2]:
members = pd.read_csv('members_new.csv',encoding='latin-1')
rsvp = pd.read_csv('rsvps_all_new.csv',encoding='latin-1')
events  = pd.read_csv('events_all_new.csv',encoding='latin-1')
groups = pd.read_csv('groups_austin.csv',encoding='latin-1')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
events['id'].nunique()

35629

In [3]:
## Set to display all the columns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [6]:
members.nunique()

Unnamed: 0                              31362
Unnamed: 0.1                            31362
bio                                    121753
city                                     4503
country                                   138
group_id                                 2273
hometown                                 9047
id                                     299630
joined                                1157139
lang                                        1
lat                                      2947
link                                   299189
lon                                      4630
name                                   220975
other_services.facebook.identifier      14306
other_services.flickr.identifier          675
other_services.linkedin.identifier       1436
other_services.tumblr.identifier          783
other_services.twitter.identifier        9785
photo.base_url                              1
photo.highres_link                     218872
photo.photo_id                    

#### Members, Groups, Events and RSVP
Get all the required columns from the dataframes

In [44]:
members = members[['group_id','id','name','joined','visited','topics']]
members['id'] = members['id'].astype(int)

In [45]:
members.head()

Unnamed: 0,group_id,id,name,joined,visited,topics
0,10023,179008112,...,1492474000000.0,1492474000000.0,[]
1,10023,4099389,*!* Tonya P. *!*,1552514000000.0,1563500000000.0,"[{'urlkey': 'ballroomdance', 'name': 'Ballroom..."
2,10023,7436497,** Marcia S. **,1213335000000.0,1213853000000.0,[]
3,10023,291431587,1Barret1WemTD,1570863000000.0,1571010000000.0,[]
4,10023,179208612,A,1414026000000.0,1426009000000.0,[]


In [7]:
groups = groups[['category.id','description','id','members','name','rating','topics','urlname','who']]

In [10]:
events = events[['group.id','group.urlname','group.name','group.who','id']]

In [12]:
rsvp = rsvp[['event.id','group.id','member.member_id','response','rsvp_id']]

#### Number of events organized by each group

In [13]:
## Get the number of events in each group
group_events = pd.DataFrame(events.groupby('group.id')["id"].nunique()).reset_index()
group_events = group_events.rename(columns = {'id':'count_of_events'})

#### Number of RSVPs for each member per group
This is the number of events of a particular group that a member has RSVP'd for.

In [14]:
rsvp_data = pd.DataFrame(rsvp.groupby(['group.id','member.member_id'])['rsvp_id'].count())
rsvp_data = rsvp_data.reset_index().rename(columns = {'rsvp_id':'count_of_rsvps'})

#### Merge the event count and RSVP information with the members dataframe

In [15]:
## Merge members and rsvp_data on member_id and group_id to get the RSVP information for each member and group.
members_rsvp = members.merge(rsvp_data, left_on =['group_id','id'],right_on=['group.id','member.member_id'], how='left')
members_rsvp = members_rsvp[['name','group_id',"id","count_of_rsvps"]]

In [17]:
## Merge the new dataset with group_events to get the number of events organized by each group
members_rsvp = members_rsvp.merge(group_events, left_on ='group_id', right_on ='group.id')

In [18]:
members_rsvp["count_of_rsvps"].fillna(0, inplace=True)
members_rsvp["count_of_events"].fillna(0, inplace=True)

#### Calculate the number of events of a group that a member has RSVP'd for by the total number of events organized by the group

In [19]:
members_rsvp["rsvp_for_events"] = members_rsvp["count_of_rsvps"]/members_rsvp["count_of_events"]
members_rsvp = members_rsvp[["group_id","id","rsvp_for_events",'name']]

In [20]:
members_rsvp["rsvp_for_events"].fillna(0, inplace=True)

In [22]:
## Scale this ratio to be between 1 and 10
members_rsvp['rsvp_total'] = 1 + (members_rsvp['rsvp_for_events']-members_rsvp['rsvp_for_events'].min())*(10-1)/(members_rsvp['rsvp_for_events'].max()-members_rsvp['rsvp_for_events'].min())

#### Save this into a .csv file for further processing

In [24]:
members_rsvp.to_csv("members_rsvp.csv")

### Time-delta implicit recommendation
#### Convert the joined and visited date to datetime format

In [53]:
members_delta = members.dropna(subset = ['joined'])

In [55]:
members_delta["joined"] = members_delta["joined"].apply(lambda x: datetime.fromtimestamp(float(x)/1000.))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [57]:
members_delta = members_delta.dropna(subset = ['visited'])

In [58]:
members_delta["visited"] = members_delta["visited"].apply(lambda x: datetime.fromtimestamp(float(x)/1000.))

In [59]:
members_delta.head()

Unnamed: 0,group_id,id,name,joined,visited,topics
0,10023,179008112,...,2017-04-17 19:03:08,2017-04-17 19:03:08,[]
1,10023,4099389,*!* Tonya P. *!*,2019-03-13 16:55:30,2019-07-18 20:29:36,"[{'urlkey': 'ballroomdance', 'name': 'Ballroom..."
2,10023,7436497,** Marcia S. **,2008-06-13 00:30:28,2008-06-19 00:20:30,[]
3,10023,291431587,1Barret1WemTD,2019-10-12 01:49:03,2019-10-13 18:48:15,[]
4,10023,179208612,A,2014-10-22 19:54:05,2015-03-10 12:30:53,[]


In [60]:
members_delta['joined'] = pd.to_datetime(members_delta['joined'])
members_delta['visited'] = pd.to_datetime(members_delta['visited'])

In [61]:
members_delta['delta'] = (members_delta['visited'] - members_delta['joined']).astype('timedelta64[M]')
members_delta['delta'] = members_delta['delta'].replace(to_replace = 0.0, value = 1.0)

In [64]:
## Scale this ratio to be between 1 and 10
members_delta['delta'] = 1 + (members_delta['delta']-members_delta['delta'].min())*(10-1)/(members_delta['delta'].max()-members_delta['delta'].min())

In [65]:
## Get the required columns
members_delta = members_delta[['group_id','id','joined','visited','delta']]

#### Save this into a .csv file for further processing

In [66]:
members_delta.to_csv("members_delta.csv")