# I. EDA Functions #

In [None]:
import os
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def generate_table_summary(data, data_name):
    """Print the row count of ``data`` followed by its column dtypes.

    Args:
        data: Table to describe; must expose ``shape`` and ``dtypes``.
        data_name: Label used for the table in the printed row-count line.
    """
    row_count = data.shape[0]
    header_lines = (
        'The number of \'{}\' rows is {}'.format(data_name, row_count),
        '',
        'Column names and data types:',
    )
    for line in header_lines:
        print(line)
    print(data.dtypes)

def generate_cum_plot(data, date_col, data_name):
    """Plot, save and show the running count of rows in ``data`` over time.

    Every row contributes 1 at its ``date_col`` value; the ones are sorted by
    that value and cumulatively summed. The figure is saved under a relative
    file name built from the title with spaces and slashes removed, plus
    ``.jpg``.

    Args:
        data: Table holding one row per event.
        date_col: Name of the column in ``data`` used as the time axis.
        data_name: Label used for the plotted series and in the plot title.
    """
    ones = pd.DataFrame(1, index=data[date_col], columns=[data_name])
    ones.sort_index().cumsum().plot()
    plot_title = 'Cumulative Count of {} over Time'.format(data_name)
    plt.title(plot_title)
    plt.xlabel('Date')
    # The running total is on the y-axis; the x-axis keeps its 'Date' label.
    plt.ylabel('Cumulative Count')
    plt.savefig('{}.jpg'.format(plot_title.replace(' ', '').replace('/', '')))
    plt.show()
    
def create_group_by_counts(data,
                           group_by_col, group_by_name,
                           count_col, count_name,
                           hist_cut_off, bin_numbers):
    """Count ``count_col`` per ``group_by_col`` value, report and plot it.

    Prints the number of groups and the ten largest counts, then draws a
    histogram of the counts that are strictly below ``hist_cut_off``. The
    figure is saved under a relative file name built from the title with
    spaces and slashes removed, plus ``.jpg``.

    Args:
        data: Table to aggregate.
        group_by_col: Column whose values define the groups.
        group_by_name: Human-readable label for the groups.
        count_col: Column whose non-null entries are counted in each group.
        count_name: Human-readable label for the counted quantity.
        hist_cut_off: Only counts strictly below this value are plotted.
        bin_numbers: Number of histogram bins.

    Returns:
        Series of counts indexed by group value, sorted in descending order.
    """
    per_group = data.groupby(group_by_col)[count_col].count()
    group_by_counts = per_group.sort_values(ascending=False)
    n_groups = len(group_by_counts)
    print('The number of unique \'{} by {}\' is {}'.format(count_name, group_by_name, n_groups))
    print(group_by_counts.head(10))

    # Large counts are left out of the plot so the bulk of the distribution
    # stays readable; the returned Series still contains every group.
    below_cut_off = group_by_counts[group_by_counts < hist_cut_off]
    below_cut_off.hist(bins=bin_numbers)
    plot_title = 'Histogram of {} by {}'.format(group_by_name, count_name)
    file_name = '{}.jpg'.format(plot_title.replace(' ', '').replace('/', ''))
    plt.title(plot_title)
    plt.xlabel(count_name)
    plt.ylabel(group_by_name)
    plt.savefig(file_name)
    plt.show()

    return group_by_counts

def collect_actors_activities(actors, actor_id):
    """Build a date-by-actor table of daily contribution counts.

    A contribution is a row of the module-level ``questions``, ``answers`` or
    ``comments`` tables whose author id matches a value in
    ``actors[actor_id]``.

    Args:
        actors: Table of actors (e.g. professionals or students).
        actor_id: Name of the id column in ``actors``.

    Returns:
        DataFrame indexed by ``activity_date`` with one column per actor id
        that has at least one contribution. Cells hold the number of
        contributions on that date and are NaN where there were none.
    """
    # (table, author id column, timestamp column) for every contribution type.
    sources = (
        (questions, 'questions_author_id', 'questions_date_added'),
        (answers, 'answers_author_id', 'answers_date_added'),
        (comments, 'comments_author_id', 'comments_date_added'),
    )
    frames = []
    for table, author_col, time_col in sources:
        matched = actors[[actor_id]].merge(
            table[[author_col, time_col]],
            left_on=actor_id, right_on=author_col, how='inner')
        frames.append(
            matched[[actor_id, time_col]].rename(columns={time_col: 'activity_time'}))

    # Concatenating frames (instead of raw ``.values`` arrays) keeps the
    # timestamp dtype, so ``.dt`` below does not rely on dtype re-inference.
    activities = pd.concat(frames, axis=0, ignore_index=True)
    activities['activity_date'] = activities['activity_time'].dt.date

    daily_counts = (activities
                    .groupby([actor_id, 'activity_date'])['activity_time']
                    .count()
                    .reset_index())
    activities_df = daily_counts.pivot(
        values='activity_time', columns=actor_id, index='activity_date')
    return activities_df

def plot_temporal_active_actors(activities_df, activity_days, actor_name):
    """Plot, save and show the number of actors active in a trailing window.

    An actor counts as active at a row when the rolling sum of its column
    over the window is not NaN, i.e. the window holds at least one value.

    Args:
        activities_df: Table with one row per date and one column per actor;
            NaN marks "no activity".
        activity_days: Window length in days.
        actor_name: Label used in the plot title and y-axis label.
    """
    index = activities_df.index
    if isinstance(index, pd.DatetimeIndex) and index.is_monotonic_increasing:
        # Use a calendar window: the index may skip dates, in which case a
        # window of N rows would span more than N days.
        window = '{}D'.format(activity_days)
    else:
        # Offset windows need a sorted datetime index; otherwise fall back to
        # a window of ``activity_days`` rows.
        window = activity_days

    windowed_activity = activities_df.rolling(window=window, min_periods=1).sum()
    windowed_activity.count(axis=1).plot()
    plot_title = '{}-day windowed Active {} over Time'.format(activity_days, actor_name)
    plt.title(plot_title)
    plt.xlabel('Date')
    plt.ylabel('Active {}'.format(actor_name))
    plt.savefig('{}.jpg'.format(plot_title.replace(' ', '').replace('/', '')))
    plt.show()

# II. Data Tables #

In [None]:
# All input data sets live under "../input/"; print its entries, one per line.
input_dir = "../input"
available_entries = os.listdir(input_dir)
print("\n".join(available_entries))

In [None]:
# Load all data tables to explore #

career_village_dir = os.path.join(input_dir, 'data-science-for-good-careervillage')


def _load_table(table_name):
    """Read ``<table_name>.csv`` from the CareerVillage data directory."""
    return pd.read_csv(os.path.join(career_village_dir, table_name + '.csv'))


# Every table is read the same way; date columns are converted explicitly
# below rather than through ``read_csv``.
professionals = _load_table('professionals')
students = _load_table('students')
school_memberships = _load_table('school_memberships')
groups = _load_table('groups')
group_memberships = _load_table('group_memberships')
questions = _load_table('questions')
answers = _load_table('answers')
emails = _load_table('emails')
matches = _load_table('matches')
comments = _load_table('comments')
tags = _load_table('tags')
tag_users = _load_table('tag_users')
tag_questions = _load_table('tag_questions')

# Convert string dates to date time objects #

for table, date_col in (
        (professionals, 'professionals_date_joined'),
        (students, 'students_date_joined'),
        (questions, 'questions_date_added'),
        (answers, 'answers_date_added'),
        (emails, 'emails_date_sent'),
        (comments, 'comments_date_added'),
):
    table[date_col] = pd.to_datetime(table[date_col])

## II.1. Professionals ##

In [None]:
# Row count and dtypes first, then a sample of five rows.
generate_table_summary(data=professionals, data_name='professionals')
professionals.sample(n=5)

** Below is a cumulative count of professionals who joined the website over time. The number of professionals grows exponentially, with an acceleration period starting in 2016. **

In [None]:
# Running total of professionals, ordered by join date.
generate_cum_plot(
    data=professionals,
    date_col='professionals_date_joined',
    data_name='Professionals',
)

** The location distribution of these professionals is more concentrated at large cities across USA such as New York and California. ** 

In [None]:
# Number of professionals per registered location.
professionals_location_counts = create_group_by_counts(
    data=professionals,
    group_by_col='professionals_location',
    group_by_name='Locations',
    count_col='professionals_id',
    count_name='Professional Numbers',
    hist_cut_off=50,
    bin_numbers=50,
)

** The distribution of these professionals across industries is also skewed. A large portion of them are working in Information Technology,Telecommunications, and Computer Software. **

In [None]:
# Number of professionals per industry.
professionals_industry_counts = create_group_by_counts(
    data=professionals,
    group_by_col='professionals_industry',
    group_by_name='Industries',
    count_col='professionals_id',
    count_name='Professional Numbers',
    hist_cut_off=50,
    bin_numbers=50,
)

** Compared to professionals location and industry, the distribution of professionals headlines is the most skewed one. It is more likely that professionals are allowed to list their headlines in a free-text form. **

In [None]:
# Number of professionals per headline.
professionals_headline_counts = create_group_by_counts(
    data=professionals,
    group_by_col='professionals_headline',
    group_by_name='Headlines',
    count_col='professionals_id',
    count_name='Professional Numbers',
    hist_cut_off=50,
    bin_numbers=50,
)

** The three plots below explore the number of active professionals over yearly, monthly, and weekly rolling time windows. A professional is active if she or he makes at least one contribution to the website, i.e. a question, an answer, or a comment. Across all three rolling windows, CareerVillage enjoys healthy growth in active professionals, most notably from 2016 onwards. **

In [None]:
# Daily contribution counts per professional, re-indexed by timestamp.
professional_activities_df = collect_actors_activities(actors=professionals, actor_id='professionals_id')
professional_activities_df.index = pd.to_datetime(professional_activities_df.index)

# Yearly, monthly and weekly windows, in that order.
for window_days in (365, 30, 7):
    plot_temporal_active_actors(professional_activities_df, activity_days=window_days, actor_name='Professionals')

## II.2. Students ##

In [None]:
# Row count and dtypes first, then a sample of five rows.
generate_table_summary(data=students, data_name='students')
students.sample(n=5)

** Below is a cumulative count of students who joined the website over time. Similar to professionals, the number of students grows exponentially, with an acceleration period starting in 2016. **

In [None]:
# Running total of students, ordered by join date.
generate_cum_plot(
    data=students,
    date_col='students_date_joined',
    data_name='Students',
)

** The location distribution of these students is also more concentrated at some large cities across USA such as New York and California and India such as Bengaluru and Hyderabad. For career advice questions that are country-specific, students in India could be underserved by their country's professionals. The total number of professionals who registered locations in India is 1231. ** 

In [None]:
# Number of students per registered location.
students_location_counts = create_group_by_counts(
    data=students,
    group_by_col='students_location',
    group_by_name='Locations',
    count_col='students_id',
    count_name='Student Numbers',
    hist_cut_off=50,
    bin_numbers=50,
)

In [None]:
# Case-insensitive substring match on the location text; missing locations
# are skipped. Vectorised string methods avoid a Python-level call per row.
located_in_india = (professionals['professionals_location']
                    .dropna()
                    .str.lower()
                    .str.contains('india', regex=False))
print('The number of professionals in India is {}'.format(located_in_india.sum()))

The number of professionals in India is 1231

** The three plots below explore the number of active students over yearly, monthly, and weekly rolling time windows. A student is active if she or he makes at least one contribution to the website, i.e. a question, an answer, or a comment. The monthly and weekly numbers of active students are low, but it could be that after joining the website for career advice and collecting relevant information, they move on and focus on building their careers. **

In [None]:
# Daily contribution counts per student, re-indexed by timestamp.
student_activities_df = collect_actors_activities(actors=students, actor_id='students_id')
student_activities_df.index = pd.to_datetime(student_activities_df.index)

# Yearly, monthly and weekly windows, in that order.
for window_days in (365, 30, 7):
    plot_temporal_active_actors(student_activities_df, activity_days=window_days, actor_name='Students')

## II.3. School Memberships ##

In [None]:
# Row count and dtypes first, then the first three rows.
generate_table_summary(data=school_memberships, data_name='school memberships')
school_memberships.head(n=3)

** Not many website members register their school memberships. The average number of members per school is relatively small. Most schools have 1 or 2 members. **

In [None]:
# Number of registered users per school.
school_memberships_counts = create_group_by_counts(
    data=school_memberships,
    group_by_col='school_memberships_school_id',
    group_by_name='Schools',
    count_col='school_memberships_user_id',
    count_name='User Numbers',
    hist_cut_off=50,
    bin_numbers=50,
)

## II.4. Groups ##

In [None]:
# Row count and dtypes first, then the first three rows.
generate_table_summary(data=groups, data_name='groups')
groups.head(n=3)

** The number of groups is low and most of them belong to 'youth program.' **

In [None]:
# Number of groups per group type.
group_type_counts = create_group_by_counts(
    data=groups,
    group_by_col='groups_group_type',
    group_by_name='Group Types',
    count_col='groups_id',
    count_name='Group Numbers',
    hist_cut_off=50,
    bin_numbers=50,
)

## II.5. Group Memberships ##

In [None]:
# Row count and dtypes first, then the first three rows.
generate_table_summary(data=group_memberships, data_name='group memberships')
group_memberships.head(n=3)

** There are some large groups with more than 30 members. **

In [None]:
# Number of member users per group.
group_memberships_counts = create_group_by_counts(
    data=group_memberships,
    group_by_col='group_memberships_group_id',
    group_by_name='Groups',
    count_col='group_memberships_user_id',
    count_name='User Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)

## II.6. Questions ##

In [None]:
# Row count and dtypes first, then a sample of three rows.
generate_table_summary(data=questions, data_name='questions')
questions.sample(n=3)

** Below is a cumulative count of questions created over time. Similar to the growth of students and professionals, the number of questions grows exponentially, with an acceleration period starting in 2016. **

In [None]:
# Running total of questions, ordered by creation date.
generate_cum_plot(
    data=questions,
    date_col='questions_date_added',
    data_name='Questions',
)

** Not all students who join the website ask for career advice. Only 12329 students out of 30971 have at least one question. On average, a student who actively seeks career advice asks 2 questions. However, the distribution is very skewed: the most active students can have more than 40 questions. **

In [None]:
# Mean number of questions over the authors who asked at least one.
questions_per_author = questions.groupby('questions_author_id')['questions_id'].count()
average_questions_per_author = round(questions_per_author.mean(), 0)
print('The average number of questions that a student asks on the website is {}'.format(average_questions_per_author))

questions_authors_counts = create_group_by_counts(
    data=questions,
    group_by_col='questions_author_id',
    group_by_name='Users',
    count_col='questions_id',
    count_name='Question Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)

## II.7. Answers ##

In [None]:
# Row count and dtypes first, then a sample of three rows.
generate_table_summary(data=answers, data_name='answers')
answers.sample(n=3)

** Below is a cumulative count of answers over time. Matching the growth in questions, the number of answers grows exponentially, with an acceleration period starting in 2016. **

In [None]:
# Running total of answers, ordered by creation date.
generate_cum_plot(
    data=answers,
    date_col='answers_date_added',
    data_name='Answers',
)

** Each contributing professional on average creates 5 answers. However, the distribution is skewed since some professionals have contributed more than 200 answers. **

In [None]:
# Mean number of answers over the authors who wrote at least one.
answers_per_author = answers.groupby('answers_author_id')['answers_id'].count()
average_answers_per_author = round(answers_per_author.mean(), 0)
print('The average number of answers for each contributing professional {}'.format(average_answers_per_author))

answers_authors_counts = create_group_by_counts(
    data=answers,
    group_by_col='answers_author_id',
    group_by_name='Users',
    count_col='answers_id',
    count_name='Answer Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)

** Each question that received a response has 2 answers on average. However, the distribution is also skewed, since some questions have more than 20 answers. **

In [None]:
# Mean number of answers over the questions that received at least one.
answers_per_question = answers.groupby('answers_question_id')['answers_id'].count()
average_answers_per_question = round(answers_per_question.mean(), 0)
print('The average number of answers for each responsed question {}'.format(average_answers_per_question))

answers_questions_counts = create_group_by_counts(
    data=answers,
    group_by_col='answers_question_id',
    group_by_name='Questions',
    count_col='answers_id',
    count_name='Answer Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)
answers_questions_counts.describe()

** Some students also contribute answers to the website. **

In [None]:
# Share of answers whose author id appears in each actor table.
total_answers = answers.shape[0]

answers_by_professionalss = answers.merge(
    professionals, left_on='answers_author_id', right_on='professionals_id', how='inner')
professionals_share = round(100.0 * answers_by_professionalss.shape[0] / total_answers, 2)
print('The percentage of answers from professionals is {}'.format(professionals_share))

answers_by_students = answers.merge(
    students, left_on='answers_author_id', right_on='students_id', how='inner')
students_share = round(100.0 * answers_by_students.shape[0] / total_answers, 2)
print('The percentage of answers from students is {}'.format(students_share))

** Below we explore some characteristics of members who answered questions from July 2018 onwards, our selected test period for the recommendation performance discussed later in other notebooks. **

** Around 10% of question and answer data will be used for performance test. **

In [None]:
# Everything added on or after this date belongs to the test period.
snap_shot_date = dt.datetime(2018, 7, 1)
print('The snapshot date is {}'.format(snap_shot_date))

added_in_test_period = questions['questions_date_added'] >= snap_shot_date
questions_july_2018 = questions[added_in_test_period]
n_test_questions = questions_july_2018.shape[0]
print('The number of questions after the snap shot date is {}'.format(n_test_questions))
questions_share = round((100.0 * n_test_questions) / questions.shape[0], 2)
print('Compared to the full data set, the percentage of questions after the snap shot date is {}%'.format(questions_share))

answered_in_test_period = answers['answers_date_added'] >= snap_shot_date
answers_july_2018 = answers[answered_in_test_period]
n_test_answers = answers_july_2018.shape[0]
print('The number of answers after the snap shot date is {}'.format(n_test_answers))
answers_share = round((100.0 * n_test_answers) / answers.shape[0], 2)
print('Compared to the full data set, the percentage of answers after the snap shot date is {}%'.format(answers_share))

** Professionals who contributed answers in the test period are more active than others in the past. This analysis demonstrates the predictive power of rolling window activity counts. **

In [None]:
# Keep only activity strictly before the snapshot date. Selecting by date
# rather than by row position keeps the last pre-snapshot day and does not
# require the snapshot date itself to be present in the index.
activity_dates = professional_activities_df.index
professional_activities_df_before_july_2018_df = professional_activities_df.loc[activity_dates < snap_shot_date]
dates_before_snapshot = professional_activities_df_before_july_2018_df.index

# All-time activity count per professional up to the snapshot.
professional_activities_df_before_july_2018 = pd.DataFrame(
    professional_activities_df_before_july_2018_df.sum(axis=0).values,
    index=professional_activities_df_before_july_2018_df.columns,
    columns=['100000_day_activity_count'])

# Trailing windows are measured in calendar days, not in index rows.
for look_back_days in (365, 30):
    window_start = snap_shot_date - dt.timedelta(days=look_back_days)
    in_window = dates_before_snapshot >= window_start
    professional_activities_df_before_july_2018['{}_day_activity_count'.format(look_back_days)] = (
        professional_activities_df_before_july_2018_df.loc[in_window].sum(axis=0))

# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(professional_activities_df_before_july_2018.dropna().sort_values(
    by='100000_day_activity_count', ascending=False).head(10))

# Attach the pre-snapshot counts to every test-period answer; rows without an
# answer id come from professionals with no test-period answer and are dropped.
activities_answers_july_2018 = answers_july_2018.merge(
    professional_activities_df_before_july_2018.reset_index(),
    left_on='answers_author_id', right_on='professionals_id', how='outer')
professionals_with_answers_july_2018 = activities_answers_july_2018.dropna(
    subset=['answers_id'])[professional_activities_df_before_july_2018.columns]
print('Activities of Professionals with Answers:\n{}'.format(professionals_with_answers_july_2018.mean()))
print('Activities of Professionals Overall:\n{}'.format(professional_activities_df_before_july_2018.mean()))

## II.8. Emails ##

In [None]:
# Row count and dtypes first, then a sample of five rows.
generate_table_summary(data=emails, data_name='emails')
emails.sample(n=5)

** [The current tag-based recommendation system](https://www.kaggle.com/c/data-science-for-good-careervillage/discussion/84845#latest-510508) sends questions to professionals through emails. Together with the growth in questions, the number of emails sent also increases exponentially over time. As questions keep increasing, professionals can be overwhelmed by the requests. Therefore, building a good recommendation engine is very important to reduce the number of recommendations and emails sent to professionals and to keep them engaged. **

In [None]:
# Running total of emails, ordered by send date.
generate_cum_plot(
    data=emails,
    date_col='emails_date_sent',
    data_name='Emails',
)

** Professionals can register for different email frequency levels. Most of them are on the daily or immediately basis. **

In [None]:
# Bar chart of the number of emails per frequency level, saved as a JPEG.
emails_per_frequency_level = emails.groupby(['emails_frequency_level'])['emails_id'].count()
emails_per_frequency_level.plot.bar()
plt.savefig('volumes_of_each_emails_frequency_level.jpg')

** Since professionals join the website at different times and can register for 3 different email frequency levels, the distribution of the number of emails that each professional receives is also skewed. ** 

In [None]:
# Number of emails per recipient.
emails_recipients_counts = create_group_by_counts(
    data=emails,
    group_by_col='emails_recipient_id',
    group_by_name='Users',
    count_col='emails_id',
    count_name='Email Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)

## II.9. Matches ##

** Matches are recommendations sent to professionals in emails. **

In [None]:
# Row count and dtypes first, then the first three rows.
generate_table_summary(data=matches, data_name='matches')
matches.head(n=3)

** The average number of matches (i.e. recommended questions) in each email is 2.0. Most of them have at most 3 recommendations. **

In [None]:
# Number of recommended questions per email.
matches_emails_counts = create_group_by_counts(
    data=matches,
    group_by_col='matches_email_id',
    group_by_name='Emails',
    count_col='matches_question_id',
    count_name='Question Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)
average_questions_per_email = round(matches_emails_counts.mean(), 0)
print('The average number of recommended questions in each email is {}'.format(average_questions_per_email))

emails_with_at_most_three = matches_emails_counts[matches_emails_counts <= 3]
print('The number of emails that have at most 3 recommendations is {}'.format(emails_with_at_most_three.shape[0]))

** The average number of emails sent for each question is 197.0 **

In [None]:
# Number of emails in which each question was recommended.
matches_questions_counts = create_group_by_counts(
    data=matches,
    group_by_col='matches_question_id',
    group_by_name='Questions',
    count_col='matches_email_id',
    count_name='Email Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)
average_emails_per_question = round(matches_questions_counts.mean(), 0)
print('The average number of emails sent for each question is {}'.format(average_emails_per_question))

** In many inactive days, professionals receive no recommendation emails. Since some professionals register for the immediate email frequency (i.e. the possibility of receiving multiple emails on a day), on an active day a professional could receive 7 questions on average. ** 

In [None]:
# Load the example table and count its questions per (answer_user_id, send date).
ml_data_dir = os.path.join(input_dir, 'cv-machine-learning-data-construction')
examples_path = os.path.join(ml_data_dir, 'positive_negative_examples.parquet.gzip')
examples = pd.read_parquet(examples_path)
examples['emails_date'] = examples['emails_date_sent'].dt.date

questions_per_user_day = examples.groupby(['answer_user_id', 'emails_date'])['questions_id'].count()
matches_per_email = questions_per_user_day.reset_index()

In [None]:
# Each row of matches_per_email is one (professional, date) pair, so the
# histogram counts professional-days by the number of questions received.
print('The average number of questions that a professional could receive on an active day is {}'.format(round(matches_per_email['questions_id'].mean())))
# Pairs with more than 50 questions are left out of the plot.
matches_per_email[matches_per_email['questions_id'] <= 50]['questions_id'].hist(bins=50)
plot_title = 'The distribution of questions in emails per professional in an active day'
plt.title(plot_title)
plt.xlabel('Number of questions on an active day')
# Label the y-axis instead of overwriting the x label; a histogram's
# frequencies are not cumulative.
plt.ylabel('Number of professional-days')
plt.savefig('questions_in_emails_per professional_in_an_active_day.jpg')
plt.show()

## II.10. Comments ##

In [None]:
# Row count and dtypes first, then a sample of five rows.
generate_table_summary(data=comments, data_name='comments')
comments.sample(n=5)

** Commenting is less active than questioning and answering. ** 

In [None]:
# Running total of comments, ordered by creation date.
generate_cum_plot(
    data=comments,
    date_col='comments_date_added',
    data_name='Comments',
)

** The comment distribution across members is also skewed. Some made more than 100 comments while most of them have only one comment. **

In [None]:
# Number of comments per author, followed by summary statistics.
comments_authors_counts = create_group_by_counts(
    data=comments,
    group_by_col='comments_author_id',
    group_by_name='Users',
    count_col='comments_id',
    count_name='Comment Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)
comments_authors_counts.describe()

** The comment distribution across questions/answers is also skewed. **

In [None]:
# Number of comments per parent question/answer, followed by summary statistics.
comments_qa_counts = create_group_by_counts(
    data=comments,
    group_by_col='comments_parent_content_id',
    group_by_name='Questions/Answers',
    count_col='comments_id',
    count_name='Comment Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)
comments_qa_counts.describe()

## II.11. Tags ##

In [None]:
# Row count and dtypes first, then a sample of ten rows.
generate_table_summary(data=tags, data_name='tags')
tags.sample(n=10)

** Top 10 tags used by questions **

In [None]:
# Join tag names onto question tags, count questions per tag name, keep the top ten.
question_tags = tags.merge(tag_questions, left_on='tags_tag_id', right_on='tag_questions_tag_id')
questions_per_tag = question_tags.groupby('tags_tag_name')['tag_questions_question_id'].count()
questions_per_tag.sort_values(ascending=False).head(10)

** Top 10 tags registered by users **

In [None]:
# Join tag names onto user tags, count users per tag name, keep the top ten.
user_tags = tags.merge(tag_users, left_on='tags_tag_id', right_on='tag_users_tag_id')
users_per_tag = user_tags.groupby('tags_tag_name')['tag_users_user_id'].count()
users_per_tag.sort_values(ascending=False).head(10)

## II.12. Tag Users ##

In [None]:
# Row count and dtypes first, then the first three rows.
generate_table_summary(data=tag_users, data_name='tag users')
tag_users.head(n=3)

** The distribution of the numbers of tags registered by each user is skewed. Most users register for a very small number of tags. For example, the number of users who registered at most 3 tags is 18142. This sparsity of tag information could be one of the reasons to move away from tag-based recommendation methods. Historical activities and texts could provide more information to match questions to professionals. **

In [None]:
# Number of tags registered per user.
tags_users_counts = create_group_by_counts(data=tag_users,
                                           group_by_col='tag_users_user_id', group_by_name='Users',
                                           count_col='tag_users_tag_id', count_name='Tag Numbers',
                                           hist_cut_off=200, bin_numbers=100)
# Printed explicitly: a bare expression in the middle of a cell is not shown.
print(tags_users_counts.describe())
# Series.sum() adds up the boolean mask without a Python-level loop.
print('The number of tag registering users having at most 3 tags is {}'.format((tags_users_counts <= 3).sum()))

** Most of professionals have registered for their interested tags, i.e. 25594 out of 28152 professionals. **

In [None]:
# Number of rows where a professional id matches a tag-registering user id.
professionals_with_tags = professionals.merge(
    tags_users_counts.reset_index(),
    left_on='professionals_id',
    right_on='tag_users_user_id',
    how='inner',
)
professionals_with_tags.shape[0]

** Only a smaller portion of students registered their interested tags, i.e. 4608 out of 30971 students. **

In [None]:
# Number of rows where a student id matches a tag-registering user id.
students_with_tags = students.merge(
    tags_users_counts.reset_index(),
    left_on='students_id',
    right_on='tag_users_user_id',
    how='inner',
)
students_with_tags.shape[0]

## II.13. Tag Questions ##

In [None]:
# Row count and dtypes first, then the first three rows.
generate_table_summary(data=tag_questions, data_name='tag questions')
tag_questions.head(n=3)

** Each question on average has 3 tags. The tag distribution across questions is also skewed. **

In [None]:
# Number of tags per question, followed by summary statistics.
tag_questions_counts = create_group_by_counts(
    data=tag_questions,
    group_by_col='tag_questions_question_id',
    group_by_name='Questions',
    count_col='tag_questions_tag_id',
    count_name='Tag Numbers',
    hist_cut_off=200,
    bin_numbers=100,
)
tag_questions_counts.describe()