In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the four raw CSV exports (churn, comment interactions, comments, votes).
# Paths are relative to the notebook's working directory.
churn, interactions, comments, votes = (
    pd.read_csv(csv_path)
    for csv_path in (
        'daily-happiness-employee-turnover/churn.csv',
        'daily-happiness-employee-turnover/commentInteractions.csv',
        'daily-happiness-employee-turnover/comments_clean_anonimized.csv',
        'daily-happiness-employee-turnover/votes.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Distinct company aliases present in churn, wrapped in a Series so that each
# alias has a positional index.
companies = pd.Series(pd.unique(churn['companyAlias']))

In [4]:
# Since the companyAlias is too long, replace it with an integer index for easy
# identification of the company.
# The alias -> position lookup is built once; previously every row rebuilt a
# Python list from `companies` and scanned it linearly. Aliases that do not
# occur in `companies` (and missing aliases) are encoded as -1, as before.
alias_to_index = {
    alias: position
    for position, alias in enumerate(companies)
    if pd.notna(alias)
}
for frame in (churn, interactions, comments, votes):
    frame['companyAlias'] = (
        frame['companyAlias']
        .map(alias_to_index)   # unknown aliases become NaN
        .fillna(-1)
        .astype('int64')
    )

In [5]:
# Preview the last rows of churn after companyAlias was re-encoded.
churn.tail()

Unnamed: 0,employee,companyAlias,numVotes,lastParticipationDate,stillExists
4842,857,35,1,Fri Mar 17 15:43:58 CET 2017,True
4843,858,35,1,Fri Mar 17 17:00:00 CET 2017,True
4844,859,35,1,Fri Mar 17 17:01:54 CET 2017,True
4845,17,36,7,Tue Jun 28 02:00:00 CEST 2016,True
4846,19,36,1,Mon Jul 04 19:02:03 CEST 2016,True


In [6]:
# Employee numbers are only unique within a company, so build a string key of
# the form "<companyAlias>_<employee>" in each of the four tables.
for frame in (votes, churn, interactions, comments):
    company_part = frame['companyAlias'].astype(str)
    employee_part = frame['employee'].astype(str)
    frame['employee'] = company_part + "_" + employee_part

In [7]:
# The raw timestamps carry a CET/CEST label (e.g. "Fri Mar 17 15:43:58 CET 2017").
# Remove the label and parse the remainder as a naive datetime.
# NOTE(review): discarding the label ignores the CET/CEST offset - confirm this
# is acceptable for the day-level features derived later.
timestamp_format = "%a %b %d %H:%M:%S %Y"
for frame, date_column in (
    (churn, 'lastParticipationDate'),
    (comments, 'commentDate'),
    (votes, 'voteDate'),
):
    without_zone = frame[date_column].str.replace('CET', '').str.replace('CEST', '')
    frame[date_column] = pd.to_datetime(without_zone, format=timestamp_format)

In [8]:
# Most recent lastParticipationDate found in churn; displayed as the cell output.
max_date = churn['lastParticipationDate'].max()
max_date

Timestamp('2017-03-20 21:25:54')

In [9]:
# Feature days_since_last_activity: whole days between max_date and the
# employee's lastParticipationDate.
churn['days_since_last_activity'] = (max_date - churn['lastParticipationDate']).dt.days
# Convert stillExists True/False to 1/0.
# Assign the column directly: writing through churn.loc[:, ['stillExists']]
# may, depending on the pandas version, update the existing column in place
# and keep its original dtype instead of storing integers.
churn['stillExists'] = churn['stillExists'].astype(int)
# Remove lastParticipationDate (rebinding rather than mutating in place).
churn = churn.drop('lastParticipationDate', axis=1)

In [10]:
# Preview churn with the new days_since_last_activity column and without lastParticipationDate.
churn.head()

Unnamed: 0,employee,companyAlias,numVotes,stillExists,days_since_last_activity
0,0_512,0,4,1,25
1,0_-2,0,0,0,61
2,0_2,0,72,1,3
3,0_487,0,14,0,121
4,0_3,0,22,1,32


In [11]:
# Preview the comment-interaction rows (one row per employee/commentId pair shown here).
interactions.head()

Unnamed: 0,employee,companyAlias,liked,disliked,commentId
0,0_307,0,True,False,58d018d7e010990004e38070
1,0_36,0,True,False,58d018d7e010990004e38070
2,0_276,0,True,False,58d018d7e010990004e38070
3,0_24,0,True,False,58d018d7e010990004e38070
4,0_382,0,True,False,58d0179ae010990004e3806d


In [12]:
# Drop rows where companyAlias is -1 (alias not found among the churn companies).
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger SettingWithCopyWarning.
interactions = interactions[interactions['companyAlias'] != -1].copy()
# Convert True/False to 1/0. Direct column assignment replaces the column and
# therefore stores the integer dtype; assignment through .loc[:, [...]] may
# keep the previous dtype on some pandas versions.
interactions['liked'] = interactions['liked'].astype(int)
interactions['disliked'] = interactions['disliked'].astype(int)

In [13]:
# How social are employees? Sum the liked / disliked flags per employee.
reaction_totals = interactions.groupby('employee').agg({'liked': 'sum', 'disliked': 'sum'})
emp_interactions_1 = reaction_totals.reset_index().rename(
    columns={'liked': 'total_liked', 'disliked': 'total_disliked'}
)
emp_interactions_1.head()

Unnamed: 0,employee,total_liked,total_disliked
0,0_-2,3,0
1,0_-4,4,2
2,0_-6,2,0
3,0_10,29,6
4,0_100,288,2


In [14]:
# Preview the comments table before the txt column is replaced by its length.
comments.head()

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
0,0_307,0,58d018d7e010990004e38070,**********************************************...,4.0,0.0,2017-03-20 19:00:17
1,0_382,0,58d0179ae010990004e3806d,*****************************,1.0,2.0,2017-03-20 18:55:16
2,0_172,0,58cff8cde010990004e37f6a,***************************,3.0,0.0,2017-03-20 16:44:02
3,0_135,0,58cfefeee010990004e37f60,***************************,1.0,1.0,2017-03-20 16:06:08
4,0_225,0,58cfd9b4e010990004e37f52,*********************************,3.0,2.0,2017-03-20 14:30:50


In [15]:
# The txt column is hardly useful as text; keep only the comment length as a feature.
# Rows without a commentDate are dropped first. .copy() makes the result an
# independent frame so the assignment below cannot hit SettingWithCopyWarning.
comments = comments.dropna(subset=['commentDate']).copy()
# txt has many NaNs. Such comments may have been an emoji or gif (they still
# received likes/dislikes), so treat them as empty text, i.e. length 0.
# .str.len() is the vectorised form of applying len() to every value.
comments['txt'] = comments['txt'].fillna('').str.len()

In [16]:
# Preview comments again: txt now holds the comment length.
comments.head()

Unnamed: 0,employee,companyAlias,commentId,txt,likes,dislikes,commentDate
0,0_307,0,58d018d7e010990004e38070,62,4.0,0.0,2017-03-20 19:00:17
1,0_382,0,58d0179ae010990004e3806d,29,1.0,2.0,2017-03-20 18:55:16
2,0_172,0,58cff8cde010990004e37f6a,27,3.0,0.0,2017-03-20 16:44:02
3,0_135,0,58cfefeee010990004e37f60,27,1.0,1.0,2017-03-20 16:06:08
4,0_225,0,58cfd9b4e010990004e37f52,33,3.0,2.0,2017-03-20 14:30:50


In [17]:
# How social are employees? Per employee: number of comments, mean comment
# length, and the summed likes / dislikes recorded on their comments.
comment_stats = comments.groupby('employee').agg(
    {'commentId': 'count', 'txt': 'mean', 'likes': 'sum', 'dislikes': 'sum'}
)
emp_interactions_2 = comment_stats.reset_index().rename(
    columns={
        'commentId': 'total_comments',
        'txt': 'avg_comment_len',
        'likes': 'likes_received',
        'dislikes': 'dislikes_received',
    }
)
# Mean length rounded to a whole number of characters.
emp_interactions_2['avg_comment_len'] = emp_interactions_2['avg_comment_len'].round()
emp_interactions_2.head()

Unnamed: 0,employee,total_comments,avg_comment_len,likes_received,dislikes_received
0,0_-3,1,92.0,6.0,0.0
1,0_-4,22,54.0,116.0,46.0
2,0_-5,13,203.0,66.0,72.0
3,0_-7,3,117.0,10.0,1.0
4,0_10,6,86.0,16.0,20.0


In [18]:
# Combine the like/dislike totals with the comment statistics per employee.
# NOTE(review): this is a left join from emp_interactions_1, so employees that
# appear only in emp_interactions_2 are not carried over - confirm intended.
# Employees without a match never wrote a comment, so their comment columns are set to 0.
emp_social_data = (
    emp_interactions_1
    .merge(emp_interactions_2, on='employee', how='left')
    .fillna(0)
)

In [19]:
# Preview the combined per-employee social activity table.
emp_social_data.head()

Unnamed: 0,employee,total_liked,total_disliked,total_comments,avg_comment_len,likes_received,dislikes_received
0,0_-2,3,0,0.0,0.0,0.0,0.0
1,0_-4,4,2,22.0,54.0,116.0,46.0
2,0_-6,2,0,0.0,0.0,0.0,0.0
3,0_10,29,6,6.0,86.0,16.0,20.0
4,0_100,288,2,60.0,64.0,326.0,50.0


In [20]:
# Attach the churn columns to each employee's social activity (left join on employee).
emp_social_churn_data = emp_social_data.merge(churn, on='employee', how='left')
emp_social_churn_data.head()

Unnamed: 0,employee,total_liked,total_disliked,total_comments,avg_comment_len,likes_received,dislikes_received,companyAlias,numVotes,stillExists,days_since_last_activity
0,0_-2,3,0,0.0,0.0,0.0,0.0,0,0,0,61
1,0_-4,4,2,22.0,54.0,116.0,46.0,0,0,0,133
2,0_-6,2,0,0.0,0.0,0.0,0.0,0,0,0,146
3,0_10,29,6,6.0,86.0,16.0,20.0,0,56,1,10
4,0_100,288,2,60.0,64.0,326.0,50.0,0,187,1,0


In [21]:
# Preview the votes table (employee key, company index, parsed voteDate, vote).
votes.head()

Unnamed: 0,employee,companyAlias,voteDate,vote
0,0_31,0,2016-02-01 01:00:00,4
1,0_33,0,2016-02-01 01:00:00,4
2,0_79,0,2016-02-01 01:00:00,4
3,0_94,0,2016-02-01 01:00:00,4
4,0_16,0,2016-02-01 01:00:00,2


In [22]:
# How happy are employees? Mean vote per employee, rounded to an integer.
mean_vote_by_employee = votes.groupby('employee')['vote'].mean()
emp_votes = (
    mean_vote_by_employee
    .map(lambda mean_vote: int(round(mean_vote)))
    .rename('avg_vote')
    .reset_index()
)

In [23]:
# Preview the rounded average vote per employee.
emp_votes.head()

Unnamed: 0,employee,avg_vote
0,0_10,3
1,0_100,4
2,0_101,3
3,0_102,4
4,0_103,4


In [24]:
# Complete employee profile: social activity + churn fields + average vote.
emp_profile = pd.merge(emp_social_churn_data, emp_votes, on=['employee'], how='left')
# A missing avg_vote means the employee has no row in emp_votes (never voted),
# so impute 0 for that column only. A blanket fillna(0) would also turn any
# missing churn fields (companyAlias, numVotes, stillExists,
# days_since_last_activity) into 0, silently assigning the employee to
# company 0 and marking them as no longer existing.
emp_profile['avg_vote'] = emp_profile['avg_vote'].fillna(0)
emp_profile.head()

Unnamed: 0,employee,total_liked,total_disliked,total_comments,avg_comment_len,likes_received,dislikes_received,companyAlias,numVotes,stillExists,days_since_last_activity,avg_vote
0,0_-2,3,0,0.0,0.0,0.0,0.0,0,0,0,61,0.0
1,0_-4,4,2,22.0,54.0,116.0,46.0,0,0,0,133,0.0
2,0_-6,2,0,0.0,0.0,0.0,0.0,0,0,0,146,0.0
3,0_10,29,6,6.0,86.0,16.0,20.0,0,56,1,10,3.0
4,0_100,288,2,60.0,64.0,326.0,50.0,0,187,1,0,4.0


In [25]:
# Check missing data: list the emp_profile columns that still contain at least one null.
emp_profile.columns[emp_profile.isnull().any()]

Index([], dtype='object')

In [26]:
# Creating likeability, dislikeability: the share of likes (resp. dislikes)
# among all reactions received, rounded to 2 decimals. Computed column-wise
# instead of with a row-by-row apply; the result is the same.
likes = emp_profile['likes_received']
dislikes = emp_profile['dislikes_received']
reactions = likes + dislikes
# A numerator of 0 yields 0 (this also covers employees with no reactions at
# all, where the division would otherwise give NaN).
emp_profile['likeability'] = (likes / reactions).where(likes != 0, 0).round(2)
emp_profile['dislikeability'] = (dislikes / reactions).where(dislikes != 0, 0).round(2)

# Social index feature: weighted sum of votes cast, comments written and
# likes/dislikes given.
# NOTE(review): the 0.5 / 0.3 / 0.2 weights are hard-coded; their rationale is
# not recorded in this notebook.
# NOTE(review): numVotes is 0 for the previewed employees whose avg_vote was
# imputed - confirm it is the intended vote count here.
emp_profile['social_index'] = (
    0.5 * emp_profile['numVotes']
    + 0.3 * emp_profile['total_comments']
    + 0.2 * (emp_profile['total_liked'] + emp_profile['total_disliked'])
)

In [27]:
# Preview emp_profile with the likeability, dislikeability and social_index columns.
emp_profile.head()

Unnamed: 0,employee,total_liked,total_disliked,total_comments,avg_comment_len,likes_received,dislikes_received,companyAlias,numVotes,stillExists,days_since_last_activity,avg_vote,likeability,dislikeability,social_index
0,0_-2,3,0,0.0,0.0,0.0,0.0,0,0,0,61,0.0,0.0,0.0,0.6
1,0_-4,4,2,22.0,54.0,116.0,46.0,0,0,0,133,0.0,0.72,0.28,7.8
2,0_-6,2,0,0.0,0.0,0.0,0.0,0,0,0,146,0.0,0.0,0.0,0.4
3,0_10,29,6,6.0,86.0,16.0,20.0,0,56,1,10,3.0,0.44,0.56,36.8
4,0_100,288,2,60.0,64.0,326.0,50.0,0,187,1,0,4.0,0.87,0.13,169.5


In [28]:
# (rows, columns) of the employee feature table.
emp_profile.shape

(3549, 15)

In [29]:
# Write the employee-level features to emp_features.csv, omitting the row index.
emp_profile.to_csv("emp_features.csv", index=False)

In [30]:
# Company level features: aggregate the employee profiles per companyAlias.
company_rollup = emp_profile.groupby('companyAlias').agg({
    'employee': 'count',
    'total_comments': 'sum',
    'avg_comment_len': 'mean',
    'likes_received': 'sum',
    'dislikes_received': 'sum',
    'numVotes': 'sum',
    'avg_vote': 'mean',
})
company_profile = company_rollup.rename(columns={
    'employee': 'num_emp',
    'total_comments': 'com_total_comments',
    'avg_comment_len': 'com_avg_comment_len',
    'likes_received': 'com_total_likes',
    'dislikes_received': 'com_total_dislikes',
    'numVotes': 'com_total_votes',
    'avg_vote': 'com_avg_vote',
}).reset_index()
# Company average vote as a whole number; average comment length to 2 decimals.
company_profile['com_avg_vote'] = company_profile['com_avg_vote'].round()
company_profile['com_avg_comment_len'] = company_profile['com_avg_comment_len'].round(2)

In [31]:
# Preview the per-company aggregates.
company_profile.head()

Unnamed: 0,companyAlias,num_emp,com_total_comments,com_avg_comment_len,com_total_likes,com_total_dislikes,com_total_votes,com_avg_vote
0,0,303,10088.0,65.8,66454.0,17024.0,26384,3.0
1,1,14,348.0,23.14,120.0,72.0,400,3.0
2,2,142,2670.0,36.98,7988.0,1430.0,3446,1.0
3,3,112,2770.0,95.46,18310.0,3642.0,6262,3.0
4,4,119,1936.0,73.76,8134.0,1138.0,3341,3.0


In [32]:
# Creating master data: every employee row extended with its company's aggregates.
master_data = emp_profile.merge(company_profile, on='companyAlias', how='left')
master_data.head()

Unnamed: 0,employee,total_liked,total_disliked,total_comments,avg_comment_len,likes_received,dislikes_received,companyAlias,numVotes,stillExists,...,likeability,dislikeability,social_index,num_emp,com_total_comments,com_avg_comment_len,com_total_likes,com_total_dislikes,com_total_votes,com_avg_vote
0,0_-2,3,0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.6,303,10088.0,65.8,66454.0,17024.0,26384,3.0
1,0_-4,4,2,22.0,54.0,116.0,46.0,0,0,0,...,0.72,0.28,7.8,303,10088.0,65.8,66454.0,17024.0,26384,3.0
2,0_-6,2,0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0.4,303,10088.0,65.8,66454.0,17024.0,26384,3.0
3,0_10,29,6,6.0,86.0,16.0,20.0,0,56,1,...,0.44,0.56,36.8,303,10088.0,65.8,66454.0,17024.0,26384,3.0
4,0_100,288,2,60.0,64.0,326.0,50.0,0,187,1,...,0.87,0.13,169.5,303,10088.0,65.8,66454.0,17024.0,26384,3.0


In [33]:
# relative_happiness: the employee's average vote divided by the company's
# average vote, rounded to 2 decimals.
# com_avg_vote is rounded and can be 0; mask such denominators so the result
# is NaN (missing) rather than inf.
company_vote = master_data['com_avg_vote']
master_data['relative_happiness'] = (
    master_data['avg_vote'] / company_vote.where(company_vote != 0)
).round(2)

In [34]:
# Preview master_data including the new relative_happiness column.
master_data.head()

Unnamed: 0,employee,total_liked,total_disliked,total_comments,avg_comment_len,likes_received,dislikes_received,companyAlias,numVotes,stillExists,...,dislikeability,social_index,num_emp,com_total_comments,com_avg_comment_len,com_total_likes,com_total_dislikes,com_total_votes,com_avg_vote,relative_happiness
0,0_-2,3,0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.6,303,10088.0,65.8,66454.0,17024.0,26384,3.0,0.0
1,0_-4,4,2,22.0,54.0,116.0,46.0,0,0,0,...,0.28,7.8,303,10088.0,65.8,66454.0,17024.0,26384,3.0,0.0
2,0_-6,2,0,0.0,0.0,0.0,0.0,0,0,0,...,0.0,0.4,303,10088.0,65.8,66454.0,17024.0,26384,3.0,0.0
3,0_10,29,6,6.0,86.0,16.0,20.0,0,56,1,...,0.56,36.8,303,10088.0,65.8,66454.0,17024.0,26384,3.0,1.0
4,0_100,288,2,60.0,64.0,326.0,50.0,0,187,1,...,0.13,169.5,303,10088.0,65.8,66454.0,17024.0,26384,3.0,1.33


In [35]:
# Write the combined employee + company features to emp_com_features.csv, omitting the row index.
master_data.to_csv("emp_com_features.csv", index=False)