---
# Adding user-wise strengths features to the training data

In [33]:
# Loading Libraries
import pandas as pd
import numpy as np
from tabulate import tabulate

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split as ttsplit

import constants

# Value taken from the project's `constants` module (presumably the shared
# random seed -- confirm there).
# NOTE(review): no other cell visible in this notebook references it.
RANDOM_STATE = constants.RANDOM_STATE

In [43]:
# Read the training pairs from disk and show their dimensions.
# NOTE: constants.OUTPUT_FILE is also the path the final cell writes to,
# so the file loaded here is overwritten when the notebook is saved out.
df_train = pd.read_csv(filepath_or_buffer=constants.OUTPUT_FILE)
print(df_train.shape)
df_train.head(n=5)

(30916, 60)


Unnamed: 0,from-to,score,userA_id,userB_id,has_common_skills,n_common_skills,has_common_schools,n_common_schools,has_common_industry,n_common_industry,...,userA_strength_id_7,userA_strength_id_8,userB_strength_id_1,userB_strength_id_2,userB_strength_id_3,userB_strength_id_4,userB_strength_id_5,userB_strength_id_6,userB_strength_id_7,userB_strength_id_8
0,4769697-6308211,3.0,4769697.0,6308211.0,0.0,-1.0,0.0,-1.0,0.0,0.0,...,1.0,1.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
1,6275668-6308211,2.0,6275668.0,6308211.0,1.0,-2.0,0.0,-1.0,0.0,0.0,...,0.0,3.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
2,6307254-6308211,3.0,6307254.0,6308211.0,0.0,-1.0,0.0,-1.0,0.0,0.0,...,1.0,3.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
3,1208644-6308211,2.0,1208644.0,6308211.0,0.0,0.0,0.0,-1.0,1.0,1.0,...,0.0,0.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
4,6275807-6308211,3.0,6275807.0,6308211.0,1.0,-2.0,0.0,-1.0,1.0,1.0,...,0.0,1.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0


## Loading the lookup DataFrame


In [35]:
# Read the per-user strengths table; it is the lookup from which the
# userA_/userB_ strength features are built.
df_strengths = pd.read_csv(filepath_or_buffer=constants.DATA_PATH + 'user_strengths.csv')
df_strengths.head(n=5)

Unnamed: 0,user_id,strength_id_1,strength_id_2,strength_id_3,strength_id_4,strength_id_5,strength_id_6,strength_id_7,strength_id_8
0,1,8,5,2,17,9,12,3,8
1,41,1,2,1,1,1,1,1,1
2,51,0,0,1,2,0,0,0,2
3,151,5,2,0,7,1,2,1,4
4,161,0,0,1,1,0,1,0,0


In [36]:
# Clean the strengths lookup before it is merged onto the training pairs.
# The result is reassigned rather than mutated with inplace=True so the
# cleaning steps read as one chain.
df_strengths = (
    df_strengths
    .drop_duplicates()    # remove rows that are exact repeats
    .dropna(axis=0)       # remove rows containing any missing value
    # The feature merges join on user_id: a user that is still listed more
    # than once (rows differing in some column) would duplicate every
    # training pair that user appears in, so keep a single row per user.
    .drop_duplicates(subset='user_id', keep='first')
)

In [37]:
# Spot-check: show the lookup row(s) for a single user (user_id 151)
df_strengths.loc[df_strengths['user_id'].eq(151)]

Unnamed: 0,user_id,strength_id_1,strength_id_2,strength_id_3,strength_id_4,strength_id_5,strength_id_6,strength_id_7,strength_id_8
3,151,5,2,0,7,1,2,1,4


### Creating features

In [38]:
def add_user_strengths(pairs, strengths, user):
    """Attach one user's strength columns to the training pairs.

    Parameters
    ----------
    pairs : pd.DataFrame
        Training pairs; must contain the key column ``<user>_id``.
    strengths : pd.DataFrame
        Lookup table with a ``user_id`` column plus the strength columns.
    user : str
        Side of the pair to enrich, used as column prefix
        (``'userA'`` or ``'userB'``).

    Returns
    -------
    pd.DataFrame
        ``pairs`` with the lookup columns, prefixed by ``<user>_``, appended.
        Only rows of ``pairs`` are kept, in their original order; users that
        are missing from ``strengths`` get NaN in the new columns.
    """
    key = user + '_id'
    lookup = (
        strengths
        .add_prefix(user + '_')
        .rename(columns={user + '_user_id': key})
    )
    # The training file is read from and written back to the same path, so a
    # re-run may load a table that already holds these columns. Drop them
    # first; otherwise merge would emit conflicting `_x` / `_y` copies.
    stale = [col for col in lookup.columns if col != key and col in pairs.columns]
    # A left join keeps exactly the training pairs. The previous outer join
    # also added one row per lookup user absent from the pairs, which then
    # had to be removed again by dropping rows without a score.
    return pairs.drop(columns=stale).merge(lookup, how='left', on=key)


# Add the strengths of userA, then of userB, to every training pair
for user in ('userA', 'userB'):
    df_train = add_user_strengths(df_train, df_strengths, user)

In [39]:
# Peek at the first rows to check the strength columns of both users
df_train.head(n=5)

Unnamed: 0,from-to,score,userA_id,userB_id,has_common_skills,n_common_skills,has_common_schools,n_common_schools,has_common_industry,n_common_industry,...,userA_strength_id_7,userA_strength_id_8,userB_strength_id_1,userB_strength_id_2,userB_strength_id_3,userB_strength_id_4,userB_strength_id_5,userB_strength_id_6,userB_strength_id_7,userB_strength_id_8
0,4769697-6308211,3.0,4769697.0,6308211.0,0.0,-1.0,0.0,-1.0,0.0,0.0,...,1.0,1.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
1,6275668-6308211,2.0,6275668.0,6308211.0,1.0,-2.0,0.0,-1.0,0.0,0.0,...,0.0,3.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
2,6307254-6308211,3.0,6307254.0,6308211.0,0.0,-1.0,0.0,-1.0,0.0,0.0,...,1.0,3.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
3,1208644-6308211,2.0,1208644.0,6308211.0,0.0,0.0,0.0,-1.0,1.0,1.0,...,0.0,0.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
4,6275807-6308211,3.0,6275807.0,6308211.0,1.0,-2.0,0.0,-1.0,1.0,1.0,...,0.0,1.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0


In [40]:
# Dealing with missing values: keep only the rows that have a score,
# then replace every remaining NaN with 0
df_train = df_train[df_train['score'].notna()].fillna(0)

In [41]:
# Peek at the first rows again now that missing values have been handled
df_train.head(n=5)

Unnamed: 0,from-to,score,userA_id,userB_id,has_common_skills,n_common_skills,has_common_schools,n_common_schools,has_common_industry,n_common_industry,...,userA_strength_id_7,userA_strength_id_8,userB_strength_id_1,userB_strength_id_2,userB_strength_id_3,userB_strength_id_4,userB_strength_id_5,userB_strength_id_6,userB_strength_id_7,userB_strength_id_8
0,4769697-6308211,3.0,4769697.0,6308211.0,0.0,-1.0,0.0,-1.0,0.0,0.0,...,1.0,1.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
1,6275668-6308211,2.0,6275668.0,6308211.0,1.0,-2.0,0.0,-1.0,0.0,0.0,...,0.0,3.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
2,6307254-6308211,3.0,6307254.0,6308211.0,0.0,-1.0,0.0,-1.0,0.0,0.0,...,1.0,3.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
3,1208644-6308211,2.0,1208644.0,6308211.0,0.0,0.0,0.0,-1.0,1.0,1.0,...,0.0,0.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0
4,6275807-6308211,3.0,6275807.0,6308211.0,1.0,-2.0,0.0,-1.0,1.0,1.0,...,0.0,1.0,1.0,3.0,3.0,15.0,6.0,14.0,0.0,8.0


--- 
Writing to file
---

In [42]:
# Persist the enriched training table as CSV, with a header row and without
# the DataFrame index.
# NOTE: the target is the same constants.OUTPUT_FILE the loading cell read.
df_train.to_csv(path_or_buf=constants.OUTPUT_FILE, header=True, index=False)