In [1]:
import warnings
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Display configuration: show every column and up to 1000 rows when a frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000
# NOTE(review): this silences *all* warnings for the whole notebook, including pandas
# deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Load the preprocessed tables (education, experience, skills, languages) and the
# train / test frames in one pass; the order of the paths matches the order of the names.
edu, exp, skill, lang, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'PREPROCESSED_DATA_2/education_SECOND_detailed_fe.csv',
        'PREPROCESSED_DATA_2/exp_final_no_external.csv',
        'PREPROCESSED_DATA_2/skills_SECOND_fe_w_ext_data.csv',
        'PREPROCESSED_DATA_2/lang_SECOND_fe_no_external.csv',
        'PREPROCESSED_DATA_2/train_final.csv',
        'PREPROCESSED_DATA_2/test_final.csv',
    )
)

***
# <font color = "RoyalBlue">Education</font>
***



__<font color = "darkgreen">Number of schools added to Linkedin system</font>__

In [3]:
# Remove exact duplicate education rows so a repeated entry is not counted twice.
edu = edu.drop_duplicates()

# Number of (non-null) school names per user.
# The column is selected by name instead of indexing the groupby with a tuple
# (`['school_name','degree']`), which newer pandas versions reject, and the unused
# `degree` count is no longer computed.
edu_counts = (
    edu.groupby('user_id')['school_name']
       .count()
       .to_frame('school_count')
)

edu_counts

Unnamed: 0_level_0,school_count
user_id,Unnamed: 1_level_1
0,3
1,2
2,2
3,1
4,1
...,...
66269,3
66270,4
66271,3
66272,3


__<font color = "darkgreen">Users add an average of two schools. New features can be obtained by taking the name, degree, and field information of the last two added schools.</font>__

In [4]:
# Summary statistics of schools per user, printed in the order: mode, median, max, min, mean.
school_counts = edu_counts['school_count']
for statistic in (
    school_counts.mode()[0],
    school_counts.median(),
    school_counts.max(),
    school_counts.min(),
    school_counts.mean(),
):
    print(statistic)

2
2.0
9
1
2.114680629536298


In [5]:
# Position of every education row within its user, in file order (0 = first row listed).
# NOTE(review): "last" / "previous" therefore mean the first and second rows per user —
# this presumes the CSV lists the most recent school first; confirm against the preprocessing.
edu_position = edu.groupby('user_id').cumcount()

# Pick the first / second row per user and index them by user_id explicitly.
# This does not depend on how `GroupBy.nth` indexes its result (it differs between
# pandas versions), so the index joins on user_id further down stay aligned.
edu_last = edu[edu_position == 0].set_index('user_id').sort_index()
edu_previous = edu[edu_position == 1].set_index('user_id').sort_index()

edu_new = pd.DataFrame(index=edu_last.index)

# (feature prefix, source column) pairs; users with a single row get NaN in `*_previous`.
for prefix, source_column in (('school', 'school_name'),
                              ('degree', 'degree'),
                              ('fos', 'fields_of_study')):
    edu_new[prefix + '_last'] = edu_last[source_column]
    edu_new[prefix + '_previous'] = edu_previous[source_column]

edu_new

Unnamed: 0_level_0,school_last,school_previous,degree_last,degree_previous,fos_last,fos_previous
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Anadolu University,non valid or other,non valid or unknown,non valid or unknown,non valid or unknown,non valid or unknown
1,non valid or other,non valid or other,master,bachelor,software engineering,software engineering
2,non valid or other,High School,non valid or unknown,high school,computer engineering,non valid or unknown
3,non valid or other,,bachelor,,software engineering,
4,non valid or other,,bachelor,,software engineering,
...,...,...,...,...,...,...
66269,Bogazici University,Bogazici University,master,bachelor,business administration,computer engineering
66270,Bogazici University,non valid or other,master,non valid or unknown,physics,non valid or unknown
66271,Bogazici University,Bogazici University,master,bachelor,non valid or unknown,non valid or unknown
66272,Yildiz Technical University,non valid or other,bachelor,non valid or unknown,computer engineering,computer science


__<font color = "darkgreen">We dropped the columns containing start and end dates of schools as they had too many missing values to be filled, but I decided to create a feature that estimates the duration of education based on the degrees obtained.</font>__

In [6]:
# Nominal duration (in years) assigned to each degree. Degrees missing from this mapping
# become NaN and are skipped by the sum, so they contribute 0 years.
DEGREE_YEARS = {'high school' : 4, 'associate' : 2, 'bachelor' : 4, 'master' : 2, 'phd' : 4}

# Build the mapped column with `.assign` on a fresh frame instead of assigning to a
# slice of `edu` (which triggers pandas' SettingWithCopy pattern).
edu_years = (
    edu[['user_id', 'degree']]
    .assign(degree=lambda frame: frame['degree'].map(DEGREE_YEARS))
    .groupby('user_id')
    .sum()
    .rename(columns={'degree' : 'years_of_study'})
)

# Attach the estimated years of study and the per-user school count (both indexed by user_id).
edu_new = edu_new.join(edu_years)
edu_new = edu_new.join(edu_counts)

edu_new

Unnamed: 0_level_0,school_last,school_previous,degree_last,degree_previous,fos_last,fos_previous,years_of_study,school_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Anadolu University,non valid or other,non valid or unknown,non valid or unknown,non valid or unknown,non valid or unknown,0.0,3
1,non valid or other,non valid or other,master,bachelor,software engineering,software engineering,6.0,2
2,non valid or other,High School,non valid or unknown,high school,computer engineering,non valid or unknown,4.0,2
3,non valid or other,,bachelor,,software engineering,,4.0,1
4,non valid or other,,bachelor,,software engineering,,4.0,1
...,...,...,...,...,...,...,...,...
66269,Bogazici University,Bogazici University,master,bachelor,business administration,computer engineering,6.0,3
66270,Bogazici University,non valid or other,master,non valid or unknown,physics,non valid or unknown,10.0,4
66271,Bogazici University,Bogazici University,master,bachelor,non valid or unknown,non valid or unknown,10.0,3
66272,Yildiz Technical University,non valid or other,bachelor,non valid or unknown,computer engineering,computer science,4.0,3


__<font color = "darkgreen">The sum of points for obtained degrees.</font>__

In [7]:
# Ordinal points per degree level; the per-user sum becomes `degree_sum`.
DEGREE_POINTS = {'non valid or unknown' : 0, 'student' : 1, 'high school' : 2,
                 'associate' : 3, 'bachelor' : 4, 'master' : 5, 'phd' : 6}

# Build the mapped column with `.assign` on a fresh frame instead of assigning to a
# slice of `edu` (which triggers pandas' SettingWithCopy pattern).
edu_degrees = (
    edu[['user_id', 'degree']]
    .assign(degree=lambda frame: frame['degree'].map(DEGREE_POINTS))
    .groupby('user_id')
    .sum()
    .rename(columns={'degree' : 'degree_sum'})
)

edu_new = edu_new.join(edu_degrees)
edu_new

Unnamed: 0_level_0,school_last,school_previous,degree_last,degree_previous,fos_last,fos_previous,years_of_study,school_count,degree_sum
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Anadolu University,non valid or other,non valid or unknown,non valid or unknown,non valid or unknown,non valid or unknown,0.0,3,0
1,non valid or other,non valid or other,master,bachelor,software engineering,software engineering,6.0,2,9
2,non valid or other,High School,non valid or unknown,high school,computer engineering,non valid or unknown,4.0,2,2
3,non valid or other,,bachelor,,software engineering,,4.0,1,4
4,non valid or other,,bachelor,,software engineering,,4.0,1,4
...,...,...,...,...,...,...,...,...,...
66269,Bogazici University,Bogazici University,master,bachelor,business administration,computer engineering,6.0,3,9
66270,Bogazici University,non valid or other,master,non valid or unknown,physics,non valid or unknown,10.0,4,13
66271,Bogazici University,Bogazici University,master,bachelor,non valid or unknown,non valid or unknown,10.0,3,15
66272,Yildiz Technical University,non valid or other,bachelor,non valid or unknown,computer engineering,computer science,4.0,3,4


__<font color = "darkgreen">An academic score feature, obtained by multiplying the sum of the degree points by the total number of schools attended, and a set of features that flag whether the user's last two schools / fields of study / degrees differ. If the user has entered only one school in the system, there can be no change, so the change flags are left null for that user.</font>__

In [8]:
# Academic score: number of schools times the summed degree points.
edu_new['academic_points'] = edu_new['school_count'] * edu_new['degree_sum']

# Change flags between the last and previous entry:
# NaN when there is no previous entry, 0 when both values are equal, 1 otherwise.
for field in ('fos', 'school', 'degree'):
    last_value = edu_new[field + '_last']
    previous_value = edu_new[field + '_previous']
    edu_new[field + '_change'] = np.where(
        previous_value.isnull(),
        np.nan,
        np.where(last_value == previous_value, 0, 1),
    )

# Turn user_id back into a regular column so it can be used as a merge key.
edu_new = edu_new.reset_index()
edu_new

Unnamed: 0,user_id,school_last,school_previous,degree_last,degree_previous,fos_last,fos_previous,years_of_study,school_count,degree_sum,academic_points,fos_change,school_change,degree_change
0,0,Anadolu University,non valid or other,non valid or unknown,non valid or unknown,non valid or unknown,non valid or unknown,0.0,3,0,0,0.0,1.0,0.0
1,1,non valid or other,non valid or other,master,bachelor,software engineering,software engineering,6.0,2,9,18,0.0,0.0,1.0
2,2,non valid or other,High School,non valid or unknown,high school,computer engineering,non valid or unknown,4.0,2,2,4,1.0,1.0,1.0
3,3,non valid or other,,bachelor,,software engineering,,4.0,1,4,4,,,
4,4,non valid or other,,bachelor,,software engineering,,4.0,1,4,4,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66266,66269,Bogazici University,Bogazici University,master,bachelor,business administration,computer engineering,6.0,3,9,27,1.0,0.0,1.0
66267,66270,Bogazici University,non valid or other,master,non valid or unknown,physics,non valid or unknown,10.0,4,13,52,1.0,1.0,1.0
66268,66271,Bogazici University,Bogazici University,master,bachelor,non valid or unknown,non valid or unknown,10.0,3,15,45,0.0,0.0,1.0
66269,66272,Yildiz Technical University,non valid or other,bachelor,non valid or unknown,computer engineering,computer science,4.0,3,4,12,1.0,1.0,1.0


In [9]:
# Left-join the education features onto both splits; users without education rows get NaN.
train, test = (
    split.merge(edu_new, how='left', on='user_id')
    for split in (train, test)
)

***
# <font color = "royalblue">Work Experiences</font>
***

In [10]:
# Drop exact duplicate experience rows and give the location columns descriptive names.
exp = (
    exp
    .drop_duplicates()
    .rename(columns={'location' : 'work_city', 'location2' : 'work_country'})
)
exp

Unnamed: 0,user_id,company_id,work_city,start_year_month,work_country
0,53442,2651,İstanbul,201509,Turkey
1,34558,815,İstanbul,201210,Turkey
2,63761,26354,Foreign or Non-Valid,200010,Unknown or Non-Valid
3,10738,89,Foreign or Non-Valid,201610,Unknown or Non-Valid
4,8711,3113,İstanbul,201801,Turkey
...,...,...,...,...,...
160731,5368,6126,Foreign or Non-Valid,201806,Unknown or Non-Valid
160732,22180,15065,Kastamonu,201205,Turkey
160733,55822,25076,Foreign or Non-Valid,201506,Unknown or Non-Valid
160734,3679,1414,Ankara,201807,Turkey


__<font color = "darkgreen">Number of jobs added to the LinkedIn system</font>__

In [11]:
# Jobs per user: count of non-null company ids among the user's experience rows.
job_count = (
    exp.groupby('user_id')[['company_id']]
       .count()
       .rename(columns={'company_id' : 'job_count'})
)
job_count

Unnamed: 0_level_0,job_count
user_id,Unnamed: 1_level_1
0,1
2,3
5,1
7,4
10,2
...,...
66269,5
66270,2
66271,9
66272,5


__<font color = "darkgreen">Users add about three jobs on average, but users with a single work experience are the most common group (the mode is 1). The experiences will likewise be split into the last job and the previous job.</font>__

In [12]:
# Summary statistics of jobs per user, printed in the order: mode, max, mean, median.
jobs_per_user = job_count['job_count']
for statistic in (
    jobs_per_user.mode()[0],
    jobs_per_user.max(),
    jobs_per_user.mean(),
    jobs_per_user.median(),
):
    print(statistic)

1
25
3.0289800384891135
3.0


__<font color = "darkgreen">Last work experience</font>__

In [13]:
# Most recent experience per user: sort by start month (newest first), keep each user's
# first row, and index the result by user_id.
# The row is selected with `cumcount` + `set_index` rather than `GroupBy.nth(0)`, whose
# result index differs between pandas versions; the later index join on user_id
# needs a user_id index.
exp_sorted = exp.sort_values(by='start_year_month',ascending=False)
exp_sorted = (
    exp_sorted[exp_sorted.groupby('user_id').cumcount() == 0]
    .set_index('user_id')
    .sort_index()
    .rename(columns={'company_id' : 'last_company_id', 'work_city' : 'last_work_city',
                     'start_year_month' : 'last_start_year_month', 'work_country' : 'last_work_country'})
)
exp_sorted

Unnamed: 0_level_0,last_company_id,last_work_city,last_start_year_month,last_work_country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,Foreign or Non-Valid,200509,Unknown or Non-Valid
2,10,Mersin,201806,Turkey
5,15,İstanbul,201706,Turkey
7,20,Elazığ,201812,Turkey
10,26,İstanbul,201805,Turkey
...,...,...,...,...
66269,264,İstanbul,201812,Turkey
66270,4046,İstanbul,201812,Turkey
66271,890,İstanbul,201611,Turkey
66272,19477,İstanbul,201812,Turkey


__<font color = "darkgreen">Previous work experience</font>__

In [14]:
# Second most recent experience per user: same ordering as for the last job, keeping each
# user's second row (users with a single experience are absent from the result).
# The row is selected with `cumcount` + `set_index` rather than `GroupBy.nth(1)`, whose
# result index differs between pandas versions; the later index join on user_id
# needs a user_id index.
exp_sorted2 = exp.sort_values(by='start_year_month',ascending=False)
exp_sorted2 = (
    exp_sorted2[exp_sorted2.groupby('user_id').cumcount() == 1]
    .set_index('user_id')
    .sort_index()
    .rename(columns={'company_id' : 'previous_company_id', 'work_city' : 'previous_work_city',
                     'start_year_month' : 'previous_start_year_month', 'work_country' : 'previous_work_country'})
)
exp_sorted2

Unnamed: 0_level_0,previous_company_id,previous_work_city,previous_start_year_month,previous_work_country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,7,Elazığ,201706,Turkey
7,21,Foreign or Non-Valid,201708,Macedonia [FYROM]
10,27,Elazığ,201308,Turkey
11,30,İstanbul,201708,Turkey
12,34,Foreign or Non-Valid,201607,Unknown or Non-Valid
...,...,...,...,...
66269,264,İstanbul,201208,Turkey
66270,4046,Foreign or Non-Valid,201411,Unknown or Non-Valid
66271,5549,İstanbul,201511,Turkey
66272,944,İstanbul,201812,Turkey


In [15]:
# Put last and previous job side by side (index join), then expose user_id as a column.
# Users without a previous job get NaN in all `previous_*` columns.
exp_sorted = exp_sorted.join(exp_sorted2).reset_index()

exp_sorted

Unnamed: 0,user_id,last_company_id,last_work_city,last_start_year_month,last_work_country,previous_company_id,previous_work_city,previous_start_year_month,previous_work_country
0,0,0,Foreign or Non-Valid,200509,Unknown or Non-Valid,,,,
1,2,10,Mersin,201806,Turkey,7.0,Elazığ,201706.0,Turkey
2,5,15,İstanbul,201706,Turkey,,,,
3,7,20,Elazığ,201812,Turkey,21.0,Foreign or Non-Valid,201708.0,Macedonia [FYROM]
4,10,26,İstanbul,201805,Turkey,27.0,Elazığ,201308.0,Turkey
...,...,...,...,...,...,...,...,...,...
52997,66269,264,İstanbul,201812,Turkey,264.0,İstanbul,201208.0,Turkey
52998,66270,4046,İstanbul,201812,Turkey,4046.0,Foreign or Non-Valid,201411.0,Unknown or Non-Valid
52999,66271,890,İstanbul,201611,Turkey,5549.0,İstanbul,201511.0,Turkey
53000,66272,19477,İstanbul,201812,Turkey,944.0,İstanbul,201812.0,Turkey


__<font color = "darkgreen">Company size, approximated by the number of work-experience records each company has in this dataset</font>__

In [16]:
# Number of experience rows per company in `exp`, used as a proxy for company size.
company_counts = exp['company_id'].value_counts()

# Name the key column explicitly instead of renaming the default 'index' column created by
# `reset_index()`: that default name is only produced when the value_counts index is unnamed,
# which is not the case in every pandas version.
company_size = (
    company_counts
    .rename_axis('last_company_id')
    .reset_index(name='last_company_size')
)
company_size2 = (
    company_counts
    .rename_axis('previous_company_id')
    .reset_index(name='previous_company_size')
)

# Attach the size of the last and of the previous company; rows without a previous
# company keep NaN in `previous_company_size`.
exp_sorted = exp_sorted.merge(company_size,how='left',on='last_company_id')
exp_sorted = exp_sorted.merge(company_size2,how='left',on='previous_company_id')

exp_sorted

Unnamed: 0,user_id,last_company_id,last_work_city,last_start_year_month,last_work_country,previous_company_id,previous_work_city,previous_start_year_month,previous_work_country,last_company_size,previous_company_size
0,0,0,Foreign or Non-Valid,200509,Unknown or Non-Valid,,,,,5,
1,2,10,Mersin,201806,Turkey,7.0,Elazığ,201706.0,Turkey,6,4.0
2,5,15,İstanbul,201706,Turkey,,,,,3,
3,7,20,Elazığ,201812,Turkey,21.0,Foreign or Non-Valid,201708.0,Macedonia [FYROM],6,138.0
4,10,26,İstanbul,201805,Turkey,27.0,Elazığ,201308.0,Turkey,2060,47.0
...,...,...,...,...,...,...,...,...,...,...,...
52997,66269,264,İstanbul,201812,Turkey,264.0,İstanbul,201208.0,Turkey,746,746.0
52998,66270,4046,İstanbul,201812,Turkey,4046.0,Foreign or Non-Valid,201411.0,Unknown or Non-Valid,34,34.0
52999,66271,890,İstanbul,201611,Turkey,5549.0,İstanbul,201511.0,Turkey,699,21.0
53000,66272,19477,İstanbul,201812,Turkey,944.0,İstanbul,201812.0,Turkey,8,888.0


__<font color = "darkgreen">Similarly, a new feature will be created to capture the company city-country changes between the previous and new job of individuals.</font>__

__<font color = "darkgreen"> Additionally, a feature that compares the size of the companies by user's previous job and current job.</font>__

__<font color = "darkgreen">And in addition to all of these, a new feature that keeps track of the difference between the start date of the current job and the start date of the previous job, meaning how many days were spent in the previous job, approximately.</font>__

In [17]:
def _flag_or_nan(previous_missing, is_one):
    """Return 1 where `is_one` is True and 0 elsewhere, with NaN wherever `previous_missing` is True."""
    return np.where(previous_missing, np.nan, np.where(is_one, 1, 0))


# Did the company change between the previous and the last job?
exp_sorted['company_id_change'] = _flag_or_nan(
    exp_sorted['previous_company_id'].isnull(),
    exp_sorted['last_company_id'] != exp_sorted['previous_company_id'],
)

# Did the city/country combination change? Only NaN when both previous fields are missing.
last_place = exp_sorted['last_work_city'] + exp_sorted['last_work_country']
previous_place = exp_sorted['previous_work_city'] + exp_sorted['previous_work_country']
exp_sorted['work_city-country_change'] = _flag_or_nan(
    exp_sorted['previous_work_city'].isnull() & exp_sorted['previous_work_country'].isnull(),
    last_place != previous_place,
)

# Is the last company strictly larger than the previous one?
exp_sorted['company_size_increase'] = _flag_or_nan(
    exp_sorted['previous_company_size'].isnull(),
    exp_sorted['last_company_size'] > exp_sorted['previous_company_size'],
)

# Days between the start of the previous job and the start of the last job.
last_start = pd.to_datetime(exp_sorted['last_start_year_month'], format='%Y%m')
previous_start = pd.to_datetime(exp_sorted['previous_start_year_month'], format='%Y%m')
exp_sorted['previous_job_days'] = np.where(
    exp_sorted['previous_start_year_month'].isnull(),
    np.nan,
    (last_start - previous_start).dt.days,
)

exp_sorted

Unnamed: 0,user_id,last_company_id,last_work_city,last_start_year_month,last_work_country,previous_company_id,previous_work_city,previous_start_year_month,previous_work_country,last_company_size,previous_company_size,company_id_change,work_city-country_change,company_size_increase,previous_job_days
0,0,0,Foreign or Non-Valid,200509,Unknown or Non-Valid,,,,,5,,,,,
1,2,10,Mersin,201806,Turkey,7.0,Elazığ,201706.0,Turkey,6,4.0,1.0,1.0,1.0,365.0
2,5,15,İstanbul,201706,Turkey,,,,,3,,,,,
3,7,20,Elazığ,201812,Turkey,21.0,Foreign or Non-Valid,201708.0,Macedonia [FYROM],6,138.0,1.0,1.0,0.0,487.0
4,10,26,İstanbul,201805,Turkey,27.0,Elazığ,201308.0,Turkey,2060,47.0,1.0,1.0,1.0,1734.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52997,66269,264,İstanbul,201812,Turkey,264.0,İstanbul,201208.0,Turkey,746,746.0,0.0,0.0,0.0,2313.0
52998,66270,4046,İstanbul,201812,Turkey,4046.0,Foreign or Non-Valid,201411.0,Unknown or Non-Valid,34,34.0,0.0,1.0,0.0,1491.0
52999,66271,890,İstanbul,201611,Turkey,5549.0,İstanbul,201511.0,Turkey,699,21.0,1.0,0.0,1.0,366.0
53000,66272,19477,İstanbul,201812,Turkey,944.0,İstanbul,201812.0,Turkey,8,888.0,1.0,0.0,0.0,0.0


In [18]:
# Left-join the last/previous job features onto both splits; users without experience rows get NaN.
train, test = (
    split.merge(exp_sorted, how='left', on='user_id')
    for split in (train, test)
)

__<font color = "darkgreen">Creating a table for the new features I will create using the total number of jobs for users and the difference between the start date of the oldest job and 2019.</font>__

In [19]:
# Earliest and latest job start month per user, parsed from YYYYMM into timestamps.
start_months = exp.groupby('user_id')['start_year_month']
average_job_days = pd.DataFrame({
    'first_starting_time': pd.to_datetime(start_months.min(), format='%Y%m'),
    'last_starting_time': pd.to_datetime(start_months.max(), format='%Y%m'),
})

# Time elapsed between each of those starts and the reference date 2019-01-01.
reference_date = datetime(2019, 1, 1)
average_job_days['_2019_first_diff'] = reference_date - average_job_days['first_starting_time']
average_job_days['days_until_2019'] = reference_date - average_job_days['last_starting_time']

average_job_days

Unnamed: 0_level_0,first_starting_time,last_starting_time,_2019_first_diff,days_until_2019
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2005-09-01,2005-09-01,4870 days,4870 days
2,2016-12-01,2018-06-01,761 days,214 days
5,2017-06-01,2017-06-01,579 days,579 days
7,2016-07-01,2018-12-01,914 days,31 days
10,2013-08-01,2018-05-01,1979 days,245 days
...,...,...,...,...
66269,2004-04-01,2018-12-01,5388 days,31 days
66270,2014-11-01,2018-12-01,1522 days,31 days
66271,2001-11-01,2016-11-01,6270 days,791 days
66272,2012-07-01,2018-12-01,2375 days,31 days


__<font color = "darkgreen">A new feature that stores how many days a user stays in a job on average: (days between the first job start and 2019-01-01) / (number of jobs added).</font>__

__<font color = "darkgreen">By using days left to 2019;</font>__

* __<font color = "darkgreen">Firstly I will calculate (average staying days - days left to 2019), which means roughly how many days on average a person stays in a job after 2019.</font>__
* __<font color = "darkgreen">After calculating it, we mark a new feature indicating whether this number is less than 365 or not. If it is less than 365, we mark it as 1, otherwise as 0. In other words, we can say that if the average length of time spent at work after entering 2019 is less than 365 days, the user is likely to change jobs, and this feature captures that likelihood.
</font>__

In [20]:
# Convert both timedeltas to whole days.
for day_column in ('_2019_first_diff', 'days_until_2019'):
    average_job_days[day_column] = average_job_days[day_column].dt.days

# Add the number of jobs per user (index join on user_id).
average_job_days = average_job_days.join(job_count)

# Average days per job = days since the first job start / number of jobs.
average_job_days['average_staying_days'] = (
    average_job_days['_2019_first_diff'] / average_job_days['job_count']
)

# Flag users whose average stay exceeds the days since their last job start by less than 365.
stay_margin = average_job_days['average_staying_days'] - average_job_days['days_until_2019']
average_job_days['may_move'] = np.where(stay_margin < 365, 1, 0)

# Keep the first start as the raw YYYYMM value rather than a timestamp; the last start is not kept.
average_job_days['first_starting_time'] = exp.groupby('user_id')['start_year_month'].min()

average_job_days = average_job_days.reset_index().drop(columns=['last_starting_time'])
average_job_days

Unnamed: 0,user_id,first_starting_time,_2019_first_diff,days_until_2019,job_count,average_staying_days,may_move
0,0,200509,4870,4870,1,4870.000000,1
1,2,201612,761,214,3,253.666667,1
2,5,201706,579,579,1,579.000000,1
3,7,201607,914,31,4,228.500000,1
4,10,201308,1979,245,2,989.500000,0
...,...,...,...,...,...,...,...
52997,66269,200404,5388,31,5,1077.600000,0
52998,66270,201411,1522,31,2,761.000000,0
52999,66271,200111,6270,791,9,696.666667,1
53000,66272,201207,2375,31,5,475.000000,0


In [21]:
# Left-join the tenure features onto both splits; users without experience rows get NaN.
train, test = (
    split.merge(average_job_days, how='left', on='user_id')
    for split in (train, test)
)

***
# <font color = "royalblue">Skills</font>
***

__<font color = "darkgreen">Number of skills added to Linkedin</font>__

In [22]:
# Drop exact duplicate skill rows, then count the non-null skills listed by each user.
skill = skill.drop_duplicates()

skill_count = (
    skill.groupby('user_id')['skill']
         .count()
         .to_frame('skill_count')
)
skill_count

Unnamed: 0_level_0,skill_count
user_id,Unnamed: 1_level_1
1,2
2,6
3,3
5,11
6,6
...,...
66269,31
66270,8
66271,47
66272,36


In [23]:
# Remove helper columns that are not used below and rename the yearly top-skill flags.
skill = (
    skill
    .drop(columns=['skill_binned', 'skill_null', 'has_any_top_skill'])
    .rename(columns={'is_2018_top_skill' : 'has_any_2018_top_skill',
                     'is_2019_top_skill' : 'has_any_2019_top_skill',
                     'is_2020_top_skill' : 'has_any_2020_top_skill'})
)

# The 200 most frequent skill names, most frequent first.
all_skills = skill['skill'].value_counts().head(200).index.tolist()

all_skills

['Java',
 'JavaScript',
 'SQL',
 'C#',
 'HTML',
 'CSS',
 'C++',
 'Microsoft SQL Server',
 'C',
 'Microsoft Office',
 'Python',
 'MySQL',
 'Go',
 'Linux',
 'Software Development',
 'HTML5',
 'jQuery',
 'ASP.NET',
 'English',
 '.NET',
 'ASP.NET MVC',
 'PHP',
 'Web Development',
 'Software development',
 'OOP (Object Oriented Programming)',
 'Visual Studio',
 'matlab',
 'XML',
 'React.js',
 'Microsoft Excel',
 'Web Services',
 'scrum',
 'engineering',
 'JSON',
 'Android',
 'PL/SQL',
 'Project Management',
 'Spring Framework',
 'Management',
 'JIRA',
 'PostgreSQL',
 'Project management',
 'Software Engineering',
 'Android Development',
 'OOP',
 'hibernate',
 'Machine Learning',
 'Object Oriented Programming (OOP)',
 'Eclipse',
 'AJAX',
 'Web Applications',
 'Node.js',
 'T-SQL',
 'Photoshop',
 'Object Oriented Design',
 'Teamwork',
 'Docker',
 'MongoDB',
 'Agile Methodologies',
 'Microsoft Word',
 'databases',
 'AutoCAD',
 'Software Project Management',
 'Entity Framework',
 'Algorithms',
 

__<font color = "darkgreen">Converting the 200 most frequent skills to dummy variables (all remaining skills are grouped under `other`)</font>__

In [24]:
# Keep skill names that are in `all_skills` and collapse everything else (including missing
# values) into 'other'. `isin` + `where` does this in one vectorised pass instead of a
# Python-level list lookup for every row.
top_skill_or_other = skill['skill'].where(skill['skill'].isin(all_skills), 'other')

# One indicator column per retained skill name (plus 'other').
dummies = pd.get_dummies(top_skill_or_other)

# Append the indicator columns to the row-level skill table.
skill_first200 = pd.concat([skill, dummies], axis=1)

skill_first200

Unnamed: 0,user_id,skill,has_any_2018_top_skill,has_any_2019_top_skill,has_any_2020_top_skill,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Methods,Agile Project Management,Algorithms,Amazon Web Services (AWS),Analytical Skills,Android,Android Application Development,Android Development,Angular,AngularJS,Arduino,Artificial intelligence,AutoCAD,Back-End Web Development,Bootstrap,Business Analysis,Business Development,Business Intelligence,Business Strategy,C,C (Programming Language),C#,C++,CRM,CSS,CSS3,Cascading Style Sheets (CSS),Cisco Technologies,Cloud Computing,Communication,Computer Science,Data Analysis,Data Science,Data Structures,Data analysis,Database Design,Design Patterns,DevOps,Django,Docker,Eclipse,Embedded Systems,English,Entity Framework,Firewalls,Front-end Development,Game Development,Go,HANDLE,HTML,HTML5,IIS,Information Technology,JIRA,JSF,JSON,JSP,JUnit,Java,Java Enterprise Edition,JavaScript,Jenkins,Kotlin,Kubernetes,LINQ,Laravel,Leadership,Linux,MVC,Machine Learning,Management,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobile Application Development,Mobile Applications,Mobile Apps,MongoDB,MySQL,Network Security,NoSQL,Node.js,OOP,OOP (Object Oriented Programming),Object Oriented Design,Object Oriented Programming (OOP),Objective-C,Oracle,Oracle Database,Oracle SQL Developer,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Product Development,Programming,Project Management,Project Planning,Project management,Public Speaking,Python,Python (Programming Language),R&D,REST (Presentational State Transfer),RESTful WebServices,ROPE,RabbitMQ,React Native,React.js,Requirements Analysis,Research,SOA,SOAP,SQL,Sales,Social media,Software,Software Design,Software Development,Software Engineering,Software Project Management,Software Testing,Software development,SolidWorks,Solve problem,Spring Boot,Spring Framework,Strategic Planning,Strategy,T-SQL,TCP/IP,TFS,Team 
Leadership,Team Management,Teamwork,Telecommunications,Test Automation,Tomcat,Troubleshooting,TypeScript,UML,Ubuntu,Unity,Unix,VMware,Virtualization,Visual Basic,Visual Studio,Vue.js,WCF,WPF,Web Applications,Web Design,Web Development,Web Services,Web designing,Windows Server,WordPress,XML,bash,databases,engineering,firebase,flutter,github,hibernate,iOS,iOS Development,integration,jQuery,leadership,matlab,maven,microservices,networking,other,programming,redis,research,routing,scrum,selenium,simulink,spring,subversion,swift,teamwork,testing,windows
0,1,engineering,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Education,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,Android,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,Java,1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,3D Studio Max,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398438,66273,CI/CD,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1398439,66273,Terraform,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1398440,66273,MongoDB,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1398441,66273,Go (Programming Language),0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Collapse the row-level indicators to one row per user: after dropping the skill name and
# the yearly top-skill flags, the per-user maximum of each indicator is 1 if the user lists
# that skill at least once.
skill_first200 = (
    skill_first200
    .drop(columns=['skill', 'has_any_2018_top_skill', 'has_any_2019_top_skill', 'has_any_2020_top_skill'])
    .groupby('user_id')
    .max()
    .reset_index()
)

skill_first200

Unnamed: 0,user_id,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Methods,Agile Project Management,Algorithms,Amazon Web Services (AWS),Analytical Skills,Android,Android Application Development,Android Development,Angular,AngularJS,Arduino,Artificial intelligence,AutoCAD,Back-End Web Development,Bootstrap,Business Analysis,Business Development,Business Intelligence,Business Strategy,C,C (Programming Language),C#,C++,CRM,CSS,CSS3,Cascading Style Sheets (CSS),Cisco Technologies,Cloud Computing,Communication,Computer Science,Data Analysis,Data Science,Data Structures,Data analysis,Database Design,Design Patterns,DevOps,Django,Docker,Eclipse,Embedded Systems,English,Entity Framework,Firewalls,Front-end Development,Game Development,Go,HANDLE,HTML,HTML5,IIS,Information Technology,JIRA,JSF,JSON,JSP,JUnit,Java,Java Enterprise Edition,JavaScript,Jenkins,Kotlin,Kubernetes,LINQ,Laravel,Leadership,Linux,MVC,Machine Learning,Management,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobile Application Development,Mobile Applications,Mobile Apps,MongoDB,MySQL,Network Security,NoSQL,Node.js,OOP,OOP (Object Oriented Programming),Object Oriented Design,Object Oriented Programming (OOP),Objective-C,Oracle,Oracle Database,Oracle SQL Developer,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Product Development,Programming,Project Management,Project Planning,Project management,Public Speaking,Python,Python (Programming Language),R&D,REST (Presentational State Transfer),RESTful WebServices,ROPE,RabbitMQ,React Native,React.js,Requirements Analysis,Research,SOA,SOAP,SQL,Sales,Social media,Software,Software Design,Software Development,Software Engineering,Software Project Management,Software Testing,Software development,SolidWorks,Solve problem,Spring Boot,Spring Framework,Strategic Planning,Strategy,T-SQL,TCP/IP,TFS,Team Leadership,Team Management,Teamwork,Telecommunications,Test 
Automation,Tomcat,Troubleshooting,TypeScript,UML,Ubuntu,Unity,Unix,VMware,Virtualization,Visual Basic,Visual Studio,Vue.js,WCF,WPF,Web Applications,Web Design,Web Development,Web Services,Web designing,Windows Server,WordPress,XML,bash,databases,engineering,firebase,flutter,github,hibernate,iOS,iOS Development,integration,jQuery,leadership,matlab,maven,microservices,networking,other,programming,redis,research,routing,scrum,selenium,simulink,spring,subversion,swift,teamwork,testing,windows
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62397,66269,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
62398,66270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
62399,66271,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
62400,66272,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0


In [26]:
# Attach the per-user skill indicator columns to both modelling frames.
# A left join keeps every train/test row; users absent from skill_first200
# end up with NaN in the added columns.
train = pd.merge(train, skill_first200, on='user_id', how='left')
test = pd.merge(test, skill_first200, on='user_id', how='left')

__<font color = "darkgreen">Number of in-demand skills owned by each user</font>__

In [27]:
# Per-user totals of the yearly "has any top skill" flags, renamed to *_sum.
top_skill_renames = {'has_any_2018_top_skill' : '2018_top_skill_sum',
                     'has_any_2019_top_skill' : '2019_top_skill_sum',
                     'has_any_2020_top_skill' : '2020_top_skill_sum'}

top_skills = (
    skill[['user_id'] + list(top_skill_renames)]
    .groupby('user_id')
    .sum()
    .rename(columns=top_skill_renames)
)

# Combined total over the three years.
top_skills['top_skill_sum'] = (
    top_skills['2018_top_skill_sum']
    + top_skills['2019_top_skill_sum']
    + top_skills['2020_top_skill_sum']
)

# NOTE: `skill` is rebound here from the raw skills table to the per-user
# aggregate, so this cell cannot be re-run without reloading `skill` first.
skill = top_skills.join(skill_count).reset_index()
skill

Unnamed: 0,user_id,2018_top_skill_sum,2019_top_skill_sum,2020_top_skill_sum,top_skill_sum,skill_count
0,1,0.0,0.0,0.0,0.0,2
1,2,1.0,0.0,0.0,1.0,6
2,3,0.0,0.0,0.0,0.0,3
3,5,2.0,1.0,0.0,3.0,11
4,6,2.0,0.0,0.0,2.0,6
...,...,...,...,...,...,...
62397,66269,2.0,2.0,2.0,6.0,31
62398,66270,0.0,0.0,0.0,0.0,8
62399,66271,3.0,0.0,0.0,3.0,47
62400,66272,2.0,0.0,0.0,2.0,36


In [28]:
# Left-join the per-user skill aggregates (top-skill sums and skill_count)
# onto both the train and the test frame.
train, test = (frame.merge(skill, how='left', on='user_id') for frame in (train, test))

***
# <font color = "royalblue">Languages</font>
***

In [29]:
# Remove exact duplicate rows and the columns that are not used below.
lang.drop_duplicates(inplace=True)
lang.drop(columns=['language', 'proficiency_null'], inplace=True)

# language_count = number of non-null `proficiency` entries per user
# (groupby().count() counts non-null values column by column).
lang_count = (
    lang.groupby('user_id')
    .count()
    .drop(columns=['language_binned'])
    .rename(columns={'proficiency' : 'language_count'})
)
lang_count

Unnamed: 0_level_0,language_count
user_id,Unnamed: 1_level_1
8,3
10,1
11,3
12,2
13,2
...,...
66265,1
66269,1
66271,1
66272,1


In [30]:
# Frequency of each proficiency label, most common first.
lang['proficiency'].value_counts()

native_or_bilingual     22025
professional_working    18304
Unknown                 10659
elementary               8971
full_professional        8387
limited_working          7709
Name: proficiency, dtype: int64

In [31]:
# Ordinal encoding of the proficiency labels, from 0 (Unknown) up to
# 5 (native_or_bilingual). Labels missing from the mapping become NaN.
# NOTE: re-running this cell maps the already-encoded integers again and
# therefore yields NaN; reload `lang` before re-running.
proficiency_rank = {'Unknown' : 0,
                    'elementary' : 1,
                    'limited_working' : 2,
                    'professional_working' : 3,
                    'full_professional' : 4,
                    'native_or_bilingual' : 5}
lang['proficiency'] = lang['proficiency'].map(proficiency_rank)

# The 22 most frequent binned languages, most common first.
all_lang = lang['language_binned'].value_counts().index[:22].tolist()

all_lang

['English',
 'Turkish',
 'German',
 'French',
 'Spanish',
 'Russian',
 'Arabic',
 'Italian',
 'Japanese',
 'Azerbaijani',
 'Deutsch',
 'Chinese',
 'Ottoman',
 'Korean',
 'Persian',
 'Kurdish',
 'Bulgarian',
 'Polish',
 'Sign Languages',
 'Dutch',
 'Greek',
 'Portuguese']

__<font color = "darkgreen">Conversion of the 22 most frequent languages to dummy variables</font>__

In [32]:
# Pool every language that is not in all_lang into a single 'other' bucket,
# then build one indicator column per remaining language.
binned_language = lang.language_binned.where(lang.language_binned.isin(all_lang), 'other')
dummies = pd.get_dummies(binned_language)

# Put the indicators next to the original rows (aligned on the row index)
# and discard the pooled 'other' indicator.
all_lang_ = pd.concat([lang, dummies], axis=1).drop(columns=['other'])

all_lang_

Unnamed: 0,user_id,proficiency,language_binned,Arabic,Azerbaijani,Bulgarian,Chinese,Deutsch,Dutch,English,French,German,Greek,Italian,Japanese,Korean,Kurdish,Ottoman,Persian,Polish,Portuguese,Russian,Sign Languages,Spanish,Turkish
0,8,4,English,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,8,5,Turkish,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,8,1,French,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10,0,English,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,11,5,Turkish,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76057,66271,0,English,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76058,66272,3,English,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76059,66273,5,Turkish,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
76060,66273,3,English,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


__<font color = "darkgreen">Multiplying each language indicator by the user's proficiency in that language to obtain a combined per-language proficiency feature</font>__

In [33]:
# Select the language indicator columns by name instead of by position
# (previously `columns[3:]`), so the cell no longer depends on the exact
# column order of all_lang_.
non_language_cols = ['user_id', 'proficiency', 'language_binned']
cols = [c for c in all_lang_.columns if c not in non_language_cols]

# Turn each 0/1 indicator into the row's proficiency score in one
# vectorised step (row-wise multiplication by `proficiency`).
# NOTE: 'Unknown' proficiency is encoded as 0, so a language listed with
# unknown proficiency scores 0 here, the same as a language not listed.
all_lang_[cols] = all_lang_[cols].astype(int).mul(all_lang_['proficiency'], axis=0)

# Keep the highest score per user and language. Only the language columns
# are aggregated: taking max() over the text column `language_binned` (as
# before, only to drop it afterwards) is wasted work and fragile on object
# columns that mix strings and NaN.
all_lang_ = all_lang_.groupby('user_id')[cols].max().reset_index()

all_lang_

Unnamed: 0,user_id,Arabic,Azerbaijani,Bulgarian,Chinese,Deutsch,Dutch,English,French,German,Greek,Italian,Japanese,Korean,Kurdish,Ottoman,Persian,Polish,Portuguese,Russian,Sign Languages,Spanish,Turkish
0,8,0,0,0,0,0,0,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5
1,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,11,0,0,0,0,0,0,3,0,0,0,0,0,0,5,0,0,0,0,0,0,0,5
3,12,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
4,13,0,0,0,0,0,0,4,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37285,66265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37286,66269,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37287,66271,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37288,66272,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
# Left-join the per-language proficiency scores onto train and test.
# `English` exists both as a skill indicator (already merged) and as a
# language score, so pandas disambiguates the pair: the skill column becomes
# `English_x` and the language score `English_y`. The suffixes are spelled
# out (they are the pandas defaults) to make that renaming visible.
language_merge_options = dict(how='left', on='user_id', suffixes=('_x', '_y'))
train = train.merge(all_lang_, **language_merge_options)
test = test.merge(all_lang_, **language_merge_options)

## <font color = "royalblue">External Data </font>
__<font color = "darkgreen">Using the in-demand languages of</font>__ [__2018__](https://www.linkedin.com/pulse/7-most-useful-languages-learn-2018-nikola-gizarovski/), [__2019__](https://www.linkedin.com/pulse/top-5-internet-languages-2019-matthew-nelson/) and [__2020__](https://www.linkedin.com/pulse/15-best-languages-learn-2020-ofer-tirosh/) __<font color = "darkgreen"> shared on LinkedIn, I build features similar to those built from the skills data, because in my opinion knowing these in-demand languages might affect changing jobs.</font>__

In [35]:
# In-demand languages per year (from the LinkedIn articles linked above).
# Each dict maps a language name to 1 so it can be used with Series.map.
# NOTE(review): 'Deutsch' is a separate bin from 'German' in language_binned
# and is not counted here - confirm whether that is intended.
top_lang_2018 = {'English' : 1, 'French' : 1, 'Spanish' : 1, 'Italian' : 1, 'Chinese' : 1,
                 'Portuguese' : 1, 'Arabic' : 1, 'Russian' : 1, 'German' : 1}

top_lang_2019 = {'English' : 1, 'Spanish' : 1, 'Chinese' : 1, 'Portuguese' : 1, 'Arabic' : 1}

top_lang_2020 = {'English' : 1, 'Chinese' : 1, 'Spanish' : 1, 'Japanese' : 1, 'Russian' : 1, 'Norwegian' : 1,
                 'Swedish' : 1, 'Italian' : 1, 'French' : 1, 'Portuguese' : 1,
                 'German' : 1, 'Arabic' : 1, 'Amharic' : 1, 'Hindi' : 1, 'Korean' : 1}

# Row-level flags: 1.0 if the row's language is in that year's list, else 0.0.
lang['2018_top_lang'] = lang.language_binned.map(top_lang_2018).fillna(0)
lang['2019_top_lang'] = lang.language_binned.map(top_lang_2019).fillna(0)
lang['2020_top_lang'] = lang.language_binned.map(top_lang_2020).fillna(0)

# Number of in-demand languages per user and year.
# Only the three flag columns are aggregated. A bare `groupby().sum()` relied
# on old pandas silently dropping the text column `language_binned`; since
# pandas 2.0 (numeric_only defaults to False) that column would be summed as
# strings (or raise) and leak into lang_top and every later merge.
top_lang_cols = ['2018_top_lang', '2019_top_lang', '2020_top_lang']
lang_top = lang.groupby('user_id')[top_lang_cols].sum()
lang_top

Unnamed: 0_level_0,2018_top_lang,2019_top_lang,2020_top_lang
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,2.0,1.0,2.0
10,1.0,1.0,1.0
11,1.0,1.0,1.0
12,1.0,1.0,1.0
13,2.0,1.0,2.0
...,...,...,...
66265,1.0,1.0,1.0
66269,1.0,1.0,1.0
66271,1.0,1.0,1.0
66272,1.0,1.0,1.0


__<font color = "darkgreen">Number of languages added, sum of proficiency points, and language points</font>__ 

In [36]:
# Only the user id and the encoded proficiency are needed from here on.
lang = lang[['user_id', 'proficiency']]

# Total proficiency score per user.
proficiency_sum = (
    lang.groupby('user_id')
    .sum()
    .rename(columns={'proficiency' : 'proficiency_sum'})
)

# language_points = number of languages * total proficiency score.
lang_ = lang_count.join(proficiency_sum)
lang_['language_points'] = lang_['language_count'] * lang_['proficiency_sum']
lang_

Unnamed: 0_level_0,language_count,proficiency_sum,language_points
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8,3,10,30
10,1,0,0
11,3,13,39
12,2,8,16
13,2,6,12
...,...,...,...
66265,1,0,0
66269,1,0,0
66271,1,0,0
66272,1,3,3


In [37]:
# Combine the count/score features with the in-demand language counts and
# turn the user_id index back into an ordinary column for merging.
lang = lang_.join(lang_top).reset_index()
lang

Unnamed: 0,user_id,language_count,proficiency_sum,language_points,2018_top_lang,2019_top_lang,2020_top_lang
0,8,3,10,30,2.0,1.0,2.0
1,10,1,0,0,1.0,1.0,1.0
2,11,3,13,39,1.0,1.0,1.0
3,12,2,8,16,1.0,1.0,1.0
4,13,2,6,12,2.0,1.0,2.0
...,...,...,...,...,...,...,...
37285,66265,1,0,0,1.0,1.0,1.0
37286,66269,1,0,0,1.0,1.0,1.0
37287,66271,1,0,0,1.0,1.0,1.0
37288,66272,1,3,3,1.0,1.0,1.0


In [38]:
# Left-join the per-user language features onto both modelling frames;
# users without language rows get NaN in the added columns.
train = pd.merge(train, lang, on='user_id', how='left')
test = pd.merge(test, lang, on='user_id', how='left')

***
# <font color = "royalblue">Train - Test</font>
***

__<font color = "darkgreen">Compare the location in the train/test sets with the location of the latest job in the work-experience set. The new feature is 1 if they differ (the latest job may be remote), 0 if they match, and NaN when the latest job's city and country are both missing.</font>__

In [39]:
def _may_be_remote(df):
    """Return the `may_be_remote` flag for every row of `df` as a float array.

    The flag is
      * NaN when both `last_work_city` and `last_work_country` are null,
      * 0   when the profile location equals the latest job location,
      * 1   otherwise.

    City and country are compared separately. Comparing the concatenated
    strings (city + country) is ambiguous, because different city/country
    pairs can concatenate to the same text. A missing value on either side
    of a comparison counts as "different", as it did before.
    """
    no_last_location = df['last_work_city'].isnull() & df['last_work_country'].isnull()
    same_location = (
        (df['city'] == df['last_work_city'])
        & (df['country'] == df['last_work_country'])
    )
    return np.where(no_last_location, np.nan, np.where(same_location, 0, 1))


# Use the same location column names in train and test.
location_renames = {'work_city' : 'city', 'work_country' : 'country'}
train.rename(columns=location_renames, inplace=True)
test.rename(columns=location_renames, inplace=True)

train['may_be_remote'] = _may_be_remote(train)
test['may_be_remote'] = _may_be_remote(test)


train

Unnamed: 0,user_id,industry,city,moved_after_2019,country,school_last,school_previous,degree_last,degree_previous,fos_last,fos_previous,years_of_study,school_count,degree_sum,academic_points,fos_change,school_change,degree_change,last_company_id,last_work_city,last_start_year_month,last_work_country,previous_company_id,previous_work_city,previous_start_year_month,previous_work_country,last_company_size,previous_company_size,company_id_change,work_city-country_change,company_size_increase,previous_job_days,first_starting_time,_2019_first_diff,days_until_2019,job_count,average_staying_days,may_move,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Methods,Agile Project Management,Algorithms,Amazon Web Services (AWS),Analytical Skills,Android,Android Application Development,Android Development,Angular,AngularJS,Arduino,Artificial intelligence,AutoCAD,Back-End Web Development,Bootstrap,Business Analysis,Business Development,Business Intelligence,Business Strategy,C,C (Programming Language),C#,C++,CRM,CSS,CSS3,Cascading Style Sheets (CSS),Cisco Technologies,Cloud Computing,Communication,Computer Science,Data Analysis,Data Science,Data Structures,Data analysis,Database Design,Design Patterns,DevOps,Django,Docker,Eclipse,Embedded Systems,English_x,Entity Framework,Firewalls,Front-end Development,Game Development,Go,HANDLE,HTML,HTML5,IIS,Information Technology,JIRA,JSF,JSON,JSP,JUnit,Java,Java Enterprise Edition,JavaScript,Jenkins,Kotlin,Kubernetes,LINQ,Laravel,Leadership,Linux,MVC,Machine Learning,Management,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobile Application Development,Mobile Applications,Mobile Apps,MongoDB,MySQL,Network Security,NoSQL,Node.js,OOP,OOP (Object Oriented Programming),Object Oriented Design,Object Oriented Programming (OOP),Objective-C,Oracle,Oracle Database,Oracle SQL Developer,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Product 
Development,Programming,Project Management,Project Planning,Project management,Public Speaking,Python,Python (Programming Language),R&D,REST (Presentational State Transfer),RESTful WebServices,ROPE,RabbitMQ,React Native,React.js,Requirements Analysis,Research,SOA,SOAP,SQL,Sales,Social media,Software,Software Design,Software Development,Software Engineering,Software Project Management,Software Testing,Software development,SolidWorks,Solve problem,Spring Boot,Spring Framework,Strategic Planning,Strategy,T-SQL,TCP/IP,TFS,Team Leadership,Team Management,Teamwork,Telecommunications,Test Automation,Tomcat,Troubleshooting,TypeScript,UML,Ubuntu,Unity,Unix,VMware,Virtualization,Visual Basic,Visual Studio,Vue.js,WCF,WPF,Web Applications,Web Design,Web Development,Web Services,Web designing,Windows Server,WordPress,XML,bash,databases,engineering,firebase,flutter,github,hibernate,iOS,iOS Development,integration,jQuery,leadership,matlab,maven,microservices,networking,other,programming,redis,research,routing,scrum,selenium,simulink,spring,subversion,swift,teamwork,testing,windows,2018_top_skill_sum,2019_top_skill_sum,2020_top_skill_sum,top_skill_sum,skill_count,Arabic,Azerbaijani,Bulgarian,Chinese,Deutsch,Dutch,English_y,French,German,Greek,Italian,Japanese,Korean,Kurdish,Ottoman,Persian,Polish,Portuguese,Russian,Sign Languages,Spanish,Turkish,language_count,proficiency_sum,language_points,2018_top_lang,2019_top_lang,2020_top_lang,may_be_remote
0,1301,Information Technology and Services,İstanbul,1,Turkey,Bahcesehir University,non valid or other,bachelor,associate,software engineering,computer programming,6.0,2.0,7.0,14.0,1.0,1.0,1.0,26.0,Foreign or Non-Valid,201803.0,Unknown or Non-Valid,1875.0,İzmir,201510.0,Turkey,2060.0,2.0,1.0,1.0,1.0,882.0,201505.0,1341.0,306.0,3.0,447.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,6.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,8.0,16.0,1.0,1.0,1.0,1.0
1,6950,Internet,İstanbul,0,Turkey,Istanbul University,,non valid or unknown,,computer engineering,,0.0,1.0,0.0,0.0,,,,1337.0,İstanbul,201702.0,Turkey,1337.0,Foreign or Non-Valid,201608.0,Unknown or Non-Valid,88.0,88.0,0.0,1.0,0.0,184.0,201304.0,2101.0,699.0,6.0,350.166667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,9.0,2.0,1.0,3.0,0.0
2,4880,Online Media,Turkey,0,Turkey,Marmara University,High School,bachelor,non valid or unknown,computer teaching,computer software,4.0,2.0,4.0,8.0,1.0,1.0,1.0,4366.0,İstanbul,201705.0,Turkey,5291.0,Foreign or Non-Valid,201608.0,Unknown or Non-Valid,4.0,16.0,1.0,1.0,0.0,273.0,201401.0,1826.0,610.0,4.0,456.500000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
3,26046,Telecommunications,İstanbul,0,Turkey,non valid or other,Yeditepe University,bachelor,bachelor,non valid or unknown,non valid or unknown,8.0,3.0,8.0,24.0,0.0,1.0,0.0,26.0,İstanbul,201410.0,Turkey,26.0,Foreign or Non-Valid,201212.0,Unknown or Non-Valid,2060.0,2060.0,0.0,1.0,0.0,669.0,200909.0,3409.0,1553.0,4.0,852.250000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,11005,Banking,İstanbul,0,Turkey,Bahcesehir University,Kadir Has University,non valid or unknown,non valid or unknown,non valid or unknown,industrial engineering,0.0,2.0,0.0,0.0,1.0,1.0,0.0,1562.0,İstanbul,201704.0,Turkey,7191.0,Foreign or Non-Valid,201008.0,Unknown or Non-Valid,887.0,30.0,1.0,1.0,1.0,2435.0,200909.0,3409.0,640.0,3.0,1136.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,8.0,16.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53014,57247,Information Technology and Services,İstanbul,0,Turkey,Sakarya University,,bachelor,,non valid or unknown,,4.0,1.0,4.0,4.0,,,,268.0,İstanbul,201811.0,Turkey,1390.0,İstanbul,201511.0,Turkey,593.0,238.0,1.0,0.0,1.0,1096.0,200704.0,4293.0,61.0,3.0,1431.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,26.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0
53015,37784,Telecommunications,İstanbul,0,Turkey,Koc University,Anadolu University,non valid or unknown,bachelor,non valid or unknown,economics,8.0,3.0,8.0,24.0,1.0,1.0,1.0,41.0,İstanbul,201408.0,Turkey,709.0,İstanbul,201307.0,Turkey,2162.0,574.0,1.0,0.0,1.0,396.0,201005.0,3167.0,1614.0,3.0,1055.666667,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,23.0,138.0,1.0,1.0,1.0,0.0
53016,33229,Information Technology and Services,Ankara,0,Turkey,Suleyman Demirel University,non valid or other,bachelor,bachelor,non valid or unknown,computer engineering,8.0,2.0,8.0,16.0,1.0,1.0,0.0,1201.0,Ankara,201806.0,Turkey,4359.0,Ankara,201608.0,Turkey,67.0,28.0,1.0,0.0,1.0,669.0,201002.0,3256.0,214.0,3.0,1085.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0
53017,12165,Internet,İstanbul,1,Turkey,Mugla Sitki Kocman University,Sakarya University,master,bachelor,non valid or unknown,computer science,14.0,4.0,15.0,60.0,1.0,1.0,1.0,10197.0,Kocaeli,201808.0,Turkey,10196.0,Sakarya,201712.0,Turkey,10.0,2.0,1.0,1.0,1.0,243.0,201707.0,549.0,153.0,3.0,183.000000,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,1.0,0.0,5.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,9.0,27.0,2.0,1.0,2.0,1.0


In [40]:
# Inspect the test frame after all feature merges; unlike train it has no
# `moved_after_2019` column.
test

Unnamed: 0,user_id,industry,city,country,school_last,school_previous,degree_last,degree_previous,fos_last,fos_previous,years_of_study,school_count,degree_sum,academic_points,fos_change,school_change,degree_change,last_company_id,last_work_city,last_start_year_month,last_work_country,previous_company_id,previous_work_city,previous_start_year_month,previous_work_country,last_company_size,previous_company_size,company_id_change,work_city-country_change,company_size_increase,previous_job_days,first_starting_time,_2019_first_diff,days_until_2019,job_count,average_staying_days,may_move,.NET,.NET Core,.NET Framework,ADO.NET,AJAX,ASP.NET,ASP.NET MVC,Active Directory,Agile Methodologies,Agile Methods,Agile Project Management,Algorithms,Amazon Web Services (AWS),Analytical Skills,Android,Android Application Development,Android Development,Angular,AngularJS,Arduino,Artificial intelligence,AutoCAD,Back-End Web Development,Bootstrap,Business Analysis,Business Development,Business Intelligence,Business Strategy,C,C (Programming Language),C#,C++,CRM,CSS,CSS3,Cascading Style Sheets (CSS),Cisco Technologies,Cloud Computing,Communication,Computer Science,Data Analysis,Data Science,Data Structures,Data analysis,Database Design,Design Patterns,DevOps,Django,Docker,Eclipse,Embedded Systems,English_x,Entity Framework,Firewalls,Front-end Development,Game Development,Go,HANDLE,HTML,HTML5,IIS,Information Technology,JIRA,JSF,JSON,JSP,JUnit,Java,Java Enterprise Edition,JavaScript,Jenkins,Kotlin,Kubernetes,LINQ,Laravel,Leadership,Linux,MVC,Machine Learning,Management,Microsoft Excel,Microsoft Office,Microsoft SQL Server,Microsoft Word,Mobile Application Development,Mobile Applications,Mobile Apps,MongoDB,MySQL,Network Security,NoSQL,Node.js,OOP,OOP (Object Oriented Programming),Object Oriented Design,Object Oriented Programming (OOP),Objective-C,Oracle,Oracle Database,Oracle SQL Developer,PHP,PL/SQL,Photoshop,PostgreSQL,PowerPoint,Problem Solving,Product Development,Programming,Project 
Management,Project Planning,Project management,Public Speaking,Python,Python (Programming Language),R&D,REST (Presentational State Transfer),RESTful WebServices,ROPE,RabbitMQ,React Native,React.js,Requirements Analysis,Research,SOA,SOAP,SQL,Sales,Social media,Software,Software Design,Software Development,Software Engineering,Software Project Management,Software Testing,Software development,SolidWorks,Solve problem,Spring Boot,Spring Framework,Strategic Planning,Strategy,T-SQL,TCP/IP,TFS,Team Leadership,Team Management,Teamwork,Telecommunications,Test Automation,Tomcat,Troubleshooting,TypeScript,UML,Ubuntu,Unity,Unix,VMware,Virtualization,Visual Basic,Visual Studio,Vue.js,WCF,WPF,Web Applications,Web Design,Web Development,Web Services,Web designing,Windows Server,WordPress,XML,bash,databases,engineering,firebase,flutter,github,hibernate,iOS,iOS Development,integration,jQuery,leadership,matlab,maven,microservices,networking,other,programming,redis,research,routing,scrum,selenium,simulink,spring,subversion,swift,teamwork,testing,windows,2018_top_skill_sum,2019_top_skill_sum,2020_top_skill_sum,top_skill_sum,skill_count,Arabic,Azerbaijani,Bulgarian,Chinese,Deutsch,Dutch,English_y,French,German,Greek,Italian,Japanese,Korean,Kurdish,Ottoman,Persian,Polish,Portuguese,Russian,Sign Languages,Spanish,Turkish,language_count,proficiency_sum,language_points,2018_top_lang,2019_top_lang,2020_top_lang,may_be_remote
0,17449,Research,Turkey,Turkey,non valid or other,Anadolu University,phd,non valid or unknown,non valid or unknown,business / managerial economics,4.0,6.0,6.0,36.0,1.0,1.0,1.0,125.0,Foreign or Non-Valid,201201.0,Unknown or Non-Valid,12566.0,Foreign or Non-Valid,200101.0,Turkey,125.0,27.0,1.0,1.0,1.0,4017.0,200101.0,6574.0,2557.0,3.0,2191.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,34.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
1,33967,Computer Software,İstanbul,Turkey,non valid or other,non valid or other,high school,bachelor,non valid or unknown,non valid or unknown,12.0,4.0,10.0,40.0,0.0,0.0,1.0,26.0,İstanbul,201806.0,Turkey,19177.0,Foreign or Non-Valid,201606.0,Unknown or Non-Valid,2060.0,1.0,1.0,1.0,1.0,730.0,201606.0,944.0,214.0,2.0,472.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,5.0,31.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,14.0,56.0,3.0,2.0,3.0,0.0
2,2110,Automotive,Turkey,Turkey,Beykent University,,bachelor,,non valid or unknown,,4.0,1.0,4.0,4.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,4.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,8.0,24.0,2.0,1.0,2.0,
3,55082,Internet,Turkey,Turkey,Anadolu University,Selcuk University,non valid or unknown,non valid or unknown,business,non valid or unknown,0.0,2.0,0.0,0.0,1.0,1.0,0.0,7228.0,Remote,201702.0,Remote,5691.0,İstanbul,201108.0,Turkey,5.0,32.0,1.0,1.0,0.0,2011.0,201108.0,2710.0,699.0,2.0,1355.000000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0
4,37165,Electrical/Electronic Manufacturing,Turkey,Turkey,Yildiz Technical University,High School,non valid or unknown,non valid or unknown,electrical engineering,non valid or unknown,0.0,2.0,0.0,0.0,1.0,1.0,0.0,4213.0,Giresun,201606.0,Turkey,,,,,3.0,,,,,,201606.0,944.0,944.0,1.0,944.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,9.0,18.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13250,32847,Computer Software,Turkey,Turkey,Suleyman Demirel University,,non valid or unknown,,computer engineering,,0.0,1.0,0.0,0.0,,,,2143.0,İstanbul,201607.0,Turkey,,,,,52.0,,,,,,201607.0,914.0,914.0,1.0,914.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
13251,20054,Hospital & Health Care,Turkey,Turkey,Istanbul University,Istanbul University,bachelor,non valid or unknown,non valid or unknown,non valid or unknown,4.0,2.0,4.0,8.0,0.0,0.0,1.0,14463.0,Foreign or Non-Valid,201604.0,Unknown or Non-Valid,5273.0,Foreign or Non-Valid,201004.0,Unknown or Non-Valid,13.0,111.0,1.0,0.0,0.0,2192.0,200509.0,4870.0,1005.0,3.0,1623.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,8.0,2.0,1.0,2.0,1.0
13252,7029,Wireless,İstanbul,Turkey,Koc University,Koc University,master,bachelor,non valid or unknown,electrical and electronics engineering,10.0,3.0,11.0,33.0,1.0,0.0,1.0,4913.0,Foreign or Non-Valid,201809.0,Unknown or Non-Valid,7372.0,Foreign or Non-Valid,201807.0,Unknown or Non-Valid,302.0,9.0,1.0,0.0,1.0,62.0,201807.0,184.0,122.0,2.0,92.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,9.0,27.0,2.0,1.0,2.0,1.0
13253,56130,Information Technology and Services,Ankara,Turkey,Selcuk University,,bachelor,,computer engineering,,4.0,1.0,4.0,4.0,,,,34.0,Ankara,201709.0,Turkey,177.0,Ankara,201706.0,Turkey,831.0,55.0,1.0,0.0,1.0,92.0,201406.0,1675.0,487.0,3.0,558.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,8.0,16.0,1.0,1.0,1.0,0.0


***
# <font color = "royalblue">Imputation - Encoding</font>

> __<font color = "darkgreen">At first, in order to preserve the information carried by missing data, I filled the missing values with -1. I even went as far as manually inspecting the missing values of each column and filling them with different numbers depending on the case, for example to distinguish users who have no work experience from those who have at least one. However, the models produced with this approach had lower cross-validation scores and a lower ability to predict the positive class than the models trained on data whose columns were filled directly with the mode. They also received lower scores on the leaderboard than the models trained on the mode-filled datasets. As a result, I gave up on preserving the missingness of the data and filled the dataset with the mode of each feature.</font>__
***

In [41]:
# Missing-value count for every training column, before imputation
train.isnull().sum()

user_id                                     0
industry                                   72
city                                        0
moved_after_2019                            0
country                                     0
school_last                                 1
school_previous                         17315
degree_last                                 1
degree_previous                         17315
fos_last                                    1
fos_previous                            17315
years_of_study                              1
school_count                                1
degree_sum                                  1
academic_points                             1
fos_change                              17315
school_change                           17315
degree_change                           17315
last_company_id                         10543
last_work_city                          10543
last_start_year_month                   10543
last_work_country                 

In [42]:
# Columns holding at least one missing value, computed per dataset
train_null_cols = train.columns[train.isna().any().to_numpy()]
test_null_cols = test.columns[test.isna().any().to_numpy()]

# Object-dtype (text) columns of the training set; these get label encoded
cat_features = train.select_dtypes(include='object').columns
cat_features

Index(['industry', 'city', 'country', 'school_last', 'school_previous',
       'degree_last', 'degree_previous', 'fos_last', 'fos_previous',
       'last_work_city', 'last_work_country', 'previous_work_city',
       'previous_work_country'],
      dtype='object')

In [43]:
# --- Imputation ---------------------------------------------------------
# Every missing value is replaced with the mode of its column. The mode is
# always taken from the training set so that train and test are completed
# with the same value for a given feature.
train_modes = {
    col: train[col].mode()[0]
    for col in list(train_null_cols) + list(test_null_cols)
}

for col in train_null_cols:
    train[col] = train[col].fillna(train_modes[col])

for col in test_null_cols:
    test[col] = test[col].fillna(train_modes[col])

# --- Label encoding -----------------------------------------------------
# The encoder is fitted once per column on the union of the train and test
# values and then applied to both frames. Fitting it separately on each
# frame would build two independent class lists, so the same category could
# receive different integer codes in train and test.
le = LabelEncoder()

for col in cat_features:
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [44]:
# Sanity check: every training column should now report zero missing values
train.isnull().sum()

user_id                                 0
industry                                0
city                                    0
moved_after_2019                        0
country                                 0
school_last                             0
school_previous                         0
degree_last                             0
degree_previous                         0
fos_last                                0
fos_previous                            0
years_of_study                          0
school_count                            0
degree_sum                              0
academic_points                         0
fos_change                              0
school_change                           0
degree_change                           0
last_company_id                         0
last_work_city                          0
last_start_year_month                   0
last_work_country                       0
previous_company_id                     0
previous_work_city                

In [45]:
# Sanity check: every test column should now report zero missing values
test.isnull().sum()

user_id                                 0
industry                                0
city                                    0
country                                 0
school_last                             0
school_previous                         0
degree_last                             0
degree_previous                         0
fos_last                                0
fos_previous                            0
years_of_study                          0
school_count                            0
degree_sum                              0
academic_points                         0
fos_change                              0
school_change                           0
degree_change                           0
last_company_id                         0
last_work_city                          0
last_start_year_month                   0
last_work_country                       0
previous_company_id                     0
previous_work_city                      0
previous_start_year_month         