In [47]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from cities import load_tr_cities, translation
%run skills.ipynb

class config:
    """Filesystem locations of the competition data used by this notebook.

    Every file path is derived from ``path`` so the dataset directory only
    has to be changed in one place.
    """

    # Root directory of the dataset (note the trailing slash).
    path = '../../../datasets/garanti-bbva-data-camp/'
    # Submission template that is read into `sub`.
    sub_path = os.path.join(path, 'submission.csv')
    # Skills table handed to `load_skills`.
    skills_path = os.path.join(path, 'skills.csv')
    # Languages table.
    languages_path = os.path.join(path, 'languages.csv')


In [63]:
# Read the labelled and unlabelled user tables, then the submission template.
train_df, test_df = (
    pd.read_csv(os.path.join(config.path, csv_name))
    for csv_name in ('train_users.csv', 'test_users.csv')
)
sub = pd.read_csv(filepath_or_buffer=config.sub_path)

print(
    f'train_df shape: {train_df.shape}',
    f'test_df shape: {test_df.shape}',
    sep='\n',
)

train_df shape: (53019, 4)
test_df shape: (13255, 3)


In [65]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1
1,6950,Internet,"Istanbul, Istanbul, Turkey",0
2,4880,Online Media,Turkey,0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0
4,11005,Banking,"Istanbul, Turkey",0


In [50]:
# Preview the first five test rows.
test_df.head(n=5)

Unnamed: 0,user_id,industry,location
0,17449,Research,Turkey
1,33967,Computer Software,"Istanbul, Istanbul, Turkey"
2,2110,Automotive,Turkey
3,55082,Internet,Turkey
4,37165,Electrical/Electronic Manufacturing,Turkey


In [51]:
# Stack train and test rows into one frame so the feature engineering below
# is applied to both at once; test rows get NaN in `moved_after_2019`.
# `DataFrame.append` was removed in pandas 2.0, so use `pd.concat`, which
# keeps each frame's original row index exactly as `append` did.
df = pd.concat([train_df, test_df])

df.head()

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [52]:
def fix_location(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with a normalised ``location`` column.

    Each location is converted to ``str``, has 'Türkiye' replaced by 'Turkey',
    is upper-cased and stripped, and is passed through ``translation``. Then,
    for every entry of ``load_tr_cities()`` in iteration order, a location
    that contains the entry as a substring is replaced by the entry itself.

    Args:
        dataframe: Frame with a ``location`` column. It is not modified.

    Returns:
        A new DataFrame in which only ``location`` differs from the input.
    """
    # Materialise once so the same city sequence can be walked for every row.
    tr_cities = tuple(load_tr_cities())

    def _normalise(raw):
        text = str(raw).replace('Türkiye', 'Turkey').upper().strip()
        text = translation(str(text))
        # Sequential fold, equivalent to the former one-`apply`-per-city loop:
        # later cities are tested against the already-replaced value.
        for city in tr_cities:
            if city in text:
                text = city
        return text

    df_ = dataframe.copy()
    # One pass over the column instead of a full pass per city.
    df_['location'] = df_['location'].astype(str).apply(_normalise)

    return df_


In [53]:
# Normalise the locations of the combined train/test frame.
df = df.pipe(fix_location)

df

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,ISTANBUL,1.0
1,6950,Internet,ISTANBUL,0.0
2,4880,Online Media,TURKEY,0.0
3,26046,Telecommunications,ISTANBUL,0.0
4,11005,Banking,ISTANBUL,0.0
...,...,...,...,...
13250,32847,Computer Software,TURKEY,
13251,20054,Hospital & Health Care,TURKEY,
13252,7029,Wireless,ISTANBUL,
13253,56130,Information Technology and Services,ANKARA,


In [54]:
# Build the per-user skill table; 30 is forwarded as the second argument of
# `load_skills` (presumably the number of top skills to keep — TODO confirm).
skills = load_skills(config.skills_path, 30)

skills.head(n=5)

Frequency of top 100 skills before preprocess: 596410
Frequency of top 100 skills after preprocess: 617773


100%|██████████| 30/30 [00:07<00:00,  4.17it/s]


Unnamed: 0,user_id,skill_Java,skill_JavaScript,skill_SQL,skill_C#,skill_HTML,skill_Python,skill_OOP,skill_C,skill_CSS,...,skill_HTML5,skill_.NET,skill_ASP.NET MVC,skill_PHP,skill_Web Development,skill_Yazılım Geliştirme,skill_Visual Studio,skill_Matlab,skill_XML,skill_Programming
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Attach the skill columns by `user_id`; the left join keeps every user row.
df = pd.merge(df, skills, how='left', on='user_id')

df.head(n=5)

Unnamed: 0,user_id,industry,location,moved_after_2019,skill_Java,skill_JavaScript,skill_SQL,skill_C#,skill_HTML,skill_Python,...,skill_HTML5,skill_.NET,skill_ASP.NET MVC,skill_PHP,skill_Web Development,skill_Yazılım Geliştirme,skill_Visual Studio,skill_Matlab,skill_XML,skill_Programming
0,1301,Information Technology and Services,ISTANBUL,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6950,Internet,ISTANBUL,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,4880,Online Media,TURKEY,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,26046,Telecommunications,ISTANBUL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11005,Banking,ISTANBUL,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
target = 'moved_after_2019'
cat_features = ['user_id', 'industry', 'location']

# Cast every categorical feature to str first and then to the category dtype.
df = (
    df
    .astype({column: str for column in cat_features})
    .astype({column: 'category' for column in cat_features})
)

# Rows with a label form the training data; rows without one are to be predicted.
train_set = df[df[target].notna()]
test_set = df[df[target].isna()]

print(f'train_set: {train_set.shape}', f'test_set: {test_set.shape}', sep='\n')

train_set: (53019, 34)
test_set: (13255, 34)


In [57]:
# Features and label for the labelled rows.
X = train_set.drop(columns=target)
y = train_set[target]

# Hold out 25% of the labelled rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

base_cat = CatBoostClassifier(
    cat_features=cat_features,
    eval_metric="Accuracy",
    depth=9,
    one_hot_max_size=150,
    bootstrap_type="Bernoulli",
    subsample=0.9,
    random_state=42,
    verbose=False,
)

# NOTE(review): the hold-out split is passed as an eval set together with
# early stopping and is then reused for the scores printed below, so those
# scores are likely optimistic — confirm against a separate split.
base_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=400,
    verbose=100,
)

# Score the hold-out predictions.
y_pred = base_cat.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}", "\n")

Learning rate set to 0.078702
0:	learn: 0.6288351	test: 0.6288351	test1: 0.6248208	best: 0.6248208 (0)	total: 57.5ms	remaining: 57.5s
100:	learn: 0.7060658	test: 0.7060658	test1: 0.6555262	best: 0.6555262 (100)	total: 3.58s	remaining: 31.8s
200:	learn: 0.7333518	test: 0.7333518	test1: 0.6640513	best: 0.6641267 (196)	total: 7.41s	remaining: 29.5s
300:	learn: 0.7528166	test: 0.7528166	test1: 0.6680498	best: 0.6687288 (298)	total: 11.2s	remaining: 25.9s
400:	learn: 0.7680565	test: 0.7680565	test1: 0.6729536	best: 0.6743116 (393)	total: 14.6s	remaining: 21.8s
500:	learn: 0.7798008	test: 0.7798008	test1: 0.6749151	best: 0.6764240 (474)	total: 18s	remaining: 17.9s
600:	learn: 0.7887033	test: 0.7887033	test1: 0.6765749	best: 0.6787627 (569)	total: 21.4s	remaining: 14.2s
700:	learn: 0.7984609	test: 0.7984609	test1: 0.6780837	best: 0.6787627 (569)	total: 24.9s	remaining: 10.6s
800:	learn: 0.8047229	test: 0.8047229	test1: 0.6795172	best: 0.6807243 (775)	total: 28.3s	remaining: 7.03s
900:	learn: 

In [58]:
# Accuracy of the hold-out predictions, kept in a variable for later use.
val_score = accuracy_score(y_true=y_test, y_pred=y_pred)

val_score

0.6840437570728027

In [66]:
# Predictions are written into `sub` by position, so first make sure the
# submission template lists users in the same order as `test_set`; otherwise
# labels would silently be assigned to the wrong users.
assert np.array_equal(
    sub['user_id'].astype(str).to_numpy(),
    test_set['user_id'].astype(str).to_numpy(),
), 'submission rows are not aligned with test_set rows'

# Predict on the unlabelled rows (label column removed) and store as ints.
sub[target] = base_cat.predict(test_set.drop(columns=target)).astype(int)

sub

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0
...,...,...
13250,32847,0
13251,20054,0
13252,7029,1
13253,56130,0


In [67]:
# Persist the submission without the pandas row index.
sub.to_csv(path_or_buf='../submissions/submission_2.csv', index=False)

In [68]:
# Read the file back to check what was written.
pd.read_csv(filepath_or_buffer='../submissions/submission_2.csv')

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0
...,...,...
13250,32847,0
13251,20054,0
13252,7029,1
13253,56130,0
