In [115]:
import os
import numpy as np
from utils import *
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
%run skills.ipynb
%run languages.ipynb

class config:
    """Dataset locations and the random seed shared by the notebook."""

    # Root folder of the competition data. The trailing slash is kept so the
    # derived paths below are plain concatenations of this prefix.
    path = '../../../datasets/garanti-bbva-data-camp/'
    sub_path = path + 'submission.csv'
    skills_path = path + 'skills.csv'
    languages_path = path + 'languages.csv'
    seed = 42


In [116]:
# Read the train and test user tables from the data folder, then the
# submission template from its own configured path.
train_df, test_df = (
    pd.read_csv(os.path.join(config.path, csv_name))
    for csv_name in ('train_users.csv', 'test_users.csv')
)
sub = pd.read_csv(config.sub_path)

# Report the size of both user tables.
for frame_label, frame in (('train_df', train_df), ('test_df', test_df)):
    print(f'{frame_label} shape: {frame.shape}')

train_df shape: (53019, 4)
test_df shape: (13255, 3)


In [117]:
# Preview the first five rows of the training users table.
train_df.head(n=5)

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1
1,6950,Internet,"Istanbul, Istanbul, Turkey",0
2,4880,Online Media,Turkey,0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0
4,11005,Banking,"Istanbul, Turkey",0


In [118]:
# Preview the first five rows of the test users table.
test_df.head(n=5)

Unnamed: 0,user_id,industry,location
0,17449,Research,Turkey
1,33967,Computer Software,"Istanbul, Istanbul, Turkey"
2,2110,Automotive,Turkey
3,55082,Internet,Turkey
4,37165,Electrical/Electronic Manufacturing,Turkey


In [119]:
# Stack train and test so the feature engineering below runs on both at once.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent. The original row labels are kept (no `ignore_index`), exactly
# as `append` did.
df = pd.concat([train_df, test_df])

df.head()

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,"Istanbul, Istanbul, Turkey",1.0
1,6950,Internet,"Istanbul, Istanbul, Turkey",0.0
2,4880,Online Media,Turkey,0.0
3,26046,Telecommunications,"Istanbul, Istanbul, Turkey",0.0
4,11005,Banking,"Istanbul, Turkey",0.0


In [120]:
def fix_location(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with a normalised ``location`` column.

    Each location is cast to ``str``, has 'Türkiye' replaced by 'Turkey', is
    upper-cased and stripped, and is then passed through ``translation``.
    Afterwards, for every entry returned by ``load_tr_cities()`` (in order),
    a location that contains that entry as a substring is replaced by it.

    The clean-up is computed once per distinct raw value and mapped back onto
    the column, instead of running one full-column ``apply`` per city.

    Args:
        dataframe: Frame with a ``location`` column. It is not modified.

    Returns:
        A new DataFrame whose ``location`` column holds the normalised strings.
    """
    # NOTE(review): computing each distinct value only once assumes that
    # `translation` is a pure function of its argument - confirm in utils.
    tr_cities = load_tr_cities()

    def _normalise(raw) -> str:
        value = str(raw).replace('Türkiye', 'Turkey').upper().strip()
        value = translation(value)
        # Checks are applied one after another on the running value, so a
        # later city can still match the text produced by an earlier match.
        for city in tr_cities:
            if city in value:
                value = city
        return value

    df_ = dataframe.copy()
    raw_locations = df_['location'].astype(str)
    cleaned = {raw: _normalise(raw) for raw in raw_locations.unique()}
    df_['location'] = raw_locations.map(lambda raw: cleaned[raw])

    return df_


In [121]:
# Normalise the location strings of the combined train + test frame.
df = fix_location(dataframe=df)

df

Unnamed: 0,user_id,industry,location,moved_after_2019
0,1301,Information Technology and Services,ISTANBUL,1.0
1,6950,Internet,ISTANBUL,0.0
2,4880,Online Media,TURKEY,0.0
3,26046,Telecommunications,ISTANBUL,0.0
4,11005,Banking,ISTANBUL,0.0
...,...,...,...,...
13250,32847,Computer Software,TURKEY,
13251,20054,Hospital & Health Care,TURKEY,
13252,7029,Wireless,ISTANBUL,
13253,56130,Information Technology and Services,ANKARA,


In [122]:
# Load the per-user skill and language tables via the helpers defined in
# skills.ipynb / languages.ipynb.
skills = load_skills(config.skills_path, 30)
languages = load_languages(config.languages_path)

# Left-join each table on user_id so every row of df is kept; users without
# a match end up with NaN in the joined columns.
for feature_table in (skills, languages):
    df = df.merge(feature_table, how='left', on=['user_id'])

print(df.shape)

df.head()

Frequency of top 100 skills before preprocess: 596410
Frequency of top 100 skills after preprocess: 617773


100%|██████████| 30/30 [00:07<00:00,  4.17it/s]
100%|██████████| 12/12 [00:01<00:00,  7.53it/s]


(66274, 47)


Unnamed: 0,user_id,industry,location,moved_after_2019,skill_Java,skill_JavaScript,skill_SQL,skill_C#,skill_HTML,skill_Python,...,language_French,language_Spanish,language_Russian,language_Arabic,language_Italian,language_Japanese,language_Azerbaijani,language_Chinese,language_Korean,total_languages
0,1301,Information Technology and Services,ISTANBUL,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,6950,Internet,ISTANBUL,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
2,4880,Online Media,TURKEY,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,,,,,,,,,,
3,26046,Telecommunications,ISTANBUL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,11005,Banking,ISTANBUL,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [123]:
target = 'moved_after_2019'
cat_features = ['user_id', 'industry', 'location']

# Cast each categorical feature to string first, then to the category dtype.
for col_name in cat_features:
    as_text = df[col_name].astype(str)
    df[col_name] = as_text.astype("category")

# Rows with a known target form the training set; rows without one are the
# test set to be predicted.
has_target = df[target].notnull()
train_set = df.loc[has_target]
test_set = df.loc[~has_target]

print(f'train_set: {train_set.shape}')
print(f'test_set: {test_set.shape}')

train_set: (53019, 47)
test_set: (13255, 47)


In [124]:
# Features and target of the labelled rows. `columns=` already names the
# axis, so the redundant `axis=1` is omitted.
X = train_set.drop(columns=[target])
y = train_set[target]

# Hold out 25% of the labelled rows. The seed is taken from config.seed so
# the split and the model share one configurable source of randomness.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=config.seed
)

base_cat = CatBoostClassifier(
    verbose=False,
    random_state=config.seed,
    depth=9,
    bootstrap_type="Bernoulli",
    subsample=0.8,
    one_hot_max_size=150,
    cat_features=cat_features,
    eval_metric="Accuracy",
    allow_writing_files=False
)

# Both the training split and the hold-out split are passed as eval sets, so
# the log reports accuracy on each of them every 100 iterations.
# NOTE(review): the hold-out split drives early stopping / best-iteration
# tracking and is also the set scored below, so the reported numbers are
# probably optimistic - consider a separate validation split.
base_cat.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    early_stopping_rounds=400,
    verbose=100,
)

# Score the hold-out split; val_score is reused later in the submission
# file name.
y_pred = base_cat.predict(X_test)
val_score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {val_score}")
print(f"F1 Score: {f1_score(y_test, y_pred)}", "\n")

Learning rate set to 0.078702
0:	learn: 0.6149532	test: 0.6149532	test1: 0.6150132	best: 0.6150132 (0)	total: 125ms	remaining: 2m 5s
100:	learn: 0.7101901	test: 0.7101901	test1: 0.6580913	best: 0.6586948 (96)	total: 4.01s	remaining: 35.7s
200:	learn: 0.7437632	test: 0.7437632	test1: 0.6706903	best: 0.6715956 (186)	total: 7.47s	remaining: 29.7s
300:	learn: 0.7659944	test: 0.7659944	test1: 0.6792908	best: 0.6798189 (297)	total: 11.1s	remaining: 25.7s
400:	learn: 0.7834976	test: 0.7834976	test1: 0.6806488	best: 0.6811015 (348)	total: 14.6s	remaining: 21.9s
500:	learn: 0.7965748	test: 0.7965748	test1: 0.6832893	best: 0.6832893 (500)	total: 18.2s	remaining: 18.2s
600:	learn: 0.8070365	test: 0.8070365	test1: 0.6846473	best: 0.6852508 (550)	total: 21.8s	remaining: 14.5s
700:	learn: 0.8167438	test: 0.8167438	test1: 0.6875896	best: 0.6883440 (694)	total: 25.3s	remaining: 10.8s
800:	learn: 0.8271049	test: 0.8271049	test1: 0.6901547	best: 0.6901547 (800)	total: 29s	remaining: 7.21s
900:	learn: 0.

In [125]:
# Predictions are written into `sub` by position, so first make sure the
# submission template lists the same users in the same order as test_set.
# A silent mismatch would assign predictions to the wrong users.
test_features = test_set.drop(columns=[target])
assert len(sub) == len(test_set), "submission template and test_set differ in length"
assert (
    sub['user_id'].astype(str).to_numpy()
    == test_set['user_id'].astype(str).to_numpy()
).all(), "submission rows are not aligned with test_set rows"

sub[target] = base_cat.predict(test_features).astype(int)

sub.head()

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0


In [126]:
# Check the last five rows of the filled-in submission.
sub.tail(n=5)

Unnamed: 0,user_id,moved_after_2019
13250,32847,0
13251,20054,0
13252,7029,1
13253,56130,0
13254,16036,0


In [127]:
# The file is named after the rounded validation accuracy. `to_csv` does not
# create missing folders, so make sure the target directory exists first.
submission_dir = '../submissions'
os.makedirs(submission_dir, exist_ok=True)
sub.to_csv(f'{submission_dir}/submission_{round(val_score, 6)}.csv', index=False)