In [2]:
import re
import warnings
import pandas as pd
from tqdm import tqdm
from langdetect import detect
from utils import translation, fix_skills
from deep_translator import GoogleTranslator
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# Suppress every warning from here on (no category or module filter is given),
# which also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

def my_tokenizer(text):
    """Split ``text`` into whitespace-delimited tokens.

    Args:
        text: String to tokenize.

    Returns:
        A list of the non-empty runs of non-whitespace characters in ``text``,
        in their original order. Empty or whitespace-only input yields ``[]``.
    """
    # str.split() with no arguments splits on runs of whitespace, like the
    # regex "\s+", but never emits the empty strings that re.split() returns
    # when the text starts or ends with whitespace.
    return text.split()

In [52]:
def load_skills(path: str, size: int = 50, seperator: str = "skill") -> pd.DataFrame:
    """Build per-user bag-of-words skill features from a skills CSV.

    The CSV must provide ``user_id`` and ``skill`` columns (both are read
    below). Skills are cleaned with ``fix_skills``, lower-cased, stripped and
    passed through ``translation``; exact duplicate rows are dropped; each
    user's unique skill strings are then concatenated into one document that is
    vectorized with a ``CountVectorizer`` (unigrams and bigrams, English stop
    words removed, tokenized by ``my_tokenizer``).

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.
        size: Maximum number of n-gram features to keep (``max_features``).
        seperator: Prefix for the generated feature columns; a feature ``f``
            becomes the column ``f"{seperator}_{f}"``.

    Returns:
        A DataFrame with one row per ``user_id`` holding the n-gram count
        columns, the ``user_id`` column, and ``total_skills`` (the number of
        comma-separated pieces across the user's unique skill strings).

    Side effects:
        Prints the summed frequency of the 20 most common skills before and
        after preprocessing.
    """
    df_ = pd.read_csv(path)
    print(
        f'Frequency of top 20 skills before preprocess: {df_["skill"].value_counts()[:20].values.sum()}'
    )

    df_ = fix_skills(df_)
    # str() also turns missing values into the literal text "nan".
    df_["skill"] = df_["skill"].apply(lambda x: str(x).lower().strip())
    df_["skill"] = df_["skill"].apply(translation)

    print(
        f'Frequency of top 20 skills after preprocess: {df_["skill"].value_counts()[:20].values.sum()}'
    )

    df_ = df_.drop_duplicates()

    # Must run before commas are replaced below: the count is the number of
    # comma-separated pieces in the user's unique skill strings.
    total_skills_df = df_.groupby(by="user_id", as_index=False).agg(
        total_skills=(
            "skill",
            lambda x: len(", ".join(x.unique()).split(",")),
        )
    )

    # Turn commas into spaces so every listed skill is tokenized on whitespace,
    # then collapse each user's unique skill strings into a single document.
    df_["skill"] = df_["skill"].apply(lambda x: x.replace(",", " "))
    df_ = df_.groupby(by="user_id", as_index=False).agg(
        {"skill": lambda x: " ".join(x.unique())}
    )

    vectorizer = CountVectorizer(
        max_features=size,
        stop_words=stopwords.words("english"),
        ngram_range=(1, 2),
        tokenizer=my_tokenizer,
    )
    counts = vectorizer.fit_transform(df_["skill"]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return (
        pd.DataFrame(
            counts,
            columns=[f"{seperator}_{str(f)}" for f in feature_names],
        )
        # Both frames carry a fresh 0..n-1 index in the same user order, so the
        # index-aligned assignment attaches each user_id to its own row.
        .assign(user_id=df_["user_id"])
        .merge(total_skills_df, on=["user_id"], how="left")
    )


In [57]:
#df = load_skills('../../../datasets/garanti-bbva-data-camp/skills.csv')
#
#df

In [56]:
#df.columns

In [55]:
#df.columns

In [27]:
#df.loc[df['skill'].str.contains('SQL', regex=False), 'skill'].value_counts()[20:40]

In [28]:
#for i in df.loc[df['skill'].str.contains('Programlama'), 'skill'].value_counts()[:50].keys():
#    translated = GoogleTranslator(source='auto', target='en').translate(i)
#    if df.loc[df['skill'] == translated].shape[0] != 0:
#        print(f'df.loc[df["skill"] == "{i}", "skill"] = "{translated.title()}"')

In [29]:
#df.loc[df['skill'].str.contains('Object Orie'), 'skill'].value_counts()[:20]

In [30]:
#df['skill'].value_counts()

In [55]:
#tr_chars = ['İ', 'ğ', 'ı', 'ş', 'Ğ', 'Ş', 'Ö', 'ö', 'Ü', 'ü', 'Ç', 'ç', 'bilgisayar']
#translated = dict()
#
#for c in tr_chars:
#    sub_df = df.loc[df['skill'].str.contains(c)]
#    if sub_df.shape[0] != 0:
#        for s in tqdm(sub_df['skill'].unique()):
#            if s not in translated.keys():
#                translated[s] = GoogleTranslator(source='auto', target='en').translate(s)

In [56]:
#translated[s] = GoogleTranslator(source='auto', target='en').translate(s)

In [5]:
#df['user_id'].astype(int)

In [15]:
#skill_df = pd.DataFrame(
#    {
#        "user_id": ["A", "A", "B", "B"],
#        "skill": [
#            "html",
#            "css ,application",
#            "web development, mobile application development",
#            " hadoop, oop",
#        ],
#    }
#)
#
#
#skill_df["skill"] = skill_df["skill"].apply(lambda x: str(x).lower().strip())
#skill_df["skill"] = skill_df["skill"].apply(lambda x: translation(str(x)))
#skill_df["skill"] = skill_df["skill"].apply(lambda x: x.replace(",,", ","))
#
#total_skills = skill_df.groupby(by="user_id", as_index=False).agg(
#    total_skills=(
#        "skill",
#        lambda x: len([s.strip() for s in ", ".join(x.unique()).split(",")]),
#    )
#)
#
#
#skill_df['skill'] = skill_df['skill'].apply(lambda x: x.replace(',', ' '))
#skill_df = skill_df.groupby(by="user_id", as_index=False).agg(
#    {"skill": lambda x: " ".join(x.unique())}
#)
#
#
#vectorizer = CountVectorizer(
#    max_features=5, stop_words=stopwords.words("english"), ngram_range=(1, 2), tokenizer=my_tokenizer
#)
#
#
#pd.DataFrame(
#        vectorizer.fit_transform(skill_df["skill"]).toarray(),
#        columns=[f"skill_{str(f)}" for f in vectorizer.get_feature_names()],
#    ).assign(user_id=skill_df["user_id"]).merge(total_skills, on=["user_id"], how="left")

Unnamed: 0,skill_application,skill_application development,skill_css,skill_css application,skill_development,user_id,total_skills
0,1,0,1,1,0,A,3
1,1,1,0,0,2,B,4


In [46]:
#df_ = df_.drop_duplicates()
#total_skills = df_.groupby(by="user_id", as_index=False).agg(
#    total_skills=(
#        "skill",
#        lambda x: len([s.strip() for s in ", ".join(x.unique()).split(",")]),
#    )
#)
#df_ = df_.groupby(by="user_id", as_index=False).agg(
#    {"skill": lambda x: " ".join(x.unique())}
#)
#
#vectorizer = CountVectorizer(
#    max_features=size, stop_words=stopwords.words("english"), ngram_range=(1, 2), tokenizer=my_tokenizer
#)
#
#return (
#    pd.DataFrame(
#        vectorizer.fit_transform(df_["skill"]).toarray(),
#        columns=[f"{seperator}_{str(f)}" for f in vectorizer.get_feature_names()],
#    )
#    .assign(user_id=df_["user_id"])
#    .merge(total_skills, on=["user_id"], how="left")
#)

In [21]:
#'A, B'.replace(', ', ',').split(',')

In [11]:
#df['skill'].value_counts()[:20]

In [12]:
#df.loc[df['skill'].str.contains('c#'), 'skill'].value_counts()

In [None]:
#df = df.groupby(by = 'user_id', as_index=False).agg({'skill': lambda x: ' '.join(x.unique())})
#
#vectorizer = CountVectorizer(max_features=200, stop_words=stopwords.words("english"))
#
#pd.DataFrame(
#    vectorizer.fit_transform(df["skill"]).toarray(),
#    columns=vectorizer.get_feature_names(),
#).assign(user_id = df['user_id'])

In [None]:
#df.loc[df['skill'].str.contains('Back', regex=False), 'skill'].value_counts()[:20]