# In [1]:
import warnings
import pandas as pd
from tqdm import tqdm
# Silence every warning category for the rest of the session (no category or
# module filter is given).
# NOTE(review): this also hides pandas deprecation/performance warnings —
# confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# In [4]:
# Maps raw skill spellings (translations, typos, case variants, verbose forms)
# to the canonical label used for counting. No canonical label appears as a
# key, so applying the whole table in a single pass is equivalent to applying
# the entries one after another.
_SKILL_ALIASES = {
    'Web Geliştirme': 'Web Development',
    'Web Uygulamaları': 'Web Applications',
    'Proje Yönetimi': 'Project Management',
    'Programlama': 'Programming',
    'Object-Oriented Programming (OOP)': 'OOP',
    'Object Oriented Programming (OOP)': 'OOP',
    'OOP (Nesne Yönelimli Programlama)': 'OOP',
    'OOP(Object Oriented Programming)': 'OOP',
    'OOP (Object Oriented Programming)': 'OOP',
    'Objektorientierte Programmierung (OOP)': 'OOP',
    'Object-Oriented Programming(OOP)': 'OOP',
    'OOP(Object Orianted Programming)': 'OOP',
    'OOP (Nesne Yönelimli Programlama': 'OOP',
    'OOP programming and implementing design patterns': 'OOP',
    'OOP(Object-Oriented Programming)': 'OOP',
    'Nesne Yönelimli Programlama(OOP)': 'OOP',
    'Object Oriented Programming(OOP)': 'OOP',
    'Programación orientada a objetos (OOP)': 'OOP',
    'OOP ( Object Oriented Programming )': 'OOP',
    'OOP (Object-Oriented Programming)': 'OOP',
    'OOP ( Object - Oriented Programming )': 'OOP',
    'OOP(Nesne Yönelimli Programlama)': 'OOP',
    'Object-oriented Programming (OOP)': 'OOP',
    'C, C++ (OOP), Linux': 'OOP',
    'OOP Design': 'OOP',
    'Advanced OOP': 'OOP',
    'Python (Programming Language)': 'Python',
    'Python (Programmiersprache)': 'Python',
    'Python Programming Language': 'Python',
    'Ptyhon': 'Python',
    'Phyton': 'Python',
    'C (Programming Language)': 'C',
    'Swift (Programming Language)': 'Swift',
    'Go (Programming Language)': 'Go',
    'R (Programming Language)': 'R',
    'data science': 'Data Science',
    'Data science': 'Data Science',
    'Microsoft Teknolojileri': 'Microsoft Technologies',
    'Microsoft Sunucular': 'Microsoft Servers',
    'microsoft': 'Microsoft',
    'MsSQL database': 'Microsoft SQL Server',
    'MsSQL Server': 'Microsoft SQL Server',
    'MsSQL': 'Microsoft SQL Server',
    'Amazon Web Services (AWS)': 'AWS',
    'Amazon Web Hizmetleri (AWS)': 'AWS',
    'Amazon Web Services': 'AWS',
    'machine learning': 'Machine Learning',
    'Machine learning': 'Machine Learning',
    'Makine Öğrenmesi/Machine Learning': 'Machine Learning',
    'Artificial Intelligence (AI)': 'Artificial Intelligence',
    'Artificial intelligence': 'Artificial Intelligence',
    'Yazılım Proje Yönetimi': 'Software Project Management',
    'Proje Planlama': 'Project Planning',
    'İngilizce': 'English',
    'english': 'English',
    'git': 'Git',
}


def load_skills(path: str, skillset_thr: int = 100, seperator: str = 'skill') -> pd.DataFrame:
    """Load a user/skill CSV and one-hot encode the most frequent skills.

    The CSV must provide ``user_id`` and ``skill`` columns. Skill names are
    whitespace-stripped and normalised through ``_SKILL_ALIASES``, repeated
    (user, skill) pairs are collapsed, and the ``skillset_thr`` most frequent
    skills become 0/1 indicator columns.

    Two diagnostic lines are printed: the summed frequency of the 100 most
    common skills before and after normalisation (the 100 is fixed and does
    not follow ``skillset_thr``).

    Args:
        path: Location of the CSV, passed straight to ``pandas.read_csv``.
        skillset_thr: Number of top skills to encode.
        seperator: Prefix for indicator columns; each column is named
            ``f'{seperator}_{skill}'``.

    Returns:
        A DataFrame with one row per distinct non-null ``user_id`` (ascending
        order), the ``user_id`` column first, followed by one integer 0/1
        column per top skill in descending frequency order.
    """
    skill_df = pd.read_csv(path)

    before = skill_df["skill"].value_counts().iloc[:100].values.sum()
    print(
        f'Frequency of top 100 skills before preprocess: {before}')

    # `.str.strip()` passes missing values through; calling `x.strip()` on a
    # NaN cell would raise AttributeError.
    skill_df['skill'] = skill_df['skill'].str.strip()
    # One dictionary-based pass instead of one full-column scan per alias.
    skill_df['skill'] = skill_df['skill'].replace(_SKILL_ALIASES)

    after = skill_df["skill"].value_counts().iloc[:100].values.sum()
    print(
        f'Frequency of top 100 skills after preprocess: {after}')

    # Normalisation can turn several raw rows of one user into the same skill.
    # De-duplicate on the (user, skill) pair only, so that any other column in
    # the CSV cannot keep such repeats alive and inflate the ranking below.
    skill_df = skill_df.drop_duplicates(subset=['user_id', 'skill'])
    top_skills = skill_df['skill'].value_counts().index[:skillset_thr].tolist()

    grouped = skill_df.groupby(
        by='user_id', as_index=False).agg({'skill': 'unique'})

    # Sets give constant-time membership tests; building every indicator first
    # and attaching them with a single concat avoids growing the frame one
    # column at a time.
    owned_sets = [set(skills) for skills in grouped['skill']]
    indicator_columns = {
        f'{seperator}_{skill}': [int(skill in owned) for owned in owned_sets]
        for skill in top_skills
    }
    indicators = pd.DataFrame(indicator_columns, index=grouped.index)

    return pd.concat([grouped[['user_id']], indicators], axis=1)
