# Cleaning Language Data

## Libraries & Utilities

In [9]:
import re
import warnings
import numpy as np
import pandas as pd
from utils import *
from tqdm import tqdm
from deep_translator import GoogleTranslator
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')

translator = GoogleTranslator(source='auto', target='en')
path = '../../../datasets/garanti-bbva-data-camp/languages.csv'
output_path = '../../../datasets/garanti-bbva-data-camp/clean_language.csv'

## Load and Check Data

In [10]:
df = pd.read_csv(path)
print(f'language data shape: {df.shape}')
print(f'language classes: {df["language"].nunique()}')
print(f'proficiency classes: {df["proficiency"].nunique()}')
df.head()

language data shape: (76062, 3)
language classes: 513
proficiency classes: 5


Unnamed: 0,user_id,language,proficiency
0,8,İngilizce,full_professional
1,8,Türkçe,native_or_bilingual
2,8,Fransızca,elementary
3,10,ingilizce,
4,11,Turkish,native_or_bilingual


In [11]:
check_missing(df)

Unnamed: 0,feature,n_missing,missing_ratio
2,proficiency,10661,0.140162
0,user_id,0,0.0
1,language,0,0.0


In [12]:
df["language"] = df["language"].apply(lambda x: str(x).strip())

df.loc[
    df["language"].isin(
        [
            "Turkısh",
            "türkçe",
            "turkish",
            "turkçe",
            "Turksih",
            "Türkisch",
            "Türkçe",
            "Turkce",
            "Türkçe",
            "Türkish",
            "Turkçe",
            "Türkce",
            "Turkish,",
            "3- Turkish",
            "Türkce",
            "Türkçe,",
            "Tükçe",
            "TÜRKÇE",
            "Türk",
            "Türke",
            "turkce",
            "Türkçe (Turkish)",
            "Türkçe / Turkish",
            "Türkçe/Turkish",
            "Turkish (Native)",
            "Turkish(native)",
            "Turkish-Mother Language",
            "Turk",
            "TURKISH",
            "Turkish - Native",
            "Turkish:",
            "Turkish (mother tongue)",
            "■ Türkçe ■",
        ]
    ),
    "language",
] = "Turkish"

df.loc[
    df["language"].isin(
        [
            "İngilizce",
            "english",
            "Englisch",
            "İnglizce",
            "ENGLISH",
            "ingilizce",
            "inglizce",
            "İng",
            "İngilizce A2",
            "Engish",
            "İngilice",
            "İngilizce - IELTS 7",
            "İngilizce/English",
            "İngilize",
            "İngilzce",
            "İngizce",
            "İngizice",
            "İngilizice",
            "İngilizce,",
            "İngilzce,",
            "English, Middle (1100-1500)",
            "İngilizce, Orta (1100-1500)",
            "INGILIZCE",
            "İNGİLİZCE",
            "English US",
            "English UK",
            "2- English",
            "2- English",
            "İngilizce / English",
            "English (US)",
            "English, Advanced",
            "English C1",
            "English, Pre-Advance",
            "English (Upper-Intermediate)",
            "English-B2 Upper Intermediate",
            "■ English ■",
            "English(advanced)",
            "English - (YDS : 93,75)",
            "İngilizce (English)",
            "English (B2, Upper-Intermediate)",
            "English (B2)",
            "Advanced English",
            "English - Global Village Sydney Australia",
            "English (Advanced)",
            "English upper intermediate",
            "İngilizce, İyi",
            "İnglilizce",
            "İngilizce-B1 Wimbledon language academy eğitim sürecindeyim.",
            "İngilizce (C1)",
            "İngilizce (orta)",
            "İngilizce iyi",
            "İngilizce (Advanced)",
            "İngilizce, İyi düzeyde",
            "İngilizce-intermediate",
            "Mesleki İng.",
            "İngilizce, B2",
            "İngilizce(C1)",
            "İngilizce ( TOEIC - 725 )",
            "İngilizce ( orta düzeyde )",
            "İngilizce- BELS english school. as upper intermediate",
            "İngilizce (excellent)",
            "İngilizce, iyi",
            "Mesleki İngilizce",
            "İngiliz",
            "English - Professional working proficiency",
            "Ingilizce",
            "English,",
            "İngilizce Pre-Intermediate",
            "İngilizce(Orta)",
            "İngilizce(Orta Seviye)",
            "İngilizce | B1",
            "İngilizce (Upper-Intermediate)",
            "İngilizce, ileri",
            "İngilizce(B2)",
            "İngilizce (B1 - B2)",
            "İngilizce, Upper Intermediate",
            "İngilizce, B1",
            "İngilizce, İleri (2500-3000)",
            "İngilizce, Orta/İleri Düzey",
            "İngilizce %30",
            "orta düzey İngilizce",
            "Engilish,",
            "-English",
            "ingilizce(english)",
            "Englis",
            "Inglizce",
            "En",
            "ingilizce - başlangıç",
            "ingilizce (B2)",
        ]
    ),
    "language",
] = "English"

df.loc[
    df["language"].isin(
        [
            "Germany",
            "almanca",
            "GERMAN",
            "Almanca",
            "ALMANCA",
            "german",
            "Almanca(Beginner)",
            "German (Intermediate)",
            "German Language",
            "Almanca B2",
            "Almanca, Orta Yüksek (yaklaşık 1050-1500)",
            "German, B2.2 Goethe Instıtut - İZMİR",
            "Deutsche",
            "Deutsch",
            "Almanca(A1)",
            "Almanca (Düşük Seviye)",
            "Almanca (başlangıç)",
            "Almanca/German",
            "Almanca / Deutsch",
            "Germanic languages",
            "German(beginner)",
            "German (A2)",
            "German (Beginner)",
            "Gerrman",
            "Almanca (IAnfänger A2) (Elementary)",
            "Almanca (basic)",
            "German (Deutsches Sprachdiplom - 2.Stufe)",
            "Deutsch - B1",
            "Deutch",
            "germany",
            "German,",
        ]
    ),
    "langugage",
] = "German"

df.loc[
    df["language"].isin(
        [
            "arapça",
            "arapca",
            "arabic",
            "Arabic",
            "Arapca",
            "Arabic (only very basic speaking skills)",
            "Arabe",
            "Arapça-A1-A2",
            "Arabish",
            "South Sudanese Arabic",
            "Arapça",
            "Arapça(Temel Düzeyde)",
        ]
    ),
    "language",
] = "Arabic"

df.loc[
    df["language"].isin(
        [
            "ISPANYOLCA",
            "Español",
            "ispanyolca",
            "SPANISH",
            "Espanol",
            "İspanyolca",
            "Başlangıç seviyesinde İspanyolca",
            "İspanyolca, Başlangıç",
            "Spani",
            "Espańol",
        ]
    ),
    "language",
] = "Spanish"

df.loc[
    df["language"].isin(
        [
            "İtalian",
            "italian",
            "Italyanca",
            "İtalyanca",
            "italyanca",
            "Italien",
            "Italiano",
            "italiano",
        ]
    ),
    "language",
] = "Italian"

df.loc[
    df["language"].isin(
        [
            "Fransizca",
            "Fransızca",
            "Fransızca(Université Galatasaray)",
            "Französisch",
            "France",
            "Francais",
            "Fransa",
            "Fransız",
            "1- French",
            "Français",
            "French (Beginner)",
        ]
    ),
    "language",
] = "French"


df.loc[
    df["language"].isin(
        [
            "Chinese (Simplified)",
            "Elementary Chinese",
            "Chinese(Simplified-Mandarin) - 中文",
            "Çince (Mandarin)",
            "Çince (Basitleştirilmiş)",
            "Çinçe",
            "Çince",
            "CHINESE",
        ]
    ),
    "language",
] = "Chinese"

df.loc[
    df["language"].isin(
        [
            "Japanese(Beginner)",
            "Japanese (Roomaji)",
            "Japanesse(Roomaji)",
            "Japonca | A1",
            "Japonca",
            "japonca",
        ]
    ),
    "language",
] = "Japanese"

df.loc[
    df["language"].isin(
        [
            "Rusça",
            "rusca",
            "rusça",
            "Russian Русский Язык",
            "russian",
            "Rusca",
            "Russian(Certified)",
            "Russe",
            "Rusça (Beginner A1)",
            "Russain",
        ]
    ),
    "language",
] = "Russian"

df.loc[df["language"] == "Makedonca", "language"] = "Macedonian"
df.loc[df["language"] == "Makedonski", "language"] = "Macedonian"
df.loc[df["language"] == "Türkçe İşaret Dili", "language"] = "Turkish Sign Language"
df.loc[df["language"] == "Kürtçe", "language"] = "Kurdish"
df.loc[df["language"] == "Korece", "language"] = "Korean"
df.loc[df["language"] == "Bulgarca", "language"] = "Bulgarian"
df.loc[df["language"] == "Azerice", "language"] = "Azerbaijani"
df.loc[df["language"] == "Azərbaycan", "language"] = "Azerbaijani"
df.loc[df["language"] == "Azerbaycan Türkçesi", "language"] = "Azerbaijani"
df.loc[df["language"] == "Portekizce", "language"] = "Portuguese"
df.loc[df["language"] == "Yunanca", "language"] = "Greek"
df.loc[df["language"] == "Latince", "language"] = "Latin"
df.loc[df["language"] == "kürdi", "language"] = "Kurdish"
df.loc[df["language"] == "Kurdî", "language"] = "Kurdish"
df.loc[df["language"] == "Macarca", "language"] = "Hungarian"
df.loc[df["language"] == "Litvanyaca", "language"] = "Lithuanian"
df.loc[df["language"] == "Korece | A2", "language"] = "Korean"
df.loc[df["language"] == "Dutch (beginner)", "language"] = "Dutch"
df.loc[df["language"] == "İşaret Dilleri", "language"] = "Sign Languages"
df.loc[
    df["language"] == "Türkçe, Osmanlıca (1500-1928)", "language"
] = "Turkish, Ottoman (1500-1928)"
df.loc[df["language"] == "Boşnakça", "language"] = "Bosnian"
df.loc[df["language"] == "Türk İşaret Dili", "language"] = "Turkish Sign Language"
df.loc[
    df["language"] == "İngilizce, Eski (yaklaşık 450-1100)", "language"
] = "English, Old (ca.450-1100)"
df.loc[df["language"] == "Farsça", "language"] = "Persian"
df.loc[df["language"] == "Sırpça", "language"] = "Serbian"
df.loc[df["language"] == "İsveççe", "language"] = "Swedish"
df.loc[df["language"] == "Kazakça", "language"] = "Kazakh"
df.loc[df["language"] == "Arnavutça", "language"] = "Albanian"
df.loc[df["language"] == "Çekçe", "language"] = "Czech"
df.loc[df["language"] == "Özbekçe", "language"] = "Uzbek"
df.loc[df["language"] == "Hırvatça", "language"] = "Croatian"
df.loc[df["language"] == "Bokmål, Norveç", "language"] = "Norwegian"
df.loc[df["language"] == "Norveççe", "language"] = "Norwegian"
df.loc[df["language"] == "Osmanlı Türkçesi", "language"] = "Ottoman Turkish"
df.loc[df["language"] == "Slovakça", "language"] = "Slovak"
df.loc[df["language"] == "Endonezya dili", "language"] = "Indonesian"
df.loc[df["language"] == "İşaret Dilleri", "language"] = "Sign Languages"
df.loc[df["language"] == "Gürcüce", "language"] = "Georgian"
df.loc[df["language"] == "Türkmence", "language"] = "Turkmen"
df.loc[df["language"] == "Türkmen", "language"] = "Turkmen"
df.loc[df["language"] == "İbranice", "language"] = "Hebrew"
df.loc[df["language"] == "turkmen", "language"] = "Turkmen"
df.loc[df["language"] == "Turkmence", "language"] = "Turkmen"
df.loc[df["language"] == "Türkmençe", "language"] = "Turkmen"
df.loc[df["language"] == "Ermenice", "language"] = "Armenian"


In [13]:
language_translated = dict()
for i in tqdm(df['language'].dropna().unique()):
    language_translated[i] = translator.translate(i)
for key in language_translated.keys():
    df.loc[df['language'] == key, 'language'] = language_translated[key]

df.loc[df['language'].notnull(), 'language'] = df.loc[df['language'].notnull(), 'language'].apply(lambda x: x.lower().strip())
df.loc[df['language'].notnull(), 'language'] = df.loc[df['language'].notnull(), 'language'].apply(lambda x: translation(x))
    
print(f'language data shape: {df.shape}')
print(f'language classes: {df["language"].nunique()}')
print(f'proficiency classes: {df["proficiency"].nunique()}')
df.to_csv(output_path, index = False)
df.head()

100%|██████████| 263/263 [02:32<00:00,  1.72it/s]


language data shape: (76062, 4)
language classes: 208
proficiency classes: 5


Unnamed: 0,user_id,language,proficiency,langugage
0,8,english,full_professional,
1,8,turkish,native_or_bilingual,
2,8,french,elementary,
3,10,english,,
4,11,turkish,native_or_bilingual,
