# Lenguajes de programación

No todos los lenguajes de programación son iguales, pero sí existen lenguajes que son similares. JavaScript se parece más a TypeScript que a C, por ejemplo. En el primer modelo que hicimos tomamos a cada lenguaje como una característica independiente pero quizás esto se puede mejorar usando descripciones de los lenguajes de programación en vez de sólo los nombres.

In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wikipedia
import xml.etree.ElementTree as ET
import re
%matplotlib inline

In [5]:
df = pd.read_csv('2020.1 - sysarmy - Encuesta de remuneración salarial Argentina - Argentina.csv', skiprows=9)
df['Lenguajes de programación']

0                                                  Python
1                                                PHP, SQL
2                 CSS, HTML, Javascript, PHP, Python, SQL
3        .NET, C#, CSS, HTML, Javascript, SQL, TypeScript
4                                                     NaN
                              ...                        
5977     .NET, C#, CSS, HTML, Javascript, SQL, TypeScript
5978    .NET, C#, COBOL, CSS, HTML, Java, Javascript, SQL
5979                                                  NaN
5980                .NET, HTML, Javascript, PHP, SQL, VBA
5981                CSS, HTML, Javascript, PHP, Ruby, SQL
Name: Lenguajes de programación, Length: 5982, dtype: object

In [133]:
df_languages_cols = df['Lenguajes de programación'].fillna('').apply(lambda pls: pd.Series([v.lower().strip() for v in pls.split(',') if v.lower().strip() not in ('', 'ninguno')], dtype=str))
count_languages = pd.concat((df_languages_cols[i] for i in range(df_languages_cols.shape[1]))).value_counts()
count_languages

javascript     2339
sql            2084
html           1867
java           1393
python         1386
               ... 
json              1
compass           1
rpg400 y cl       1
less              1
asp clasic        1
Length: 187, dtype: int64

In [134]:
count_languages = df_languages[df_languages > 10]
count_languages

javascript      2339
sql             2084
html            1867
java            1393
python          1386
css             1223
php              949
bash/shell       896
.net             848
c#               751
typescript       626
c++              306
c                280
go               267
vba              247
groovy           183
ruby             177
kotlin           152
r                118
swift             95
scala             94
abap              75
cobol             69
objective-c       69
perl              66
delphi            56
dart              52
assembler         44
actionscript      41
matlab            35
powershell        34
smalltalk         30
genexus           19
elixir            16
rust              14
apex              11
dtype: int64

In [20]:
lplbt = wikipedia.page('List_of_programming_languages_by_type')
lplbt

<WikipediaPage 'List of programming languages by type'>

In [42]:
print(lplbt.html())

<div class="mw-parser-output"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Wikimedia list article</div>
<table class="vertical-navbox nowraplinks" style="float:right;clear:right;width:22.0em;margin:0 0 1.0em 1.0em;background:#f8f9fa;border:1px solid #aaa;padding:0.2em;border-spacing:0.4em 0;text-align:center;line-height:1.4em;font-size:88%;width:18.0em;"><tbody><tr><th class="navbox-title" style="padding:0.2em 0.4em 0.2em;font-size:145%;line-height:1.2em"><a href="/wiki/Lists_of_programming_languages" title="Lists of programming languages">Programming language<br />lists</a></th></tr><tr><td style="padding:0 0.1em 0.4em;text-align: left;">
<ul><li><a href="/wiki/List_of_programming_languages" title="List of programming languages">Alphabetical</a></li>
<li><a class="mw-selflink selflink">Categorical</a></li>
<li><a href="/wiki/Timeline_of_programming_languages" title="Timeline of programming languages">Chronological</a></li>
<li><a href="/wiki/

In [29]:
root = ET.fromstring(lplbt.html())
root

<Element 'div' at 0x7f8131e46270>

In [121]:
current_category = None
languages_categories = {}
all_categories = set()
for el in root.iter('*'):
    if el.attrib.get('class', None) == 'mw-headline':
        current_category = ''.join(el.itertext()).strip()
    if current_category not in (None, 'See also'):
        for li in el.iter('li'):
            lang = ''.join(li.itertext())
            if '\n' in lang: continue
            lang = re.sub(r'(?:\(|–|—|-|and some dialects|,).+', '', lang).strip()
            if lang == '': continue
            if lang not in languages_categories:
                languages_categories[lang] = set()
            languages_categories[lang].add(current_category)
            all_categories.add(current_category)
all_categories

{'Array languages',
 'Authoring languages',
 'Client side',
 'Command line interface languages',
 'Compiled languages',
 'Concurrent languages',
 'Constraint programming languages',
 'Curly-bracket languages',
 'Data-oriented languages',
 'Dataflow languages',
 'Decision table languages',
 'Declarative languages',
 'Educational languages',
 'Esoteric languages',
 'Extension languages',
 'Fourth-generation languages',
 'Garbage collected languages',
 'HDLs for analog circuit design',
 'HDLs for digital circuit design',
 'Imperative languages',
 'Impure',
 'In object code',
 'Interactive mode languages',
 'Interpreted languages',
 'Iterative languages',
 'Languages with automatic reference counting (ARC)',
 'Languages with deterministic memory management',
 'Languages with manual memory management',
 'List-based languages – LISPs',
 'Little languages',
 'Logic-based languages',
 'Machine languages',
 'Metaprogramming languages',
 'Multiparadigm languages',
 'Multiple dispatch',
 'Non-Eng

In [125]:
lang_cats = {}
for lang, lang_cat in languages_categories.items():
    lang_cats[lang.lower()] = {c: c in lang_cat for c in all_categories}

df_lang_cats = pd.DataFrame(lang_cats).T
df_lang_cats

Unnamed: 0,Metaprogramming languages,Impure,Machine languages,Languages with manual memory management,Curly-bracket languages,Multiple dispatch,Reflective Language,Non-English-based languages,Interpreted languages,Offline rendering,...,Visual languages,Multiparadigm languages,Synchronous languages,Constraint programming languages,HDLs for analog circuit design,Client side,Dataflow languages,Fourth-generation languages,Iterative languages,Imperative languages
a+,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
analytica,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
apl,False,True,False,False,False,False,False,False,True,False,...,False,True,False,False,False,False,False,False,False,False
chapel,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
fortran 90,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
lzx,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
xaml,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
xpath,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
xquery,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [136]:
df_languages = pd.DataFrame(count_languages.rename('count')).join(df_lang_cats, how='inner')
df_languages

Unnamed: 0,count,Metaprogramming languages,Impure,Machine languages,Languages with manual memory management,Curly-bracket languages,Multiple dispatch,Reflective Language,Non-English-based languages,Interpreted languages,...,Visual languages,Multiparadigm languages,Synchronous languages,Constraint programming languages,HDLs for analog circuit design,Client side,Dataflow languages,Fourth-generation languages,Iterative languages,Imperative languages
javascript,2339,False,True,False,False,True,False,True,False,True,...,False,True,False,False,False,True,False,False,False,True
sql,2084,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
java,1393,False,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
python,1386,True,True,False,False,False,False,True,False,True,...,False,True,False,False,False,False,False,False,True,True
php,949,False,True,False,False,True,False,True,False,True,...,False,True,False,False,False,False,False,False,True,True
c#,751,False,True,False,False,True,False,True,False,False,...,False,True,False,False,False,False,False,False,True,True
typescript,626,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
c++,306,True,True,False,True,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
c,280,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
go,267,False,False,False,False,True,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True


In [146]:
df_languages = df_languages.drop(df_languages.sum()[df_languages.sum() <= 1].keys(), axis=1)
df_languages

Unnamed: 0,count,Metaprogramming languages,Impure,Languages with manual memory management,Curly-bracket languages,Reflective Language,Interpreted languages,Concurrent languages,Extension languages,Compiled languages,...,In object code,Single dispatch,Numerical analysis,Object-oriented prototype-based languages,Interactive mode languages,Multiparadigm languages,Client side,Fourth-generation languages,Iterative languages,Imperative languages
javascript,2339,False,True,False,True,True,True,False,True,False,...,False,False,False,True,True,True,True,False,False,True
sql,2084,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
java,1393,False,True,False,False,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,True
python,1386,True,True,False,False,True,True,False,True,True,...,True,True,False,False,True,True,False,False,True,True
php,949,False,True,False,True,True,True,False,False,False,...,False,True,False,False,True,True,False,False,True,True
c#,751,False,True,False,True,True,False,False,False,True,...,False,True,False,False,False,True,False,False,True,True
typescript,626,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
c++,306,True,True,True,True,False,False,False,False,True,...,False,True,False,False,False,True,False,False,False,True
c,280,False,False,True,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
go,267,False,False,False,True,False,False,True,False,True,...,False,False,False,False,False,True,False,False,False,True
