In [1]:
# encoding: utf-8 
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.mlab as mlab
from scipy import stats
from datetime import datetime

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import types as T
from pyspark.sql import functions as F

# create SparkContext and SparkSession to process files
sc = SparkContext('local[2]', 'example')
spark = SparkSession(sc)

In [2]:
%matplotlib notebook

In [3]:
caminho = '../data/antigo/csv/'

In [4]:
pessoal = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_pessoal.csv')
pessoal.createOrReplaceTempView("tab_pessoal")

In [5]:
pessoal.toPandas()

Unnamed: 0,Address,DateOfBirth,DriversLicence,Emails,FirstName,GenderCode,GenderCodeDescription,HomePhones,LastName,MaritalStatusCode,MaritalStatusCodeDescription,MobilePhones,NationalityCode,NationalityCodeDescription,SocialMedia,Title,id
0,,1975-11-06,,,,0,Not known,,,,,,,,,,cv12714.xml
1,,,,,Joel,1,Male,,Ferreira Amorim,,,,BR,Brazilian,,,cv24026.xml
2,,,,,,0,Not known,,,,,,,,,,cv32557.xml
3,,,,,Renan,1,Male,,Mesquita Nunes,,,,BR,Brazilian,,,cv11045.xml
4,,,,,Fabiano,1,Male,,Vieira,,,,,,,,cv2166.xml
5,,,,,,0,Not known,,,,,,,,,,cv3875.xml
6,,,B,,Francisco,1,Male,,T. Silva,,,,,,,,cv20771.xml
7,,,,,Gabriel,1,Male,,Silva,,,,,,,,cv17433.xml
8,,,,,Fabio,1,Male,,Dias Da Silva,,,,BR,Brazilian,,,cv19518.xml
9,,1994-06-12,,,Miria,2,Female,,Pereira Da Silva,,,,,,,,cv17629.xml


In [6]:
pessoal.describe().toPandas()

Unnamed: 0,summary,Address,DateOfBirth,DriversLicence,Emails,FirstName,GenderCode,GenderCodeDescription,HomePhones,LastName,MaritalStatusCode,MaritalStatusCodeDescription,MobilePhones,NationalityCode,NationalityCodeDescription,SocialMedia,Title,id
0,count,0.0,4698,1379,0.0,17075,20130.0,20130,0.0,16989,7.0,7,0.0,6411,6411,0.0,37,20130
1,mean,,,8.277691727142857E8,,,0.8810730253353204,,,,2.2857142857142856,,,,,,,
2,stddev,,,2.1900713708809633E9,,,0.7006412406342762,,,,1.2535663410560172,,,,,,,
3,min,,1944-04-21,"""""",,Abbas,0.0,Female,,A. Da Costa Filho Roseli Santos Miranda,1.0,Divorced,,AO,American,,JR,cv10001.xml
4,max,,2000-03-10,|,,文件夹内有大量真实数据,2.0,Not known,,随着,5.0,Unmarried,,YU,Yemeni,,Sr.,cv9996.xml


In [15]:
# pizza gendercode
por_genero = spark.sql("""
    select genero
         , count(*) as quantidade
    from (
            select case 
                      when gendercodedescription = 'Male' then 'Masculino'
                      when gendercodedescription = 'Female' then 'Feminino'
                      else 'Não Informado'
                   end as genero
            from tab_pessoal 
         )
    group by genero
    order by genero
""").toPandas()

rotulos = por_genero['genero'].tolist()
quantidades = por_genero['quantidade'].tolist()
explode = * len(quantidades)  

fig, ax = plt.subplots()
ax.pie(quantidades, explode=explode, labels=rotulos, autopct='%1.1f%%',
        shadow=True, startangle=0, wedgeprops={'linewidth': 1}, colors=['lightsalmon', 'skyblue', 'gray'])
ax.set_title(u'Distribuição por Gênero')
ax.axis('equal')

plt.show()

<IPython.core.display.Javascript object>

In [16]:
# histograma dateofbirth
anos = spark.sql("""
    select datediff('2018-06-07', cast(dateofbirth as date)) / float(365.25) as data
    from tab_pessoal
    where dateofbirth is not null
""").toPandas()['data'].tolist()

fig, ax = plt.subplots(figsize=(9, 3))

ax.hist(anos, 20, histtype='stepfilled', facecolor='b', alpha=0.75)
ax.set_title(u'Distribuição de Idade')
ax.set_xlabel(u'Idade')
ax.set_ylabel(u'Quantidade de Pessoas')

fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [17]:
computacao = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_computacao.csv')
computacao.createOrReplaceTempView("tab_computacao")

In [18]:
computacao.toPandas()

Unnamed: 0,ComputerSkillDuration,ComputerSkillName,id
0,,Estratégico,cv12714.xml
1,,PMBOK,cv12714.xml
2,,BPM,cv12714.xml
3,7,RUP,cv12714.xml
4,0,Visio,cv12714.xml
5,,MS-Project,cv12714.xml
6,,MS-Office,cv12714.xml
7,1,ITIL,cv12714.xml
8,,SVN,cv12714.xml
9,,SCRUM,cv12714.xml


In [19]:
computacao.describe().toPandas()

Unnamed: 0,summary,ComputerSkillDuration,ComputerSkillName,id
0,count,108972.0,309695,311983
1,mean,2.922833388393349,4.904761904761905,
2,stddev,3.1477871276563065,7.306878690006848,
3,min,0.0,! DB2,cv10001.xml
4,max,9.0,韀,cv9996.xml


In [20]:
# quantidade de pessoas com conhecimento
spark.sql("""
    select count(*) from (select distinct id from tab_computacao)    
""").collect()[0][0] * 100 / float(997)

2019.0571715145436

In [21]:
# media de conhecimentos de computacao por pessoa
spark.sql("""
    select round(avg(quantidade)) as media 
    from (select id
               , count(*) as quantidade
          from (select id
                     , computerskillname 
                from tab_computacao 
                where computerskillname is not null)
          group by id)
""").collect()[0][0]

17.0

In [22]:
# media de duracao da habilidade de computacao
spark.sql("""
    select round(avg(media)) as tempo
    from (select id
               , round(avg(computerskillduration)) as media
          from tab_computacao 
          where computerskillduration > 0
          group by id)
""").collect()[0][0]

3.0

In [23]:
educacao = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_educacao.csv')
educacao.createOrReplaceTempView("tab_educacao")

In [24]:
educacao.toPandas()

Unnamed: 0,DegreeDirection,DiplomaCode,DiplomaCodeDescription,EducationLevelCode,EducationLevelCodeDescription,EndDate,GradePointAverage,InstituteName,StartDate,id
0,MBA em Gestão de Sistemas de Informação,4,Unknown,4,Bachelor,2004-06-30,,Universidade Católica de Brasília - UCB,,cv12714.xml
1,Administração de Empresas,4,Unknown,7,Course,2000-06-30,,Universidade de Brasília - UnB,,cv12714.xml
2,Processamento de Dados,4,Unknown,7,Course,1996-06-30,,União Educacional de Brasília - UNEB,,cv12714.xml
3,Gerência de Processos de Negócio - BPM (2011BR...,4,Unknown,7,Course,2011-12-31,,CBPP; 174658,,cv12714.xml
4,Implantando e Operando Escritórios de Projeto,4,Unknown,7,Course,2015-11-30,,PMO),,cv12714.xml
5,Contratação de bens e serviços de TI (IN04 e J...,4,Unknown,7,Course,2015-10-31,,Brasília,,cv12714.xml
6,Monitoramento e Avaliação de Indicadores de De...,4,Unknown,7,Course,2015-05-31,,Brasília,,cv12714.xml
7,CPRE; FL Certified Professional for Requiremen...,4,Unknown,7,Course,,,Brasília,,cv12714.xml
8,Workshop Project Model Canvas,4,Unknown,,,2014-06-30,,Brasília,,cv12714.xml
9,13º Encontro de Gerenciamento de Projetos,4,Unknown,,,2013-06-30,,DF,,cv12714.xml


In [25]:
educacao.describe().toPandas()

Unnamed: 0,summary,DegreeDirection,DiplomaCode,DiplomaCodeDescription,EducationLevelCode,EducationLevelCodeDescription,EndDate,GradePointAverage,InstituteName,StartDate,id
0,count,102026,104067.0,104067,103613.0,103613,45049,0.0,68784,9821,107087
1,mean,6.929487357142857E7,3.946361478662785,,6.281287097178925,,,,7.24079250885802E8,,
2,stddev,2.592728047916504E8,0.3973732019997776,,1.389055864733714,,,,8.190461157966871E9,,
3,min,! Lógica de Programação,1.0,Unknown,1.0,Associate Degree,1931-02-28,,!,1930-03-31,cv10001.xml
4,max,; ­ Delphi 7.0; ; ­ Linux Avançado; ; ­ Java B...,4.0,no,8.0,Vocational Education,actualidad,,; ­ ARIS ­DBA­; IBM / Rational (SP­; DBA ­; Pe...,2019-09-01,cv9996.xml


In [26]:
por_nivel = spark.sql("""
    select nivel, count(*) as quantidade from (
    select case 
              when lcase(educationlevelcodedescription) = 'bachelor' then 'curso superior'
              when lcase(educationlevelcodedescription) = 'vocational education' then 'outros'
              when lcase(educationlevelcodedescription) = 'secondary education' then 'ensino médio'
              when lcase(educationlevelcodedescription) = 'master' then 'pós-graduação'
              when lcase(educationlevelcodedescription) = 'university' then 'curso superior'
              when lcase(educationlevelcodedescription) = 'post-master' then 'pós-graduação'
              when lcase(educationlevelcodedescription) = 'course' then 'curso'
              when lcase(educationlevelcodedescription) is null then 'não informado'
           end as nivel
    from tab_educacao 
    -- where educationlevelcodedescription is not null
    )
    group by nivel
    order by nivel
""").toPandas()
por_nivel

Unnamed: 0,nivel,quantidade
0,,1
1,curso,79105
2,curso superior,15708
3,ensino médio,2130
4,não informado,3474
5,outros,358
6,pós-graduação,6311


In [28]:
# pizza educationlevelcodedescription
rotulos = por_nivel['nivel'].tolist()
quantidades = por_nivel['quantidade'].tolist()
explode = [.1] * len(por_nivel)  

fig, ax = plt.subplots()
ax.pie(quantidades, explode=explode, labels=rotulos, autopct='%1.1f%%',
        shadow=True, startangle=45, wedgeprops={'linewidth': 3}, colors=['orange', 'gold', 'springgreen',  'chocolate', 'grey','royalblue'])
ax.set_title(u'Níveis Educacionais Cadastrados')
ax.axis('equal')

plt.show()

<IPython.core.display.Javascript object>

In [30]:
emprego = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_emprego.csv')
emprego.createOrReplaceTempView("tab_emprego")

In [31]:
emprego.toPandas()

Unnamed: 0,Description,EmployerName,EndDate,ExperienceYears,JobTitle,StartDate,id
0,Principais Atividades Desenvolvidas:,,,,,,
1,"Gerenciamento, acompanhamento e controle do si...",,,,,,
2,FNDE e alinhamento do projeto aos objetivos es...,,,,,,
3,acompanhamento do cronograma de desenvolviment...,,,,,,
4,"fábrica de SW.""",Empresa FNDE - Fundo Nacional de Desenvolvimen...,actualidad,2,Sistema de Prestação de Contas do FNDE,2014-12-01,cv12714.xml
5,Principais Atividades Desenvolvidas:,,,,,,
6,Acompanhamento das atividades de documentação ...,,,,,,
7,tramitação e controle de atividades demandadas...,,,,,,
8,por Ordem de Serviço - OS e acompanhamento de ...,,,,,,
9,fábricas de SW. Aferição de produtos e serviço...,,,,,,


In [32]:
emprego.describe().toPandas()

Unnamed: 0,summary,Description,EmployerName,EndDate,ExperienceYears,JobTitle,StartDate,id
0,count,490008,93999,70645,86009,82489,85748,99645
1,mean,149694.08035876552,1720.142857142857,,2.096325257743955,2.411764705882353,1.2222222222222223,1.0
2,stddev,1626010.7439363052,3728.1899284850206,,2.422031129916397,3.0012252399939046,0.9718253158075502,0.0
3,min,,,,ASP,Banco de Crédito de São Paulo (Infra Unix),BPEL,Atendimento
4,max,"· Interface com as áreas, garantindo a qualida...",૩ RHCSA ­ Red Hat Certified System Administrator,|K2 Partnering Solutions|2016-01-31|0|Trainee ...,actualidad,૩ Administração da Segurança de Redes de Compu...,actualidad,cv9996.xml


In [33]:
# quantidade de startdate com enddate
spark.sql("""
    select count(*)
    from tab_emprego
    where startdate is not null
      and enddate is not null
""").collect()[0][0] * 100 / float(emprego.count())

13.765904851750438

In [34]:
# media de mudancas de emprego - PARECE QUE TEM PROBLEMA AQUI
spark.sql("""
    select round(avg(quantidade)) as media
         , min(quantidade) as minimo
         , max(quantidade) as maximo
    from (
            select id
                 , count(*) as quantidade
            from (
                    select id
                         , employername
                         , count(*) as quantidade
                    from tab_emprego
                    group by id
                           , employername
                 )
            group by id
         )
""").toPandas()  #.collect()[0][0]

Unnamed: 0,media,minimo,maximo
0,4.0,1,135


In [35]:
# media de promocoes - PARECE QUE TEM PROBLEMA AQUI
spark.sql("""
            select round(avg(quantidade_cargos - 1)) as media
                 , min(quantidade_cargos - 1) as minimo
                 , max(quantidade_cargos - 1) as maximo
            from (
                    select id
                         , employername
                         , count(*) as quantidade_cargos
                    from tab_emprego
                    group by id
                           , employername
                 )
            where quantidade_cargos > 1
            
""").toPandas()

Unnamed: 0,media,minimo,maximo
0,51.0,1,411127


In [36]:
# media dos anos de experiencia
spark.sql("""
    select round(avg(anos)) from (
    select id, avg(experienceyears) as anos
    from tab_emprego
    group by id)
""").collect()[0][0]

2.0

In [38]:
idioma = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_idioma.csv')
idioma.createOrReplaceTempView("tab_idioma")

In [39]:
idioma.toPandas()

Unnamed: 0,LanguageProficiencyCode,LanguageProficiencyCodeDescription,LanguageSkillCode,LanguageSkillCodeDescription,id
0,4,Advanced,BH,Bihari,cv12714.xml
1,3,Intermediate,EU,Basque,cv12714.xml
2,3,Intermediate,EN,English,cv12714.xml
3,,,,,cv24026.xml
4,,,,,cv32557.xml
5,,,EN,English,cv11045.xml
6,,,FR,French,cv11045.xml
7,3,Intermediate,EN,English,cv2166.xml
8,,,ES,Spanish,cv2166.xml
9,2,Basic Knowledge,ES,Spanish,cv3875.xml


In [40]:
idioma.describe().toPandas()

Unnamed: 0,summary,LanguageProficiencyCode,LanguageProficiencyCodeDescription,LanguageSkillCode,LanguageSkillCodeDescription,id
0,count,11535.0,11535,15016,15015,25267
1,mean,3.41309059384482,,,,
2,stddev,1.228830567623791,,,,
3,min,1.0,Advanced,AA,Afar,cv10001.xml
4,max,6.0,No,ZH,japanese,cv9996.xml


In [41]:
# quantidade de pessoas
spark.sql("""
    select count(*) from (
    select distinct id
    from tab_idioma
    where languageskillcode is not null)
""").collect()[0][0] * 100 / float(997)

998.4954864593782

In [43]:
por_idioma = spark.sql("""
    select idioma
         , count(*) as quantidade 
    from (   select case 
                       when lcase(languageskillcodedescription) = 'english' then 'inglês'
                       when lcase(languageskillcodedescription) = 'spanish' then 'espanhol' 
                       when lcase(languageskillcodedescription) = 'portuguese' then 'português' 
                       when lcase(languageskillcodedescription) is null then 'não informado'
                       else 'outros'
                    end as idioma
             from tab_idioma
             -- where languageskillcodedescription is not null
             --  and lcase(languageskillcodedescription) != 'portuguese'    
               )
    group by idioma
""").toPandas()
por_idioma

Unnamed: 0,idioma,quantidade
0,português,1507
1,inglês,9498
2,espanhol,3071
3,não informado,10252
4,outros,939


In [44]:
# pizza idiomas
rotulos = por_idioma['idioma'].tolist()
quantidades = por_idioma['quantidade'].tolist()
explode = [.1] * len(por_idioma)  

fig, ax = plt.subplots()
ax.pie(quantidades, explode=explode, labels=rotulos, autopct='%1.1f%%',
        shadow=True, startangle=0, wedgeprops={'linewidth': 3}, colors=['springgreen', 'royalblue', 'gold', 'grey', 'chocolate'])
ax.set_title(u'Distribuição por Idioma')
ax.axis('equal')

plt.show()

<IPython.core.display.Javascript object>

In [47]:
# proficiencia ingles
ingles = spark.sql("""
    select proficiencia
         , count(*) as quantidade
    from (
            select case 
                      when lcase(languageproficiencycodedescription) = 'basic knowledge' then 'básico'
                      when lcase(languageproficiencycodedescription) = 'intermediate' then 'intermediário'
                      when lcase(languageproficiencycodedescription) = 'advanced' then 'avançado'
                      when lcase(languageproficiencycodedescription) = 'near native' then 'nativo'
                      when lcase(languageproficiencycodedescription) is null then 'não informado'
                   end as proficiencia
            from tab_idioma
            where lcase(languageskillcodedescription) = 'english'
              --and languageproficiencycodedescription is not null
          )
    group by proficiencia
""").toPandas()
ingles

Unnamed: 0,proficiencia,quantidade
0,intermediário,3239
1,avançado,1895
2,,39
3,básico,1465
4,nativo,549
5,não informado,2311


In [49]:
# pizza proficiencia em ingles
rotulos = ingles['proficiencia'].tolist()
quantidades = ingles['quantidade'].tolist()
explode = [.1] * len(ingles)  

fig, ax = plt.subplots()
ax.pie(quantidades, explode=explode, labels=rotulos, autopct='%1.1f%%',
        shadow=True, startangle=45, wedgeprops={'linewidth': 3}, colors=['lightsalmon', 'springgreen', 'gold', 'chocolate', 'royalblue', 'grey'])
ax.set_title(u'Proficiência em Inglês')
ax.axis('equal')

plt.show()

<IPython.core.display.Javascript object>

In [50]:
outros = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_outros.csv')
outros.createOrReplaceTempView("tab_outros")

In [51]:
outros.toPandas()

Unnamed: 0,AvailabilityCode,AvailabilityCodeDescription,CVComment,CVTitle,CandidatePermission,CandidateStatusCode,CandidateStatusCodeDescription,CurrentEmployer,CurrentJob,ExternalID,ExtraInfo,HighestEducationLevelCode,HighestEducationLevelCodeDescription,Last3Experiences,ProfilePicture,SalaryCode,SalaryCodeDescription,TotalExperienceYears,id
0,,,,,,,,Empresa FNDE - Fundo Nacional de Desenvolvimen...,Sistema de Prestação de Contas do FNDE,,,4,Bachelor,"Gerente de Operações, Gerente de Projetos, Ger...",,,,14,cv12714.xml
1,,,,,,,,Secretaria da Educação,Professor de Informática,,,4,Bachelor,"Técnico de Teste de Software, Instrutor",,,,4,cv24026.xml
2,,,,,,,,Experience,Consultor,,,,,"Consultor, Consultor, Gerente de Projetos",,,,16,cv32557.xml
3,,,,,,,,BANCORBRÁS PARTICIPAÇÕES E EMPRENDIMENTOS S.A,Estagiário,,,,,"Diretor de Marketing, Menor Aprendiz",,,,4,cv11045.xml
4,,,,,,,,POLITEC Ltda,Analista de Sistemas,,,4,Bachelor,"Analista de Sistemas Sênior, Analista de Sistemas",,,,9,cv2166.xml
5,,,,,,,,RSI Informática entrou,Líder de Projetos,,,5,Master,"Analista de Sistemas Sênior, Gerente de Config...",,,,8,cv3875.xml
6,,,,,,,,ALPHA BSB IMÓVEIS LTDA,CHEFE DE TESOURARIA,,,4,Bachelor,"Enc. Administrativo e Financeiro/Faturamento, ...",,,,10,cv20771.xml
7,,,,,,,,Indra Company do Brasil,Gerente de Projetos,,,4,Bachelor,"Gerente de Projetos, Diretor Técnico, Analista...",,,,10,cv17433.xml
8,,,,,0,,,UOLDIVEO Datacenter S/A,Gerente de Projetos,,,5,Master,"Analista de Projetos, Estagiário",,,,16,cv19518.xml
9,,,,,0,,,CTIS-Tecnologia S.A,TELEOPERADORA_PROJETO CAESB,,,4,Bachelor,"ÁREA ADMINISTRATIVA EM ENSINO MÉDIO, TELEOPERA...",,,,3,cv17629.xml


In [52]:
outros.describe().toPandas()

Unnamed: 0,summary,AvailabilityCode,AvailabilityCodeDescription,CVComment,CVTitle,CandidatePermission,CandidateStatusCode,CandidateStatusCodeDescription,CurrentEmployer,CurrentJob,ExternalID,ExtraInfo,HighestEducationLevelCode,HighestEducationLevelCodeDescription,Last3Experiences,ProfilePicture,SalaryCode,SalaryCodeDescription,TotalExperienceYears,id
0,count,0.0,0.0,0.0,0.0,7772.0,0.0,0.0,17494,18140,0.0,0.0,15283.0,15283,16620,0.0,0.0,0.0,17233.0,20130
1,mean,,,,,0.0,,,1.0,7.0,,,3.996074069227246,,,,,,8.498056055242849,
2,stddev,,,,,0.0,,,,,,,1.0061896012612508,,,,,,6.22242377359979,
3,min,,,,,0.0,,,"""Caixa Seguradora"""" da Fábrica de Software da ...","""Analista de Sistemas Sênior """"PJ""""""",,,1.0,Associate Degree,"""Administrador do Campus I e II """"Prefeito"""", ...",,,,0.0,cv10001.xml
4,max,,,,,0.0,,,"área de Automação, OSX Telecomunicações (Visen...",área administrativa,,,7.0,Vocational Education,"é Especialista Técnico em Testes, Líder Técnic...",,,,9.0,cv9996.xml


In [53]:
# ultimo grau de instrucao alcancado
instrucao = spark.sql("""
    select instrucao, count(*) as quantidade
    from (
            select case 
                      when lcase(highesteducationlevelcodedescription) = 'bachelor' then 'curso superior'
                      when lcase(highesteducationlevelcodedescription) = 'vocational education' then 'outros'
                      when lcase(highesteducationlevelcodedescription) = 'secondary education' then 'ensino médio'
                      when lcase(highesteducationlevelcodedescription) = 'master' then 'pós-graduação'
                      when lcase(highesteducationlevelcodedescription) = 'university' then 'curso superior'
                      when lcase(highesteducationlevelcodedescription) = 'post-master' then 'pós-graduação'
                      when lcase(highesteducationlevelcodedescription) = 'course' then 'curso'
                      when lcase(highesteducationlevelcodedescription) is null then 'não informado'
                   end as instrucao
            from tab_outros
           -- where highesteducationlevelcodedescription is not null
    )
    group by instrucao
    order by instrucao
""").toPandas()
instrucao

Unnamed: 0,instrucao,quantidade
0,,1
1,curso superior,9996
2,ensino médio,1181
3,não informado,4847
4,outros,193
5,pós-graduação,3912


In [55]:
# pizza highesteducationlevelcodedescription
rotulos = instrucao['instrucao'].tolist()
quantidades = instrucao['quantidade'].tolist()
explode = [.1] * len(instrucao)  

fig, ax = plt.subplots()
ax.pie(quantidades, explode=explode, labels=rotulos, autopct='%1.1f%%',
        shadow=True, startangle=-45, wedgeprops={'linewidth': 3}, colors=['gold', 'springgreen', 'chocolate', 'grey',  'royalblue'])
ax.set_title(u'Grau de Instrução Atual')
ax.axis('equal')

plt.show()

<IPython.core.display.Javascript object>

In [56]:
anos = spark.sql("""
    select cast(totalexperienceyears as integer) as anos
    from tab_outros
    where totalexperienceyears is not null
      and cast(totalexperienceyears as integer) < 50
""").toPandas()['anos'].tolist()

In [57]:
# histograma totalexperienceyears
fig, ax = plt.subplots(figsize=(9, 3))

ax.hist(anos, 20, histtype='stepfilled', facecolor='b', alpha=0.75)
ax.set_title(u'Distribuição de Tempo de Experiência')
ax.set_xlabel(u'Tempo de Experiência (anos)')
ax.set_ylabel(u'Quantidade de Pessoas')

fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [58]:
soft = spark.read.format('csv') \
    .option('sep', '|') \
    .option('header', 'True') \
    .load(caminho + '02_soft.csv')
soft.createOrReplaceTempView("tab_soft")

In [59]:
soft.toPandas()

Unnamed: 0,SoftSkillName,id
0,habilidade na liderança de equipes,cv12714.xml
1,multidisciplinares para atuação em projetos de...,cv12714.xml
2,experiência em gerenciamento,cv12714.xml
3,levantamento de requisitos,cv12714.xml
4,Facilidade com análise,cv12714.xml
5,organizada,cv12714.xml
6,disciplinada,cv12714.xml
7,proativa,cv12714.xml
8,Gosto de trabalhar em equipe,cv12714.xml
9,Habilidades no relacionamento interpessoal,cv24026.xml


In [60]:
soft.describe().toPandas()

Unnamed: 0,summary,SoftSkillName,id
0,count,17543,33121
1,mean,,
2,stddev,,
3,min,*,cv10001.xml
4,max,; Trabalho em equipe,cv9996.xml


In [61]:
# quantidade de softskillname distintas
spark.sql("""
    select count(*) 
    from (
            select distinct softskillname
            from tab_soft
         )
""").collect()[0][0]

6337