# Treinamento PySpark

# Primeiro Script - RDD Operações

In [0]:
# Print the version of the Python interpreter running this notebook
import sys

print(sys.version)

3.9.5 (default, Nov 23 2021, 15:27:38) 
[GCC 9.3.0]


In [0]:
# Print the version reported by the Spark Context
# NOTE(review): `sc` is not defined in this file; presumably it is injected by
# the notebook runtime - confirm before running elsewhere.
print(sc.version)

3.3.2


In [0]:
# Create an RDD with 6 elements by turning a plain Python list into a parallelized RDD
paises = ["Brasil", "Italia", "Russia", "Noruega", "Espanha", "Mexico"]
dadosrdd = spark.sparkContext.parallelize(paises)

In [0]:
# Display the whole collection, i.e. every country stored in the RDD, as a Python list
dadosrdd.collect()

Out[4]: ['Brasil', 'Italia', 'Russia', 'Noruega', 'Espanha', 'Mexico']

In [0]:
# Count how many elements the countries RDD holds
dadosrdd.count()

Out[5]: 6

In [0]:
# Report how many partitions the RDD gets when no partition count is requested
dadosrdd2 = spark.sparkContext.parallelize(paises)
total_particoes = dadosrdd2.getNumPartitions()
print(f"Partições: {total_particoes}")

Partições: 8


In [0]:
# Force the RDD to be split into exactly 5 partitions and report the result
dadosrdd3 = spark.sparkContext.parallelize(paises, numSlices=5)
total_particoes_paralelas = dadosrdd3.getNumPartitions()
print(f"Partições Paralelas: {total_particoes_paralelas}")

Partições Paralelas: 5


In [0]:
# Load an external file into an RDD (one string element per line of the file)
food = spark.sparkContext.textFile("/FileStore/food_coded.csv")

In [0]:
# Show the first 5 elements of the RDD; note that the header row is among them
food.take(5)

Out[9]: ['GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,cook,comfort_food_reasons_coded,cuisine,diet_current,diet_current_coded,drink,eating_changes,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,father_profession,fav_cuisine,fav_cuisine_coded,fav_food,food_childhood,fries,fruit_day,grade_level,greek_food,healthy_feeling,healthy_meal,ideal_diet,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,meals_dinner_friend,mother_education,mother_profession,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight',
 '2.4,2,1,430,nan,315,1,none,we dont have comfort ,9,2,9,nan,eat good and exercise,1,1,eat faster ,1,1,3,3,1,1,5,profesor ,Arabic cuisine,3,1,rice  and chicken ,2,5,2,

In [0]:
# Build an RDD from the same .csv file in a different way: through the Spark Context (sc) directly
# The file holds food-related data
food2 = sc.textFile("/FileStore/food_coded.csv")

In [0]:
# Show the first 5 elements of the RDD that was loaded through the Spark Context
food2.take(5)

Out[11]: ['GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,cook,comfort_food_reasons_coded,cuisine,diet_current,diet_current_coded,drink,eating_changes,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,father_profession,fav_cuisine,fav_cuisine_coded,fav_food,food_childhood,fries,fruit_day,grade_level,greek_food,healthy_feeling,healthy_meal,ideal_diet,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,meals_dinner_friend,mother_education,mother_profession,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight',
 '2.4,2,1,430,nan,315,1,none,we dont have comfort ,9,2,9,nan,eat good and exercise,1,1,eat faster ,1,1,3,3,1,1,5,profesor ,Arabic cuisine,3,1,rice  and chicken ,2,5,2

In [0]:
# Export the first 5 lines of the food RDD to storage as semicolon-delimited text.
import csv
import io

# take(5) returns a plain Python list of strings, one per line of the source file
# (header included).
dados = food.take(5)


def _linha_para_ponto_e_virgula(linha):
    """Re-encode one comma-separated line as a semicolon-separated line.

    The line is parsed with the csv module so that commas inside quoted
    fields are kept as data instead of being treated as separators.
    """
    # An empty line yields no row from the reader; fall back to an empty row.
    campos = next(csv.reader([linha]), [])
    saida = io.StringIO()
    # No line terminator: each RDD element must be a single line of text.
    csv.writer(saida, delimiter=";", lineterminator="").writerow(campos)
    return saida.getvalue()


# Each element of `dados` is already a string, so joining it directly would
# iterate over its characters ("G;P;A;,;..."). Convert field by field instead.
dados_str = [_linha_para_ponto_e_virgula(linha) for linha in dados]

# Build an RDD from the list of converted lines
rdd_dados = sc.parallelize(dados_str)

# Destination path for the exported data
caminho_arquivo = "/FileStore/5_primeiros.csv"

# Save the RDD as text.
# NOTE(review): per the Spark docs, saveAsTextFile writes a directory of part
# files at this path and fails if the path already exists - confirm that
# re-running this cell is not expected to overwrite.
rdd_dados.saveAsTextFile(caminho_arquivo)

In [0]:
# Show the first element of the RDD (the header row of the file)
food.first()

Out[13]: 'GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,cook,comfort_food_reasons_coded,cuisine,diet_current,diet_current_coded,drink,eating_changes,eating_changes_coded,eating_changes_coded1,eating_out,employment,ethnic_food,exercise,father_education,father_profession,fav_cuisine,fav_cuisine_coded,fav_food,food_childhood,fries,fruit_day,grade_level,greek_food,healthy_feeling,healthy_meal,ideal_diet,ideal_diet_coded,income,indian_food,italian_food,life_rewarding,marital_status,meals_dinner_friend,mother_education,mother_profession,nutritional_check,on_off_campus,parents_cook,pay_meal_out,persian_food,self_perception_weight,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight'

In [0]:
# Count the elements and subtract 1 to leave the header row out
# NOTE(review): this counts RDD elements (text lines); it equals the number of
# records only if no quoted CSV field contains a line break - confirm.
food.count() -1

Out[14]: 138

In [0]:
# Search for a word in the data set: keep every line that contains the
# substring 'rice' (case-sensitive, matched anywhere in the line)
filtro_palavra = food.filter(lambda x: 'rice' in x)
filtro_palavra.collect()

Out[15]: ['2.4,2,1,430,nan,315,1,none,we dont have comfort ,9,2,9,nan,eat good and exercise,1,1,eat faster ,1,1,3,3,1,1,5,profesor ,Arabic cuisine,3,1,rice  and chicken ,2,5,2,5,2,looks not oily ,being healthy ,8,5,5,5,1,1,"rice, chicken,  soup",1,unemployed,5,1,1,2,5,3,1,1,1,1165,345,car racing,5,1,1315,187',
 '3.3,1,1,720,4,420,2,"frozen yogurt, pizza, fast food","stress, sadness",1,1,1,3,"toast and fruit for breakfast, salad for lunch, usually grilled chicken and veggies (or some variation) for dinner",3,1,sometimes choosing to eat fast food instead of cooking simply for convenience,1,3,2,3,5,2,2,owns business,italian,1,3,"mac and cheese, pizza, tacos",1,5,3,5,6,usually includes natural ingredients; nonprocessed food,i would say my ideal diet is my current diet,6,6,5,5,7,2,"chicken and rice with veggies, pasta, some kind of healthy recipe",2,owns business,4,2,1,3,5,6,1,2,5,1165,500,none,5,1,900,I\'m not answering this. ',
 '3.5,1,1,720,2,420,2,"Ice cream, chocolate, chips ","Stress,

In [0]:
# Persist the data in memory to make later work faster; with a lot of data this speeds up searches
# NOTE(review): per the Spark docs cache() is lazy - the RDD is only stored
# once an action is run on it afterwards.
food.cache()

Out[16]: /FileStore/food_coded.csv MapPartitionsRDD[5] at textFile at NativeMethodAccessorImpl.java:0