## Analisando os dados do arquivo "Person.Person.csv"

In [1]:
#importando as bibliotecas necessarias

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
#Enviar os dados para o HDFS(executado no terminal do Linux)
!hdfs dfs -mkdir /user/ronnan/data/data_project_rox
!hdfs dfs -put input/data_project_rox/*.csv /user/ronnan/data/data_project_rox

mkdir: `/user/ronnan/data/data_project_rox': File exists
put: `input/data_project_rox/*.csv': No such file or directory


In [3]:
#Verificando se os dados já estão no HDFS
!hdfs dfs -ls /user/ronnan/data/data_project_rox

Found 6 items
-rw-r--r--   3 root supergroup   13646947 2022-06-19 22:17 /user/ronnan/data/data_project_rox/Person.Person.csv
-rw-r--r--   3 root supergroup     104823 2022-06-19 22:17 /user/ronnan/data/data_project_rox/Production.Product.csv
-rw-r--r--   3 root supergroup    1813963 2022-06-19 22:17 /user/ronnan/data/data_project_rox/Sales.Customer.csv
-rw-r--r--   3 root supergroup   13801182 2022-06-19 22:17 /user/ronnan/data/data_project_rox/Sales.SalesOrderDetail.csv
-rw-r--r--   3 root supergroup    8267704 2022-06-19 22:17 /user/ronnan/data/data_project_rox/Sales.SalesOrderHeader.csv
-rw-r--r--   3 root supergroup      36680 2022-06-19 22:17 /user/ronnan/data/data_project_rox/Sales.SpecialOfferProduct.csv


In [4]:
#criando data_frame para tratamento dos dados, realizando leitura em csv

data_person = spark.read.csv("/user/ronnan/data/data_project_rox/Person.Person.csv"
                            ,sep = ";",inferSchema=True, header=True,ignoreLeadingWhiteSpace=True)

In [5]:
# verificando se o esquema foi atribuido
print(data_person.printSchema())

root
 |-- BusinessEntityID: integer (nullable = true)
 |-- PersonType: string (nullable = true)
 |-- NameStyle: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- MiddleName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Suffix: string (nullable = true)
 |-- EmailPromotion: integer (nullable = true)
 |-- AdditionalContactInfo: string (nullable = true)
 |-- Demographics: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)

None


In [6]:
#verificando os dados da coluna BusinessEntityID
data_person.select('BusinessEntityID').describe().show()

+-------+------------------+
|summary|  BusinessEntityID|
+-------+------------------+
|  count|             19972|
|   mean|10763.079411175646|
| stddev|5814.1332719480615|
|    min|                 1|
|    max|             20777|
+-------+------------------+



In [7]:
#verificando os dados da coluna PersonType
data_person.select('PersonType').describe().show()

+-------+----------+
|summary|PersonType|
+-------+----------+
|  count|     19972|
|   mean|      null|
| stddev|      null|
|    min|        EM|
|    max|        VC|
+-------+----------+



In [8]:
#explorando os dados das colunas BusinessEntityID,PersonType,NameStyle
data_person.select('BusinessEntityID','PersonType',"NameStyle").show()

+----------------+----------+---------+
|BusinessEntityID|PersonType|NameStyle|
+----------------+----------+---------+
|               1|        EM|        0|
|               2|        EM|        0|
|               3|        EM|        0|
|               4|        EM|        0|
|               5|        EM|        0|
|               6|        EM|        0|
|               7|        EM|        0|
|               8|        EM|        0|
|               9|        EM|        0|
|              10|        EM|        0|
|              11|        EM|        0|
|              12|        EM|        0|
|              13|        EM|        0|
|              14|        EM|        0|
|              15|        EM|        0|
|              16|        EM|        0|
|              17|        EM|        0|
|              18|        EM|        0|
|              19|        EM|        0|
|              20|        EM|        0|
+----------------+----------+---------+
only showing top 20 rows



In [9]:
# explorando os dados de cada coluna para verificar se a necessidade de alguma normalização

data_person_PersonType = data_person.groupBy("PersonType").count()
data_person_PersonType.show()

data_person_NameStyle = data_person.groupBy("NameStyle").count()
data_person_NameStyle.show()

data_person_Title = data_person.groupBy("Title").count()
data_person_Title.show()

data_person_FirstName = data_person.groupBy("FirstName").count()
data_person_FirstName.show()

data_person_MiddleName = data_person.groupBy("MiddleName").count()
data_person_MiddleName.show()

data_person_LastName = data_person.groupBy("LastName").count()
data_person_LastName.show()

data_person_Suffix = data_person.groupBy("Suffix").count()
data_person_Suffix.show()

data_person_EmailPromotion = data_person.groupBy("EmailPromotion").count()
data_person_EmailPromotion.show()

data_person_AdditionalContactInfo= data_person.groupBy("AdditionalContactInfo").count()
data_person_AdditionalContactInfo.show()

data_person_Demographics= data_person.groupBy("Demographics").count()
data_person_Demographics.show()

data_person_rowguid= data_person.groupBy("rowguid").count()
data_person_rowguid.show()

data_person_ModifiedDate= data_person.groupBy("ModifiedDate").count()
data_person_ModifiedDate.show()



+----------+-----+
|PersonType|count|
+----------+-----+
|        SC|  753|
|        SP|   17|
|        IN|18484|
|        EM|  273|
|        GC|  289|
|        VC|  156|
+----------+-----+

+---------+-----+
|NameStyle|count|
+---------+-----+
|        0|19972|
+---------+-----+

+-----+-----+
|Title|count|
+-----+-----+
| Sra.|    3|
|  Ms.|  415|
|  Mr.|  577|
| Mrs.|    2|
|  Sr.|   11|
|   Ms|    1|
| NULL|18963|
+-----+-----+

+---------+-----+
|FirstName|count|
+---------+-----+
|   Britta|    2|
|    Tyler|   24|
|     Chad|   23|
| Samantha|   46|
|    Shawn|   25|
|  Shannon|   42|
|  Carolyn|   43|
|   Manish|    1|
|      Sue|    1|
|   Damien|   38|
|    Scott|   15|
|Robertson|    1|
|   Lorrin|    2|
|   Sheena|   22|
|    Ruben|   44|
|   Wilson|    1|
|   Kristy|   21|
|  Ioannis|    1|
|    Lucas|   93|
|    Grace|   69|
+---------+-----+
only showing top 20 rows

+----------+-----+
|MiddleName|count|
+----------+-----+
|         K|  361|
|    Hierro|    1|
|     Slon

In [10]:
# criando variavel para ajustar o campo para novo formato de data para date

data_person_data_formatada = data_person.withColumn("ModifiedDate",to_timestamp(col("ModifiedDate")))\
                                        .withColumn("ModifiedDate",to_date(col("ModifiedDate")))

In [11]:
# aplicando regex para alterar as colunas que encontramos variaveis despadronizadas

from pyspark.sql.functions import regexp_replace

In [12]:
# alterando ms. para ms
data_regex = data_person_data_formatada.withColumn('Title', regexp_replace('Title', 'Ms.', 'Ms'))

In [13]:
# alterando sra.para ms
data_regex = data_regex.withColumn('Title', regexp_replace('Title','Sra.','Ms'))

In [14]:
# alterando mrs. para ms
data_regex = data_regex.withColumn('Title', regexp_replace('Title','Mrs.','Ms'))

In [15]:
# alterando sr. para mr
data_regex = data_regex.withColumn('Title', regexp_replace('Title','Sr.','Mr'))

In [16]:
# alterando mr. para mr
data_regex = data_regex.withColumn('Title', regexp_replace('Title','Mr.','Mr'))

In [17]:
data_regex_gpby = data_regex.groupBy("Title").count()
data_regex_gpby.show()

+-----+-----+
|Title|count|
+-----+-----+
|   Mr|  588|
|   Ms|  421|
| NULL|18963|
+-----+-----+



In [18]:
# alterando padronizando a coluna middleName para a coluna estar apenas com a primeira letra do meio do nome
data_regex_upper = data_regex.withColumn('MiddleName', data_regex['MiddleName'].substr(0, 1))

In [19]:
# alterando para maiusculo
data_person_final = data_regex_upper.withColumn('MiddleName', upper(data_regex_upper.MiddleName))

In [20]:
# visualizando os dados salvos
data_person_final.select('MiddleName').show()

+----------+
|MiddleName|
+----------+
|         J|
|         L|
|         N|
|         N|
|         A|
|         H|
|         A|
|         L|
|         N|
|         N|
|         V|
|         B|
|         M|
|         I|
|         B|
|         M|
|         F|
|         L|
|         A|
|         M|
+----------+
only showing top 20 rows



In [21]:
# agrupando os dados para analisar as alterações
data_person_final_gb = data_person_final.groupBy("MiddleName").count()
data_person_final_gb.show()

+----------+-----+
|MiddleName|count|
+----------+-----+
|         K|  382|
|         F|  245|
|         Q|    9|
|         E|  771|
|         T|  284|
|         B|  339|
|         Y|   23|
|         L| 1319|
|         M| 1224|
|         V|  132|
|         U|    5|
|         O|   65|
|         D|  577|
|         C|  940|
|         J| 1133|
|         Z|   17|
|         A| 1360|
|         N| 8641|
|         X|    2|
|         W|  335|
+----------+-----+
only showing top 20 rows



In [22]:
data_person_final

DataFrame[BusinessEntityID: int, PersonType: string, NameStyle: int, Title: string, FirstName: string, MiddleName: string, LastName: string, Suffix: string, EmailPromotion: int, AdditionalContactInfo: string, Demographics: string, rowguid: string, ModifiedDate: date]

In [24]:
# salvando os dados tratados em um unico em formato .parquet e compressão snappy
data_person_final.coalesce(1).write.saveAsTable("data_person_parquet", format="parquet", compression="snappy")

In [25]:
# verificando o arquivo salvo em .parquet
!hdfs dfs -ls /user/hive/warehouse/data_person_parquet

Found 2 items
-rw-r--r--   2 root supergroup          0 2022-06-20 23:08 /user/hive/warehouse/data_person_parquet/_SUCCESS
-rw-r--r--   2 root supergroup    2167247 2022-06-20 23:08 /user/hive/warehouse/data_person_parquet/part-00000-6b8082df-2c98-4ab5-9225-5cf0f34596bc-c000.snappy.parquet


In [27]:
# salvando em um unico arquivo uma versão em .csv já tratada --> 
data_person_final.coalesce(1).write.csv("data_person_csv", header=True)

In [29]:
# verificando o caminho que o arquivo foi salvo
!hdfs dfs -ls /user/root/data_person_csv

Found 2 items
-rw-r--r--   2 root supergroup          0 2022-06-20 23:08 /user/root/data_person_csv/_SUCCESS
-rw-r--r--   2 root supergroup   13500095 2022-06-20 23:08 /user/root/data_person_csv/part-00000-4ec96501-d667-494f-9116-8e48d28434f4-c000.csv


In [None]:
#exportar o arquivo snappy para fora do hdfs(Executar no terminal hdfs dfs)

#hdfs dfs -get /user/hive/warehouse/data_person_final_parquet/part-00000-9a22957a-114f-486e-bfc4-0e411b6c04a5-c000.snappy.parquet  input/data_project_rox/

#sudo mv part-00000-9a22957a-114f-486e-bfc4-0e411b6c04a5-c000.snappy.parquet person.person.parquet

In [None]:
#exportar o arquivo .csv para fora do hdfs(Executar no terminal hdfs dfs)

#hdfs dfs -get /user/root/data_person_final/part-00000-fc79bc9a-f340-4b14-bca5-3c86998d58f5-c000.csv  input/data_project_rox/

#sudo mv part-00000-fc79bc9a-f340-4b14-bca5-3c86998d58f5-c000.csv person.person_tratado.csv

In [None]:
#!hdfs dfs -rm -R /user/root/data_person_final_parquet

#!hdfs dfs -rm -R /user/hive/warehouse/data_person_final

#!hdfs dfs -rm -R /user/hive/warehouse/data_person_final_snappy