<a href="https://colab.research.google.com/github/saurater/ciencia_de_dados_pyspark/blob/main/PySpark_Intro_Parte_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark - Tutorial - Part 1 - MLlib - Linear Regression - Intro
Notebook by Sam Faraday, June 2022

Sources:
Free Code Camp: PySpark Tutorial at https://www.youtube.com/watch?v=_C8kWso4ne4

Apache Spark API Reference at https://spark.apache.org/docs/latest/api/python/reference/index.html

# 1. Installing PySpark

In [None]:
pip install pyspark

# 2. Initializing PySpark

In [None]:
from pyspark.sql import SparkSession


In [None]:
# Start a Spark session for this notebook, or reuse the one that is already
# running; the application name identifies this job.
spark = (
    SparkSession.builder
    .appName("MLIB_Regressao")
    .getOrCreate()
)

In [None]:
# Display the active SparkSession (the last expression of a cell is rendered as its output).
spark

# 3. Reading the Dataset

In [None]:
# Load the coffee-sales CSV: the first row supplies the column names and
# Spark infers each column's type from the data instead of reading everything
# as strings.
df_training = spark.read.csv(
    "vendas_de_cafe.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the loaded data; show() prints only the first 20 rows by default.
df_training.show()

+--------+-----------+----------+--------+-----------+
|  Regiao|Vendas Cafe|Preco Cafe|Promoção|Preco Leite|
+--------+-----------+----------+--------+-----------+
|   Norte|         18|      4.77|     Nao|       4.74|
|   Norte|         20|      4.67|     Nao|       4.81|
|   Norte|         23|      4.75|     Nao|       4.36|
|   Norte|         23|      4.74|     Nao|       4.29|
|   Norte|         23|      4.63|     Nao|       4.17|
|   Norte|         23|      4.56|     Nao|       4.66|
|   Norte|         24|      4.59|     Nao|       4.73|
|   Norte|         25|      4.75|     Nao|       4.11|
|     Sul|         26|      4.75|     Sim|       4.21|
|     Sul|         26|      4.49|     Nao|       4.25|
|     Sul|         26|      4.41|     Sim|       4.62|
|     Sul|         26|      4.32|     Nao|       4.53|
|     Sul|         27|      4.68|     Nao|       4.44|
|     Sul|         28|      4.66|     Sim|       4.19|
|     Sul|         28|      4.42|     Sim|       4.37|
|Nordeste|

# 4. Checking the Schema

In [None]:
# Print each column's name, inferred type and nullability as a tree.
df_training.printSchema()

root
 |-- Regiao: string (nullable = true)
 |-- Vendas Cafe: integer (nullable = true)
 |-- Preco Cafe: double (nullable = true)
 |-- Promoção: string (nullable = true)
 |-- Preco Leite: double (nullable = true)



# 5. Análise Inicial dos Dados

In [None]:
# Descriptive statistics per column: count, mean, stddev, min, quartiles, max.
# String columns only get count/min/max; the remaining rows show null.
(
    df_training
    .summary()
    .show()
)

+-------+--------+-----------------+-------------------+--------+------------------+
|summary|  Regiao|      Vendas Cafe|         Preco Cafe|Promoção|       Preco Leite|
+-------+--------+-----------------+-------------------+--------+------------------+
|  count|      30|               30|                 30|      30|                30|
|   mean|    null|             30.0|  4.426333333333332|    null| 4.373666666666667|
| stddev|    null|7.310832774866962|0.32205678802447824|    null|0.2558081626765535|
|    min|Nordeste|               18|               3.73|     Nao|               4.0|
|    25%|    null|               25|               4.35|    null|              4.17|
|    50%|    null|               28|               4.47|    null|              4.31|
|    75%|    null|               34|               4.67|    null|              4.62|
|    max|     Sul|               46|               4.77|     Sim|              4.81|
+-------+--------+-----------------+-------------------+--------+

# 6. Checando o tipo do dataframe

In [None]:
# Confirm the object is a PySpark DataFrame.
type(df_training)

pyspark.sql.dataframe.DataFrame

# 7. Verificando as Colunas

In [None]:
# List the column names as a plain Python list of strings.
df_training.columns

['Regiao', 'Vendas Cafe', 'Preco Cafe', 'Promoção', 'Preco Leite']

# 8. Alterando os nomes de Colunas / Features

In [None]:
# Preview a rename: withColumnRenamed returns a NEW DataFrame, and because the
# result is not assigned here, df_training itself keeps its original column names.
(
    df_training
    .withColumnRenamed("Vendas Cafe", "Vendas_Cafe")
    .show()
)

+--------+-----------+----------+--------+-----------+
|  Regiao|Vendas_Cafe|Preco Cafe|Promoção|Preco Leite|
+--------+-----------+----------+--------+-----------+
|   Norte|         18|      4.77|     Nao|       4.74|
|   Norte|         20|      4.67|     Nao|       4.81|
|   Norte|         23|      4.75|     Nao|       4.36|
|   Norte|         23|      4.74|     Nao|       4.29|
|   Norte|         23|      4.63|     Nao|       4.17|
|   Norte|         23|      4.56|     Nao|       4.66|
|   Norte|         24|      4.59|     Nao|       4.73|
|   Norte|         25|      4.75|     Nao|       4.11|
|     Sul|         26|      4.75|     Sim|       4.21|
|     Sul|         26|      4.49|     Nao|       4.25|
|     Sul|         26|      4.41|     Sim|       4.62|
|     Sul|         26|      4.32|     Nao|       4.53|
|     Sul|         27|      4.68|     Nao|       4.44|
|     Sul|         28|      4.66|     Sim|       4.19|
|     Sul|         28|      4.42|     Sim|       4.37|
|Nordeste|

In [None]:

# Persist the renames by assigning the result back to df_training.
# Every column whose name contains a space or an accented character is renamed
# here, so that the later cells (which reference Vendas_Cafe, Preco_Cafe and
# Promocao) work after "Restart & Run All".
# withColumnRenamed is a no-op when the source column does not exist, so this
# cell can be re-run safely.
df_training = (
    df_training
    .withColumnRenamed("Vendas Cafe", "Vendas_Cafe")
    .withColumnRenamed("Preco Cafe", "Preco_Cafe")
    .withColumnRenamed("Promoção", "Promocao")
    .withColumnRenamed("Preco Leite", "Preco_Leite")
)

In [None]:
# Preview the DataFrame to check the column names after the renaming step.
df_training.show()

+--------+-----------+----------+--------+-----------+
|  Regiao|Vendas_Cafe|Preco_Cafe|Promocao|Preco_Leite|
+--------+-----------+----------+--------+-----------+
|   Norte|         18|      4.77|     Nao|       4.74|
|   Norte|         20|      4.67|     Nao|       4.81|
|   Norte|         23|      4.75|     Nao|       4.36|
|   Norte|         23|      4.74|     Nao|       4.29|
|   Norte|         23|      4.63|     Nao|       4.17|
|   Norte|         23|      4.56|     Nao|       4.66|
|   Norte|         24|      4.59|     Nao|       4.73|
|   Norte|         25|      4.75|     Nao|       4.11|
|     Sul|         26|      4.75|     Sim|       4.21|
|     Sul|         26|      4.49|     Nao|       4.25|
|     Sul|         26|      4.41|     Sim|       4.62|
|     Sul|         26|      4.32|     Nao|       4.53|
|     Sul|         27|      4.68|     Nao|       4.44|
|     Sul|         28|      4.66|     Sim|       4.19|
|     Sul|         28|      4.42|     Sim|       4.37|
|Nordeste|

# 9. Adicionando Colunas

In [None]:
# Add a derived column holding the milk price increased by 10%.
# The multiplication uses floating point, so some values show rounding noise
# (e.g. 4.796000000000001).
df_training = df_training.withColumn(
    "Preco_Leite_ 10_porcent",
    df_training["Preco_Leite"] * 1.1,
)

In [None]:
# Preview the DataFrame, now including the newly added column.
df_training.show()

+--------+-----------+----------+--------+-----------+-----------------------+
|  Regiao|Vendas_Cafe|Preco_Cafe|Promocao|Preco_Leite|Preco_Leite_ 10_porcent|
+--------+-----------+----------+--------+-----------+-----------------------+
|   Norte|         18|      4.77|     Nao|       4.74|                  5.214|
|   Norte|         20|      4.67|     Nao|       4.81|                  5.291|
|   Norte|         23|      4.75|     Nao|       4.36|      4.796000000000001|
|   Norte|         23|      4.74|     Nao|       4.29|                  4.719|
|   Norte|         23|      4.63|     Nao|       4.17|      4.587000000000001|
|   Norte|         23|      4.56|     Nao|       4.66|                  5.126|
|   Norte|         24|      4.59|     Nao|       4.73|      5.203000000000001|
|   Norte|         25|      4.75|     Nao|       4.11|      4.521000000000001|
|     Sul|         26|      4.75|     Sim|       4.21|                  4.631|
|     Sul|         26|      4.49|     Nao|       4.2

# 10. Removendo uma Coluna

In [None]:
# Remove the derived column again; drop() returns a new DataFrame, so the
# result is assigned back to df_training.
df_training = df_training.drop("Preco_Leite_ 10_porcent")

In [None]:
# Preview the DataFrame to confirm the column was removed.
df_training.show()

+--------+-----------+----------+--------+-----------+
|  Regiao|Vendas_Cafe|Preco_Cafe|Promocao|Preco_Leite|
+--------+-----------+----------+--------+-----------+
|   Norte|         18|      4.77|     Nao|       4.74|
|   Norte|         20|      4.67|     Nao|       4.81|
|   Norte|         23|      4.75|     Nao|       4.36|
|   Norte|         23|      4.74|     Nao|       4.29|
|   Norte|         23|      4.63|     Nao|       4.17|
|   Norte|         23|      4.56|     Nao|       4.66|
|   Norte|         24|      4.59|     Nao|       4.73|
|   Norte|         25|      4.75|     Nao|       4.11|
|     Sul|         26|      4.75|     Sim|       4.21|
|     Sul|         26|      4.49|     Nao|       4.25|
|     Sul|         26|      4.41|     Sim|       4.62|
|     Sul|         26|      4.32|     Nao|       4.53|
|     Sul|         27|      4.68|     Nao|       4.44|
|     Sul|         28|      4.66|     Sim|       4.19|
|     Sul|         28|      4.42|     Sim|       4.37|
|Nordeste|

# 11. Selecionando Colunas

In [None]:
# List the current column names before selecting a subset of them.
df_training.columns

['Regiao', 'Vendas_Cafe', 'Preco_Cafe', 'Promocao', 'Preco_Leite']

In [None]:
# Project only the coffee sales and coffee price columns and preview them.
(
    df_training
    .select("Vendas_Cafe", "Preco_Cafe")
    .show()
)

+-----------+----------+
|Vendas_Cafe|Preco_Cafe|
+-----------+----------+
|         18|      4.77|
|         20|      4.67|
|         23|      4.75|
|         23|      4.74|
|         23|      4.63|
|         23|      4.56|
|         24|      4.59|
|         25|      4.75|
|         26|      4.75|
|         26|      4.49|
|         26|      4.41|
|         26|      4.32|
|         27|      4.68|
|         28|      4.66|
|         28|      4.42|
|         29|      4.71|
|         29|      4.66|
|         30|      4.46|
|         30|      4.36|
|         31|      4.47|
+-----------+----------+
only showing top 20 rows

