### Import necessary packages

In [6]:
from pyspark import *
from pyspark.sql import *

### Initializing the Spark session

In [7]:
# Obtain the notebook's SparkSession (an already-running session is reused
# by getOrCreate). "local[3]" runs Spark locally with three worker threads.
spark = (
    SparkSession.builder
    .master("local[3]")
    .appName("Test app for learning")
    .getOrCreate()
)

### Reading the raw CSV file

In [8]:
# Load the raw companies CSV, taking column names from the first row.
# No schema is supplied and inference is not enabled, so Spark's default
# applies and every column is read as a string.
countries_companies_raw = spark.read.load(
    path="raw_data/companies.csv",
    format="csv",
    header="true",
)

### Cleaning the dataframe

In [9]:
# Source column name -> snake_case name used downstream. The dict's key
# order doubles as the projection order for the select below.
column_renames = {
    "Company": "company",
    "Profits ($billion)": "profits_in_billions",
    "Sales ($billion)": "sales_in_billions",
    "Country": "country",
    "Continent": "continent",
}

# Keep only the columns of interest, then rename them in a single pass.
# NOTE: DataFrame.withColumnsRenamed is available from Spark 3.4 onwards.
countries_companies_cleaned = (
    countries_companies_raw
    .select(*column_renames)
    .withColumnsRenamed(column_renames)
)

### Writing the dataframe with partitions

In [10]:
# Persist the cleaned data as parquet under cleaned_data/, replacing any
# previous output. Files are laid out in nested partition directories:
# continent=<value>/country=<value>/...
(
    countries_companies_cleaned.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "cleaned_data/")
    .partitionBy("continent", "country")
    .save()
)