#### This notebook covers the following topics
- Dataframe
- Reading Dataset
- Schema 
- Description 
- Adding columns
- Dropping columns 
- Rename columns 

In [43]:
from pyspark.sql import SparkSession

In [44]:
# Build a SparkSession for the app named "Dataframe", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [45]:
# Displaying the session here shows the previous SparkSession, because it was never stopped.
# To start a genuinely new session, the old one has to be stopped first.
spark

In [46]:
# Stop the current session so that a new one can be created afterwards.
spark.stop()

In [47]:
# Create the session again under the same application name.
spark = SparkSession.builder.appName("Dataframe").getOrCreate()

In [48]:
# Display the session object to inspect its details.
spark

## How to read a dataframe 

In [50]:
# Let's read the dataset; header=True tells Spark to use the first CSV row as column names.
df_pyspark = spark.read.csv(
    "/Volumes/Jagannath/Fall 2024/Cloud Computing/PySpark/gene_data.csv",
    header=True,
)

In [51]:
# Print the DataFrame's rows as a text table.
df_pyspark.show()

+---------+----------+----------------+------------------+--------------------+
|Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|
+---------+----------+----------------+------------------+--------------------+
|    BRCA1|       120|            81.2|                35|          DNA Repair|
|     TP53|        95|            20.1|                40|   Tumor Suppression|
|     APOE|        80|             3.7|                25|    Lipid Metabolism|
|     CFTR|        65|           189.9|                20|       Ion Transport|
|     KRAS|        45|              42|                18| Signal Transduction|
|     EGFR|        60|           189.5|                30|Cell Growth and P...|
+---------+----------+----------------+------------------+--------------------+



In [52]:
df_pyspark.printSchema()
# Observation: every column is reported as string, even "Gene Count" and "Gene Length (kb)", which hold numbers.
# Why? The CSV was read without schema inference, so Spark treats every value as text.

root
 |-- Gene Name: string (nullable = true)
 |-- Gene Count: string (nullable = true)
 |-- Gene Length (kb): string (nullable = true)
 |-- Associated Studies: string (nullable = true)
 |-- Functional Category: string (nullable = true)



In [53]:
# inferSchema is a reader option (not a function): when it is True, Spark inspects the data
# and assigns numeric column types instead of reading every column as string.
df_pyspark = spark.read.csv(
    "/Volumes/Jagannath/Fall 2024/Cloud Computing/PySpark/gene_data.csv",
    header=True,
    inferSchema=True,
)

In [56]:
# Check the schema again after re-reading the file with inferSchema=True:
# the numeric columns should now be typed as integer/double instead of string.

df_pyspark.show()
df_pyspark.printSchema()

+---------+----------+----------------+------------------+--------------------+
|Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|
+---------+----------+----------------+------------------+--------------------+
|    BRCA1|       120|            81.2|                35|          DNA Repair|
|     TP53|        95|            20.1|                40|   Tumor Suppression|
|     APOE|        80|             3.7|                25|    Lipid Metabolism|
|     CFTR|        65|           189.9|                20|       Ion Transport|
|     KRAS|        45|            42.0|                18| Signal Transduction|
|     EGFR|        60|           189.5|                30|Cell Growth and P...|
+---------+----------+----------------+------------------+--------------------+

root
 |-- Gene Name: string (nullable = true)
 |-- Gene Count: integer (nullable = true)
 |-- Gene Length (kb): double (nullable = true)
 |-- Associated Studies: integer (nullable = true)
 |-- Functional Category: string (nullable = true)

In [57]:
# In pandas, head() returns a DataFrame; in Spark, head(n) returns a Python list of Row objects.
df_pyspark.head(5)

[Row(Gene Name='BRCA1', Gene Count=120, Gene Length (kb)=81.2, Associated Studies=35, Functional Category='DNA Repair'),
 Row(Gene Name='TP53', Gene Count=95, Gene Length (kb)=20.1, Associated Studies=40, Functional Category='Tumor Suppression'),
 Row(Gene Name='APOE', Gene Count=80, Gene Length (kb)=3.7, Associated Studies=25, Functional Category='Lipid Metabolism'),
 Row(Gene Name='CFTR', Gene Count=65, Gene Length (kb)=189.9, Associated Studies=20, Functional Category='Ion Transport'),
 Row(Gene Name='KRAS', Gene Count=45, Gene Length (kb)=42.0, Associated Studies=18, Functional Category='Signal Transduction')]

In [60]:
# select() returns a new DataFrame; without .show() only its repr (column name and type) is displayed, not the rows.
df_pyspark.select("Gene Name")

DataFrame[Gene Name: string]

In [61]:
# Display the entire "Gene Name" column.
df_pyspark.select("Gene Name").show()

+---------+
|Gene Name|
+---------+
|    BRCA1|
|     TP53|
|     APOE|
|     CFTR|
|     KRAS|
|     EGFR|
+---------+



In [62]:
# Select several columns at once by passing each column name as its own argument.
df_pyspark.select('Gene Name', 'Gene Count').show()

+---------+----------+
|Gene Name|Gene Count|
+---------+----------+
|    BRCA1|       120|
|     TP53|        95|
|     APOE|        80|
|     CFTR|        65|
|     KRAS|        45|
|     EGFR|        60|
+---------+----------+



In [63]:
# Check data types: dtypes is a list of (column name, type name) tuples.
df_pyspark.dtypes

[('Gene Name', 'string'),
 ('Gene Count', 'int'),
 ('Gene Length (kb)', 'double'),
 ('Associated Studies', 'int'),
 ('Functional Category', 'string')]

In [66]:
df_pyspark.describe().show()
# describe() reports count, mean, stddev, min and max for every column, both numeric and string.
# mean and stddev are only defined for numeric data, so they are NULL for the string columns
# "Gene Name" and "Functional Category".
# count, min and max are still computed for those string columns; min/max follow string ordering
# (e.g. "APOE" is the minimum and "TP53" the maximum Gene Name).

24/09/30 12:45:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+---------+------------------+-----------------+------------------+--------------------+
|summary|Gene Name|        Gene Count| Gene Length (kb)|Associated Studies| Functional Category|
+-------+---------+------------------+-----------------+------------------+--------------------+
|  count|        6|                 6|                6|                 6|                   6|
|   mean|     NULL|              77.5|87.73333333333335|              28.0|                NULL|
| stddev|     NULL|26.972207918522354|83.14393944640022| 8.602325267042628|                NULL|
|    min|     APOE|                45|              3.7|                18|Cell Growth and P...|
|    max|     TP53|               120|            189.9|                40|   Tumor Suppression|
+-------+---------+------------------+-----------------+------------------+--------------------+



## Add and drop columns

In [68]:
# Add a column
# withColumn(name, expression) returns a NEW DataFrame that contains the extra column (see the repr below).
# The result is not assigned here, so df_pyspark itself is unchanged.

df_pyspark.withColumn("Gene Count multiply by 2", df_pyspark['Gene Count']*2)

DataFrame[Gene Name: string, Gene Count: int, Gene Length (kb): double, Associated Studies: int, Functional Category: string, Gene Count multiply by 2: int]

In [69]:
# Assign the result back so the new column actually persists in df_pyspark:
# withColumn() returns a new DataFrame and never modifies the original in place.
# Without this assignment, the later drop("Gene Count multiply by 2") would have nothing to remove.
df_pyspark = df_pyspark.withColumn("Gene Count multiply by 2", df_pyspark['Gene Count']*2)
df_pyspark.show()

+---------+----------+----------------+------------------+--------------------+------------------------+
|Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|Gene Count multiply by 2|
+---------+----------+----------------+------------------+--------------------+------------------------+
|    BRCA1|       120|            81.2|                35|          DNA Repair|                     240|
|     TP53|        95|            20.1|                40|   Tumor Suppression|                     190|
|     APOE|        80|             3.7|                25|    Lipid Metabolism|                     160|
|     CFTR|        65|           189.9|                20|       Ion Transport|                     130|
|     KRAS|        45|            42.0|                18| Signal Transduction|                      90|
|     EGFR|        60|           189.5|                30|Cell Growth and P...|                     120|
+---------+----------+----------------+------------------+--------------------+------------------------+

In [70]:
# Drop a column
# drop() returns a new DataFrame without the named column, so the result is reassigned to df_pyspark.
# Column names that are not present in the DataFrame are ignored rather than raising an error.

df_pyspark = df_pyspark.drop("Gene Count multiply by 2")

In [71]:
# Confirm that the DataFrame has only the original five columns.
df_pyspark.show()

+---------+----------+----------------+------------------+--------------------+
|Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|
+---------+----------+----------------+------------------+--------------------+
|    BRCA1|       120|            81.2|                35|          DNA Repair|
|     TP53|        95|            20.1|                40|   Tumor Suppression|
|     APOE|        80|             3.7|                25|    Lipid Metabolism|
|     CFTR|        65|           189.9|                20|       Ion Transport|
|     KRAS|        45|            42.0|                18| Signal Transduction|
|     EGFR|        60|           189.5|                30|Cell Growth and P...|
+---------+----------+----------------+------------------+--------------------+



In [75]:
# Rename a column
# withColumnRenamed(existing, new) returns a new DataFrame with the column renamed.
# Here the result is only displayed, not assigned, so df_pyspark keeps the old name.

df_pyspark.withColumnRenamed("Gene Name", "New Gene Name").show()

+-------------+----------+----------------+------------------+--------------------+
|New Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|
+-------------+----------+----------------+------------------+--------------------+
|        BRCA1|       120|            81.2|                35|          DNA Repair|
|         TP53|        95|            20.1|                40|   Tumor Suppression|
|         APOE|        80|             3.7|                25|    Lipid Metabolism|
|         CFTR|        65|           189.9|                20|       Ion Transport|
|         KRAS|        45|            42.0|                18| Signal Transduction|
|         EGFR|        60|           189.5|                30|Cell Growth and P...|
+-------------+----------+----------------+------------------+--------------------+



In [76]:
# The column is still called "Gene Name": the renamed DataFrame above was never assigned back to df_pyspark.
df_pyspark.show()

+---------+----------+----------------+------------------+--------------------+
|Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|
+---------+----------+----------------+------------------+--------------------+
|    BRCA1|       120|            81.2|                35|          DNA Repair|
|     TP53|        95|            20.1|                40|   Tumor Suppression|
|     APOE|        80|             3.7|                25|    Lipid Metabolism|
|     CFTR|        65|           189.9|                20|       Ion Transport|
|     KRAS|        45|            42.0|                18| Signal Transduction|
|     EGFR|        60|           189.5|                30|Cell Growth and P...|
+---------+----------+----------------+------------------+--------------------+



In [77]:
# Reassign the result so the rename persists in df_pyspark.
df_pyspark = df_pyspark.withColumnRenamed("Gene Name", "New Gene Name")

In [78]:
# The DataFrame now shows the renamed column "New Gene Name".
df_pyspark.show()

+-------------+----------+----------------+------------------+--------------------+
|New Gene Name|Gene Count|Gene Length (kb)|Associated Studies| Functional Category|
+-------------+----------+----------------+------------------+--------------------+
|        BRCA1|       120|            81.2|                35|          DNA Repair|
|         TP53|        95|            20.1|                40|   Tumor Suppression|
|         APOE|        80|             3.7|                25|    Lipid Metabolism|
|         CFTR|        65|           189.9|                20|       Ion Transport|
|         KRAS|        45|            42.0|                18| Signal Transduction|
|         EGFR|        60|           189.5|                30|Cell Growth and P...|
+-------------+----------+----------------+------------------+--------------------+

