<a href="https://colab.research.google.com/github/rganesh203/Pyspark/blob/main/Pyspark_Installation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 # Install a headless OpenJDK 8 package with apt; output is redirected to /dev/null to keep the cell quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download the Spark 3.0.0 / Hadoop 3.2 binary tarball from the Apache archive.
# If you change the version here, change it in the tar command and SPARK_HOME below as well.
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# Extract the downloaded tarball into the current working directory.
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# Set the JAVA_HOME and SPARK_HOME environment variables for this kernel process.
# NOTE(review): both paths are hard-coded; SPARK_HOME assumes the tarball was
# extracted under /content, and JAVA_HOME assumes apt's OpenJDK 8 install location.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# Install findspark with pip (quiet mode, version not pinned).
!pip install -q findspark


In [None]:
import findspark
# Locate the Spark installation and make its bundled pyspark importable in this kernel.
findspark.init()
# Last expression of the cell: the notebook displays the value returned by find()
# (the Spark home directory that findspark resolved).
findspark.find()

In [None]:
import pyspark

In [None]:
from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

# Create the notebook-wide SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Our First Spark example")
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the SparkSession object as the cell output.
spark

In [None]:
# Display the URL of the Spark web UI exposed by the running session.
spark.uiWebUrl

In [None]:
# Print the version string of the imported pyspark package.
print(pyspark.__version__)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def create_dataframe(input_string):
    """Build a DataFrame from comma-separated text, dropping rows whose age is "NULL".

    The first line of ``input_string`` supplies the column names; every
    following non-blank line becomes one row. All columns are strings.

    Args:
        input_string: Text whose lines are comma-separated records, header
            line first. The header must contain an ``age`` column.

    Returns:
        A DataFrame of string columns without the rows whose ``age`` value is
        exactly the string ``"NULL"``.

    Raises:
        ValueError: If ``input_string`` contains no lines at all.
    """
    # Local import so the function does not depend on imports made in other cells.
    from pyspark.sql.types import StringType, StructField, StructType

    lines = input_string.splitlines()
    if not lines:
        raise ValueError("input_string must contain at least a header line")

    # Separate header and data by position on the driver. Comparing each line
    # with the header text would also drop data rows that happen to equal the
    # header, and would need an extra Spark job just to fetch the first line.
    header_columns = lines[0].split(",")
    data_lines = [line for line in lines[1:] if line.strip()]

    # Initialize (or reuse) a Spark session
    spark = SparkSession.builder.appName("StringToDataFrame").getOrCreate()
    data_rdd = spark.sparkContext.parallelize(data_lines).map(
        lambda line: tuple(line.split(","))
    )

    # An explicit all-string schema avoids schema inference, which fails when
    # there are no data rows to sample.
    schema = StructType(
        [StructField(name, StringType(), True) for name in header_columns]
    )
    data_df = spark.createDataFrame(data_rdd, schema)

    # Filter out rows with "NULL" values in the "age" column
    return data_df.filter(~col("age").isin("NULL"))
# Sample input: a header line plus two records; the first record has the literal
# text NULL in its age field.
input_string = "id,name,age,score\n1,Jack,NULL,12\n17,Betty,28,11"
dataframe = create_dataframe(input_string)
# Print the resulting DataFrame as a text table.
dataframe.show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

def create_dataframe(input_string):
    """Build a DataFrame of string columns from comma-separated text.

    The first line of the stripped input supplies the column names and each
    following line becomes one row. No rows are filtered out.

    Column names are passed to Spark as an ordered schema rather than as
    keyword arguments of ``Row``, so repeated header names (for example
    ``header,header``) each keep their own column instead of collapsing
    into a single one.

    Args:
        input_string: Text whose lines are comma-separated records, header
            line first. Leading and trailing whitespace is stripped.

    Returns:
        A DataFrame with one string column per header field and one row per
        data line.

    Raises:
        ValueError: If ``input_string`` is empty or only whitespace.
    """
    # Local import so the function does not depend on imports made in other cells.
    from pyspark.sql.types import StringType, StructField, StructType

    text = input_string.strip()
    if not text:
        raise ValueError("input_string must contain at least a header line")

    # Split the input string into lines
    lines = text.split('\n')

    # Extract headers and data
    headers = lines[0].split(',')
    rows = [tuple(line.split(',')) for line in lines[1:]]

    # Initialize (or reuse) a Spark session
    spark = SparkSession.builder.appName("StringToDataFrame").getOrCreate()

    # An explicit all-string schema keeps the header order and also works when
    # there are no data lines (schema inference needs at least one row).
    schema = StructType([StructField(name, StringType(), True) for name in headers])
    return spark.createDataFrame(rows, schema)

# Sample input: both columns share the name "header"; the values either merely
# resemble NULL (ANNUL, ANNULLED, null, NILL) or are the literal text NULL.
input_string = "header,header\nANNUL,ANNULLED\nnull,NILL\nNULL,NULL"
dataframe = create_dataframe(input_string)
# Print the resulting DataFrame as a text table.
dataframe.show()



In [None]:
# List the files in /content, e.g. to check that the Spark archive and its extracted folder are present.
!ls /content

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
# Bind `spark` at module level; create_dataframe_from_string below reads this global.
# getOrCreate() returns a session that is already running if there is one,
# otherwise it creates a new session with the app name "DataTransformation".
spark = SparkSession.builder.appName("DataTransformation").getOrCreate()
def create_dataframe_from_string(input_string):
    """Build a DataFrame of string columns from comma-separated text.

    The first non-blank line supplies the column names; each following
    non-blank line becomes one row. Uses the module-level ``spark`` session.

    Args:
        input_string: Text whose lines are comma-separated records, header
            line first. Blank lines (such as the one produced by a trailing
            newline) are ignored.

    Returns:
        A DataFrame with one string column per header field. Values beyond
        the number of header fields are ignored, as before.

    Raises:
        ValueError: If there is no header line, or a data line has fewer
            values than the header has columns.
    """
    # Local import so the function does not depend on imports made in other cells.
    from pyspark.sql.types import StringType, StructField, StructType

    lines = [line for line in input_string.splitlines() if line.strip()]
    if not lines:
        raise ValueError("input_string must contain at least a header line")

    columns = lines[0].split(',')
    width = len(columns)

    data_rows = []
    for line in lines[1:]:
        values = line.split(',')
        if len(values) < width:
            # Fail with a descriptive message instead of a bare IndexError.
            raise ValueError(
                "expected %d values but found %d in line %r"
                % (width, len(values), line)
            )
        data_rows.append(tuple(values[:width]))

    # Passing names through an ordered schema (not Row keyword arguments) keeps
    # duplicate column names distinct; the explicit string types also allow a
    # header-only input, where there is no row to infer a schema from.
    schema = StructType([StructField(name, StringType(), True) for name in columns])
    return spark.createDataFrame(data_rows, schema)
# Sample input: a header line and a single record.
S = "country,population,area\nUK,67m,242000km2"
result_df = create_dataframe_from_string(S)
# Print the resulting DataFrame as a text table.
result_df.show()
# Stop the session bound to `spark`; the variable then refers to a stopped session
# until a later cell rebinds it.
spark.stop()

In [None]:
from pyspark.sql import SparkSession
from io import StringIO
import csv

def process_input(input_str, skip_header=True, ignore_case=False):
    """Parse CSV text into a DataFrame, dropping rows that contain a NULL field.

    The first non-blank line always supplies the column names. A row is
    dropped when any of its fields is the text ``NULL``.

    Args:
        input_str: CSV text, header line first. Blank lines are ignored.
        skip_header: When True (default) the header line is used only for the
            column names. When False it is additionally kept as a data row.
        ignore_case: When False (default) only the exact text ``NULL`` marks a
            defective field. When True, any casing (``null``, ``Null``, ...)
            does.

    Returns:
        A DataFrame of string columns containing only the rows in which no
        field is a NULL marker.

    Raises:
        ValueError: If ``input_str`` contains no non-blank line.
    """
    # Local import so the function does not depend on imports made in other cells.
    from pyspark.sql.types import StringType, StructField, StructType

    # Blank lines are dropped up front: csv.reader yields nothing for an empty
    # line, so next() on it would raise StopIteration inside the Spark task.
    lines = [line for line in input_str.splitlines() if line.strip()]
    if not lines:
        raise ValueError("input_str must contain at least a header line")

    # Column names come from the header line itself. Reading them from the RDD
    # after the header was removed would promote the first data row to header.
    column_names = next(csv.reader(StringIO(lines[0])))

    # Remove the header by position, not by value, so that a data line equal
    # to the header text is not silently dropped as well.
    data_lines = lines[1:] if skip_header else lines

    def has_no_null(fields):
        """Return True when no field of the row is a NULL marker."""
        if ignore_case:
            return all(field.upper() != "NULL" for field in fields)
        return all(field != "NULL" for field in fields)

    spark = SparkSession.builder.appName("DefectiveRowsRemoval").getOrCreate()

    # Split each line into columns with the csv module (honours quoted commas)
    split_rdd = spark.sparkContext.parallelize(data_lines).map(
        lambda line: tuple(next(csv.reader(StringIO(line))))
    )

    # Filter out the rows with any NULL values
    filtered_rdd = split_rdd.filter(has_no_null)

    # An explicit all-string schema avoids schema inference, which fails when
    # every row has been filtered out.
    schema = StructType(
        [StructField(name, StringType(), True) for name in column_names]
    )
    return spark.createDataFrame(filtered_rdd, schema)

# Sample CSV text: a duplicated header name, two rows whose values only resemble
# NULL, and a final row of literal NULL values.
input_str = "header,header\nANNUL,ANNULLED\nnull,NILL\nNULL,NULL"
result_df = process_input(input_str, skip_header=True)
# Print the resulting DataFrame as a text table.
result_df.show()
