In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col

# Initialize Spark Session
# NOTE(review): `spark` is not created in this cell; presumably the notebook
# runtime provides it. Uncomment the next line to build one explicitly.
#spark = SparkSession.builder.appName("mixedDelimiterExample").getOrCreate()

# Sample data: a single record whose fields are separated by ',', a tab and '|'
data = ["1,Alice\t30|New York"]

# A plain "string" schema produces one column, which Spark names `value`
df = spark.createDataFrame(data, "string")

# Split on any one of the three delimiters. The pattern must be a raw string:
# '\|' is not a valid Python escape sequence, and in a raw string the regex
# engine (not Python) interprets `\t` as a tab.
split_col = split(df['value'], r'[,\t|]')

# Expose each split part as its own named column
df = (
    df.withColumn('id', split_col.getItem(0))
      .withColumn('name', split_col.getItem(1))
      .withColumn('age', split_col.getItem(2))
      .withColumn('city', split_col.getItem(3))
)

# Selecting and showing the result
df.select('id', 'name', 'age', 'city').display()


id,name,age,city
1,Alice,30,New York


In [0]:
# Find the missing values in the list

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = (
    SparkSession.builder
    .appName("Find Missing Numbers")
    .getOrCreate()
)

# Observed values, one single-field tuple per row
data = [(n,) for n in (1, 2, 4, 5, 7, 8, 10)]
df_numbers = spark.createDataFrame(data, schema=["Number"])

# Reference sequence 1..10 (the upper bound of range() is exclusive)
full_range = spark.range(1, 11).toDF("Number")

# A left-anti join keeps the reference rows that have no match in df_numbers
missing_numbers = full_range.join(df_numbers, on="Number", how="left_anti")
missing_numbers.display()

Number
3
6
9


In [0]:
# Determine the first purchase date for each user.


from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import min

# Initialize Spark session
spark = SparkSession.builder.appName("FirstPurchaseDate").getOrCreate()

# Sample data: user 1 has two purchases, users 2 and 3 have one each
purchase_data = [
    Row(UserID=1, PurchaseDate='2023-01-05'),
    Row(UserID=1, PurchaseDate='2023-01-10'),
    Row(UserID=2, PurchaseDate='2023-01-03'),
    Row(UserID=3, PurchaseDate='2023-01-12')
]

# Create DataFrame
df_purchases = spark.createDataFrame(purchase_data)

# Convert PurchaseDate from string to date type.
# The column is referenced through the DataFrame itself because this cell does
# not import `col`; relying on it would only work if an earlier cell had
# already imported it into the kernel.
df_purchases = df_purchases.withColumn(
    "PurchaseDate", df_purchases["PurchaseDate"].cast("date")
)

# Find first purchase date for each user.
# NOTE: `min` is pyspark.sql.functions.min imported by this cell, which shadows
# the Python builtin for the rest of the kernel session.
first_purchase = df_purchases.groupBy("UserID").agg(
    min("PurchaseDate").alias("FirstPurchaseDate")
)

# Show results
first_purchase.display()

UserID,FirstPurchaseDate
1,2023-01-05
2,2023-01-03
3,2023-01-12


In [0]:
#Problem: Given a dataset of sales records, identify and replace all missing values in the 'amount' column with the average sales amount.

from pyspark.sql import SparkSession
from pyspark.sql.functions import mean, col

# Initialize Spark Session
#spark = SparkSession.builder.appName("HandleMissingValues").getOrCreate()

# Sample data for sales - id, amount (with missing values represented by None)
sales_data = [("1", 100), ("2", 150), ("3", None), ("4", 200), ("5", None)]

# Creating DataFrame
sales_df = spark.createDataFrame(sales_data, ["sale_id", "amount"])

# Calculate the average sales amount.
# mean() already ignores nulls, so no rows are dropped beforehand; dropping
# rows would also discard valid amounts whose *other* columns are null.
avg_amount = sales_df.agg(mean(col("amount"))).first()[0]

# Replace missing values with the average amount.
# `subset` restricts the fill to `amount` so no other numeric column is touched.
# If every amount is null there is no average to fill with; keep the data as is.
if avg_amount is None:
    sales_df_filled = sales_df
else:
    sales_df_filled = sales_df.na.fill(avg_amount, subset=["amount"])

# Show the result
sales_df_filled.display()

sale_id,amount
1,100
2,150
3,150
4,200
5,150


In [0]:
# In a DataFrame df_sales with columns Date, ProductID, and QuantitySold, how would you calculate a 7-day rolling average of QuantitySold for each product?

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Initialize Spark session
spark = SparkSession.builder.appName("RollingAverageCalculation").getOrCreate()

# Sample data: eight consecutive days for a single product
data = [Row(Date='2023-01-01', ProductID=100, QuantitySold=10),
        Row(Date='2023-01-02', ProductID=100, QuantitySold=15),
        Row(Date='2023-01-03', ProductID=100, QuantitySold=20),
        Row(Date='2023-01-04', ProductID=100, QuantitySold=25),
        Row(Date='2023-01-05', ProductID=100, QuantitySold=30),
        Row(Date='2023-01-06', ProductID=100, QuantitySold=35),
        Row(Date='2023-01-07', ProductID=100, QuantitySold=40),
        Row(Date='2023-01-08', ProductID=100, QuantitySold=45)]

# Create DataFrame
df_sales = spark.createDataFrame(data)

# Convert Date string to Date type
df_sales = df_sales.withColumn("Date", F.to_date(F.col("Date")))

# Window specification for 7-day rolling average.
# The frame is defined on an integer day number with rangeBetween, so it covers
# the current date and the six calendar days before it. A row-based frame
# (rowsBetween) would instead take the previous six *rows*, which spans more
# than seven days whenever a date is missing for a product.
day_number = F.datediff(F.col('Date'), F.lit('1970-01-01').cast('date'))
windowSpec = (
    Window.partitionBy('ProductID')
    .orderBy(day_number)
    .rangeBetween(-6, 0)
)

# Calculating the rolling average
rollingAvg = df_sales.withColumn('7DayAvg', F.avg('QuantitySold').over(windowSpec))

# Show results
rollingAvg.display()

Date,ProductID,QuantitySold,7DayAvg
2023-01-01,100,10,10.0
2023-01-02,100,15,12.5
2023-01-03,100,20,15.0
2023-01-04,100,25,17.5
2023-01-05,100,30,20.0
2023-01-06,100,35,22.5
2023-01-07,100,40,25.0
2023-01-08,100,45,30.0


In [0]:
#Problem: Given a dataset of sales records with monthly sales per product, reshape the data to have one row per product-month combination.

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Initialize Spark Session
#spark = SparkSession.builder.appName("DataReshaping").getOrCreate()

# Wide layout: one row per product, one sales column per month
columns = ["Product", "Sales_Jan", "Sales_Feb", "Sales_Mar"]
data = [
    ("Product1", 100, 150, 200),
    ("Product2", 200, 250, 300),
    ("Product3", 300, 350, 400),
]

# Creating DataFrame
df = spark.createDataFrame(data, schema=columns)

# Unpivot to the long layout (Product, Month, Sales):
# stack(3, ...) emits three (Month, Sales) rows for every input row.
unpivot_expr = "stack(3, 'Jan', Sales_Jan, 'Feb', Sales_Feb, 'Mar', Sales_Mar) as (Month, Sales)"
pivoted_df = df.selectExpr("Product", unpivot_expr)

# Show the result
pivoted_df.display()


Product,Month,Sales
Product1,Jan,100
Product1,Feb,150
Product1,Mar,200
Product2,Jan,200
Product2,Feb,250
Product2,Mar,300
Product3,Jan,300
Product3,Feb,350
Product3,Mar,400


In [0]:
# Find the count of unique visitors to a website per day.

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import countDistinct

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("UniqueVisitorsPerDay")
    .getOrCreate()
)

# Sample visits as (date, visitor) pairs; visitor 101 appears twice on 2023-01-01
visitor_data = [
    Row(Date=visit_date, VisitorID=visitor_id)
    for visit_date, visitor_id in [
        ('2023-01-01', 101),
        ('2023-01-01', 102),
        ('2023-01-01', 101),
        ('2023-01-02', 103),
        ('2023-01-02', 101),
    ]
]

# Create DataFrame
df_visitors = spark.createDataFrame(visitor_data)

# Count unique visitors per day: repeated visits by one visitor count once
unique_visitors = (
    df_visitors
    .groupBy('Date')
    .agg(countDistinct('VisitorID').alias('UniqueVisitors'))
)

# Show results
unique_visitors.display()

Date,UniqueVisitors
2023-01-01,2
2023-01-02,2


In [0]:
# How can you use UDFs (User Defined Functions) in PySpark to apply a complex transformation, say, categorizing ages into groups ('Youth', 'Adult', 'Senior')?

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("AgeCategorization")
    .getOrCreate()
)

# Sample data as (user id, age) pairs
data = [
    Row(UserID=user_id, Age=age)
    for user_id, age in [
        (4001, 17),
        (4002, 45),
        (4003, 65),
        (4004, 30),
        (4005, 80),
    ]
]

# Create DataFrame
df = spark.createDataFrame(data)

# Define UDF to categorize age
def categorize_age(age):
    """Map an age to its group label.

    Args:
        age: Age in years, or None when the value is missing.

    Returns:
        'Youth' for ages below 18, 'Adult' for ages from 18 up to 59,
        'Senior' for 60 and above, or None when ``age`` is None.
    """
    # A null in the source column reaches the UDF as None; comparing None with
    # an int raises TypeError, so pass the missing value through unchanged.
    if age is None:
        return None
    if age < 18:
        return 'Youth'
    if age < 60:
        return 'Adult'
    return 'Senior'

# Wrap the plain Python function as a Spark UDF that returns a string column
age_udf = udf(categorize_age, returnType=StringType())

# Derive the AgeGroup column by applying the UDF to Age
age_group = age_udf(df['Age'])
df = df.withColumn('AgeGroup', age_group)

# Show results
df.display()

UserID,Age,AgeGroup
4001,17,Youth
4002,45,Adult
4003,65,Senior
4004,30,Adult
4005,80,Senior


In [0]:
# Find the top 3 movies based on their average rating

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Initialize Spark Session
#spark = SparkSession.builder.appName("TopMovies").getOrCreate()

# Sample DataFrames
data_movies = [(1, "Movie A"), (2, "Movie B"), (3, "Movie C"), (4, "Movie D"), (5, "Movie E")]

data_ratings = [(1, 101, 4.5), (1, 102, 4.0), (2, 103, 5.0), 
                (2, 104, 3.5), (3, 105, 4.0), (3, 106, 4.0), 
                (4, 107, 3.0), (5, 108, 2.5), (5, 109, 3.0)]

columns_movies = ["MovieID", "MovieName"]
columns_ratings = ["MovieID", "UserID", "Rating"]

# Creating DataFrames
df_movies = spark.createDataFrame(data_movies, columns_movies)
df_ratings = spark.createDataFrame(data_ratings, columns_ratings)

# Calculating average ratings
avg_ratings = df_ratings.groupBy('MovieID').agg(avg('Rating').alias('AvgRating'))

# Joining with df_movies to get movie names, then keeping the three best.
# MovieID is a secondary sort key: movies 1 and 2 tie at 4.25, and without a
# tie-breaker the order (and, for a tie at the cut-off, the selection itself)
# could change from run to run.
top_movies = (
    avg_ratings.join(df_movies, 'MovieID')
    .orderBy(['AvgRating', 'MovieID'], ascending=[False, True])
    .limit(3)
)

# Showing the top 3 movies
top_movies.display()

MovieID,AvgRating,MovieName
1,4.25,Movie A
2,4.25,Movie B
3,4.0,Movie C


In [0]:
from pyspark.sql.types import StructType

def generate_spark_join(dataframe1_name, dataframe2_name, dataframe1_file_path, dataframe2_file_path, join_column):
    """Return the source of a PySpark script that reads two CSV files and inner-joins them.

    The first file is read with five string columns (col1..col5) and the second
    with two (col1, col2); both are read with the header option enabled.

    Args:
        dataframe1_name: Variable name used for the first DataFrame in the
            generated script. The join result is named ``<dataframe1_name>_joined``.
        dataframe2_name: Variable name used for the second DataFrame.
        dataframe1_file_path: Path of the first CSV file.
        dataframe2_file_path: Path of the second CSV file.
        join_column: Name of the column to join on.

    Returns:
        str: The generated script.
    """
    # Schemas are emitted as DDL strings. The CSV reader accepts them directly,
    # so the generated script needs no pyspark.sql.types imports; embedding the
    # repr of a StructType would reference StructType/StructField/StringType,
    # none of which the generated script imports.
    schema1 = ", ".join(f"col{i} STRING" for i in range(1, 6))
    schema2 = ", ".join(f"col{i} STRING" for i in range(1, 3))

    # `!r` renders paths, schemas and the join column as Python string literals,
    # so backslashes (e.g. Windows paths) and quotes are escaped correctly.
    spark_code = f"""
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("SparkJoinTransformation").getOrCreate()

# Read text files and create DataFrames
{dataframe1_name} = spark.read.option("header", "true").csv({dataframe1_file_path!r}, schema={schema1!r})
{dataframe2_name} = spark.read.option("header", "true").csv({dataframe2_file_path!r}, schema={schema2!r})

# Perform join transformation
{dataframe1_name}_joined = {dataframe1_name}.join({dataframe2_name}, on={join_column!r}, how='inner')

# Show the resulting DataFrame
{dataframe1_name}_joined.show()

"""
    return spark_code

def create_github_release(repo_owner, repo_name, access_token, tag_name, release_name, release_body, asset_file_path):
    """Create a release in ``repo_owner/repo_name`` and attach a file to it.

    The release is created for ``tag_name`` with ``release_name`` as its name
    and ``release_body`` as its message; ``asset_file_path`` is then uploaded
    as an asset of that release.

    NOTE(review): `Github` is not imported in the visible code. Presumably it
    is PyGithub's ``github.Github`` - confirm and import it before use.
    """
    full_repo_name = f"{repo_owner}/{repo_name}"
    repository = Github(access_token).get_repo(full_repo_name)
    new_release = repository.create_git_release(
        tag=tag_name,
        name=release_name,
        message=release_body,
    )
    new_release.upload_asset(asset_file_path)

def main():
    """Interactively generate a Spark join script and optionally publish it.

    Prompts for the two input files, the DataFrame names and the join column,
    prints the generated script, and - if the user answers "yes" - uploads the
    script as an asset of a new GitHub release.
    """
    # Local imports: the cell defining this function does not import these
    # standard-library names at the top.
    import os
    from getpass import getpass
    from tempfile import NamedTemporaryFile

    print("Welcome to the Spark Join Transformation Generator!")

    # Read text files and create DataFrames
    dataframe1_file_path = input("Enter the path to the first text file: ")
    dataframe1_name = input("Enter the name for the first DataFrame: ")

    dataframe2_file_path = input("Enter the path to the second text file: ")
    dataframe2_name = input("Enter the name for the second DataFrame: ")

    # Prompt user for join details
    join_column = input("Enter the column to perform the join on: ")
    # NOTE(review): this value is collected but not used - generate_spark_join()
    # has no parameter for it and names the result "<first DataFrame>_joined".
    output_dataframe_name = input("Enter the name for the output DataFrame: ")

    # Generate Spark code for join transformation
    spark_join_code = generate_spark_join(dataframe1_name, dataframe2_name, dataframe1_file_path, dataframe2_file_path, join_column)
    print("\nGenerated Spark Code:")
    print(spark_join_code)

    # Deploy the generated code to GitHub as an asset file
    deploy_to_github = input("Do you want to deploy the generated code to GitHub? (yes/no): ")
    if deploy_to_github.strip().lower() != "yes":
        print("Deployment to GitHub skipped.")
        return

    repo_owner = input("Enter the owner of the GitHub repository: ")
    repo_name = input("Enter the name of the GitHub repository: ")
    # getpass keeps the token from being echoed into the terminal / cell output
    access_token = getpass("Enter your GitHub personal access token: ")
    tag_name = input("Enter the tag name for the release: ")
    release_name = input("Enter the name for the release: ")
    release_body = input("Enter the body for the release: ")

    # The temporary file is only needed for the upload, so it is written once
    # deployment is confirmed and removed afterwards even if the upload fails.
    with NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:
        temp_file.write(spark_join_code)
        asset_file_path = temp_file.name
    try:
        create_github_release(repo_owner, repo_name, access_token, tag_name, release_name, release_body, asset_file_path)
    finally:
        os.remove(asset_file_path)
    print("Release created successfully on GitHub!")

if __name__ == "__main__":
    main()

Welcome to the Spark Join Transformation Generator!


Enter the path to the first text file:  