In [None]:
pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Swiggy")
    .getOrCreate()
)

# Read the CSV at `file_path`: the first row supplies the column names and
# Spark infers the column types from the data.
file_path = "/content/drive/MyDrive/Swiggy.csv"
df = spark.read.csv(path=file_path, inferSchema=True, header=True)

# Print the inferred schema, then preview the first five rows.
df.printSchema()
df.show(n=5)

In [None]:
from pyspark.sql.functions import col, regexp_replace

# `spark` and `df` come from the loading cell. No second SparkSession is built
# here: getOrCreate() would only hand back the session that is already
# running, and giving it another app name is misleading.

# Keep only rows whose rating is not the "--" placeholder and whose cost is not
# a bare "₹". In Spark SQL a comparison against NULL yields NULL, so rows with
# a NULL rating or cost are dropped by this filter as well.
df_filtered = df.filter(
    (col("rating").cast("string") != "--") & (col("cost") != "₹")
)

# Columns that are dropped from the cleaned DataFrame.
columns_to_remove = [
    "licension no",
    "restaurant link",
    "menu",
    "city link",
    "subcity",
    "subcity link",
    "restaurant code",
    "price",
]
df_filtered = df_filtered.drop(*columns_to_remove)

# Replace the "₹" symbol with "Rs" wherever it occurs inside a cost value.
# The previous equality-based `when(col("cost") == "₹", ...)` could never
# match, because the filter above has already removed every row whose cost is
# exactly "₹"; a substring replacement is what actually rewrites the values.
df_filtered = df_filtered.withColumn(
    "cost", regexp_replace(col("cost"), "₹", "Rs")
)

# Preview the cleaned DataFrame.
df_filtered.show()


In [None]:
from pyspark.sql.functions import isnan, when, count, col
# Count missing values (NULL or NaN) in each column of the filtered DataFrame.
# DataFrame.show() only prints and returns None, so the counts DataFrame is
# kept in `null_counts` and displayed as a separate step.
null_counts = df_filtered.select(
    [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in df_filtered.columns
    ]
)
null_counts.show()


In [None]:
# Discard every row that contains a null in any column.
df_no_nulls = df_filtered.dropna()

# Preview the null-free DataFrame.
df_no_nulls.show()

In [None]:
from pyspark.sql.functions import isnan, when, count, col
# Re-count missing values (NULL or NaN) per column after the null rows were
# dropped. DataFrame.show() only prints and returns None, so the counts
# DataFrame is kept in `null_counts` and displayed as a separate step.
null_counts = df_no_nulls.select(
    [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in df_no_nulls.columns
    ]
)
null_counts.show()

In [None]:
# Destination on Drive for the CSV output.
output_path = "/content/drive/MyDrive/CA640"

# Collapse the data into one partition before writing.
df_single_partition = df_no_nulls.coalesce(numPartitions=1)

# Write as CSV with a header row, replacing any output already at this path.
df_single_partition.write.mode("overwrite").option("header", True).csv(output_path)


In [None]:
import glob
import os

import pandas as pd

# Directory the Spark job wrote its CSV output into.
csv_dir = '/content/drive/MyDrive/CA640'

# Spark names its part file itself (part-<n>-<uuid>-c000.csv) and the name
# changes on every write, so look the file up instead of hard-coding a name
# that stops existing as soon as the write cell is re-run with "overwrite".
part_files = sorted(glob.glob(os.path.join(csv_dir, 'part-*.csv')))
if not part_files:
    raise FileNotFoundError(
        f"No Spark part file (part-*.csv) found in {csv_dir}; run the write cell first."
    )

# The data was coalesced to one partition before writing, so the first match
# is the part file to load.
csv_file_path = part_files[0]

# Read data from CSV into a pandas DataFrame.
# NOTE(review): this rebinds `df`, which earlier cells use for the Spark
# DataFrame; re-running those cells after this one needs the load cell first.
df = pd.read_csv(csv_file_path)

# Display the DataFrame
print(df)
