In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, avg, desc, max
# Start (or reuse) the Spark session that the DataFrame snippets below rely on.
spark = SparkSession.builder.appName("DataFrameCheat").getOrCreate()

# Load CSV: first row is the header, column types are inferred from the data.
df = spark.read.csv("filename.csv", header=True, inferSchema=True)

# NOTE(review): the snippets below reference many different columns
# (Salary, Department, Genre, Rating, ...) of the same file; presumably they
# are independent illustrations rather than one pipeline - confirm.

# Filter records: keep rows whose Salary is greater than 5000.
df.filter(col("Salary") > 5000).show()

# Group by column and aggregate.
# `sum` is pyspark.sql.functions.sum (imported at the top of the cell),
# which shadows the Python builtin of the same name.
df.groupBy("Department").agg(sum("Salary").alias("TotalSalary")).show()

# Max salary row: sort descending by Salary and keep the first row.
df.orderBy(desc("Salary")).limit(1).show()

# Count of records per group.
df.groupBy("Genre").agg(count("*").alias("TotalCount")).show()

# Top-N sorting example: the three highest-rated titles.
df.orderBy(desc("Rating")).select("Title", "Rating").show(3)

# Add new calculated column.
df = df.withColumn("TotalRevenue", col("Quantity") * col("Price"))

# Rename column.
df = df.withColumnRenamed("OldName", "NewName")

# Save output as CSV. mode="overwrite" replaces any existing output so the
# cell can be re-run; without it the write fails once "output.csv" exists.
df.write.csv("output.csv", mode="overwrite", header=True)

# Register the DataFrame as a temporary view named "table".
# The original line had no receiver (a bare ".createOrReplaceTempView(...)"),
# which is a syntax error; the call belongs on `df`.
df.createOrReplaceTempView("table")

In [None]:
# RDD
# SparkContext is exported by the top-level `pyspark` package, not `pyspark.sql`.
from pyspark import SparkContext

# Obtain the running SparkContext (or start one); `sc` was previously used
# without ever being defined.
sc = SparkContext.getOrCreate()

# Load a text file as an RDD of lines. Kept under its own name so the
# word-count example below operates on text rather than on the integer RDD.
lines_rdd = sc.textFile("file.txt")

# Parallelize list
rdd = sc.parallelize([1, 2, 3, 4, 5])

# Map, Filter, Reduce
# The map/filter results are not assigned or collected here, so these two
# lines only illustrate the call syntax.
rdd.map(lambda x: x * 2)
rdd.filter(lambda x: x > 50)
rdd.reduce(lambda a, b: a + b)

# Word Count: split each line into words, pair each word with 1, sum the
# pairs per word, then sort by count (highest first) and collect the result.
(
    lines_rdd.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False)
    .collect()
)

In [None]:
# K-Means clustering
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
# Load the customer data and keep only the two columns used as features.
df = spark.read.csv("customers.csv", header=True, inferSchema=True)
df = df.select("Annual Income (k$)", "Spending Score (1-100)")

# Assemble the two feature columns into a single vector column, then scale it.
assembler = VectorAssembler(inputCols=["Annual Income (k$)", "Spending Score (1-100)"], outputCol="features")
df_vec = assembler.transform(df)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
df_scaled = scaler.fit(df_vec).transform(df_vec)

# Apply KMeans with 3 clusters on the scaled features.
# A fixed seed makes the cluster assignment reproducible across runs.
kmeans = KMeans(k=3, featuresCol="scaledFeatures", predictionCol="cluster", seed=42)
model = kmeans.fit(df_scaled)
model.clusterCenters()

# Attach the predicted cluster id to every row.
df_clustered = model.transform(df_scaled)
df_clustered.select("Annual Income (k$)", "Spending Score (1-100)", "cluster").show()

# Write only the scalar columns: `features` and `scaledFeatures` are vector
# columns, which the CSV data source cannot serialise.
# mode="overwrite" lets the cell be re-run after the output already exists.
df_clustered.select("Annual Income (k$)", "Spending Score (1-100)", "cluster").write.csv(
    "customers_clustered.csv", mode="overwrite", header=True
)

In [None]:
# Linear regression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
# Read the housing data (header row present, column types inferred).
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("houses.txt")
)

# Combine the predictor columns into one "features" vector and expose the
# price column under the name "label".
assembler = VectorAssembler(
    inputCols=["Size (sqft)", "Bedrooms"],
    outputCol="features",
)
df_vec = assembler.transform(df)
df_vec = df_vec.select("features", col("Price ($)").alias("label"))

# 80/20 train-test split with a fixed seed.
train, test = df_vec.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit the regression on the training portion.
lr = LinearRegression()
model = lr.fit(train)

# Report fit quality on the held-out portion.
results = model.evaluate(test)
for metric_label, metric_value in (
    ("R²:", results.r2),
    ("RMSE:", results.rootMeanSquaredError),
):
    print(metric_label, metric_value)

# Show predictions next to the true labels for the test rows.
predictions = model.transform(test)
predictions.select("features", "label", "prediction").show()