# In [None]:
# DataFrame / SQL
# Load employees.csv into a DataFrame and list the names of everyone whose
# salary is above 55000.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col  # required by the filter expression below

spark = SparkSession.builder.appName("BigDataAnalytics").getOrCreate()
df = spark.read.csv("employees.csv", header=True, inferSchema=True)

# Register the DataFrame as a temp view so the same data can be queried with
# SQL through spark.sql (see the equivalent query below).
df.createOrReplaceTempView("employees")

# DataFrame-API form of the query.
result = df.filter(col("salary") > 55000).select("name")
# Equivalent SQL form against the "employees" temp view:
# result = spark.sql("SELECT name FROM employees WHERE salary > 55000")
result.show()

# RDD
# Same salary query as the DataFrame section, written against the raw text
# lines of employees.csv using the RDD API.
rdd = spark.sparkContext.textFile("employees.csv")

# The first line is the CSV header; drop it, and also drop blank lines so the
# positional indexing below cannot fail on an empty record.
header = rdd.first()
data = rdd.filter(lambda line: line != header and bool(line.strip()))

# Plain str.split(",") does not understand quoted fields containing commas.
# NOTE(review): presumably field 1 is the name and field 3 the salary, matching
# the DataFrame query above -- verify against the CSV header.
parsed = data.map(lambda line: line.split(",")).map(lambda x: (x[1], int(x[3])))
filtered = parsed.filter(lambda x: x[1] > 55000)

# collect() brings the (field 1, field 3) pairs back to the driver; print them
# so the section produces visible output when run as a script.
print(filtered.collect())

# Linear Regression
# Fit salary against years_experience, then score one new observation.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# The assembler packs the single predictor column into the "features" vector
# column that the regressor below is configured to read.
vec_assembler = VectorAssembler(
    inputCols=["years_experience"],
    outputCol="features",
)
df = spark.read.csv(
    "experience_salary.csv",
    header=True,
    inferSchema=True,
)
data = vec_assembler.transform(df)

lr = LinearRegression(
    featuresCol="features",
    labelCol="salary",
)
model = lr.fit(data)

# Report the fitted coefficients and intercept.
coefficients = model.coefficients
intercept = model.intercept
print("Coefficients:", coefficients)
print("Intercept:", intercept)

# Build a one-row frame for years_experience = 8 and push it through the same
# assembler so it carries the "features" column the model expects.
predict_df = vec_assembler.transform(
    spark.createDataFrame([[8]], ["years_experience"])
)
prediction = model.transform(predict_df)
prediction.select("prediction").show()

# SVM
# Train a linear support-vector classifier on svm_data.csv.
from pyspark.ml.classification import LinearSVC

# Combine feature1 and feature2 into the "features" vector column.
vec_assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features",
)
df = spark.read.csv(
    "svm_data.csv",
    header=True,
    inferSchema=True,
)
data = vec_assembler.transform(df)

# NOTE(review): LinearSVC is a binary classifier; presumably the "label"
# column holds 0/1 values -- confirm against svm_data.csv.
svm = LinearSVC(
    featuresCol="features",
    labelCol="label",
)
model = svm.fit(data)

coefficients = model.coefficients
intercept = model.intercept
print("Coefficients:", coefficients)
print("Intercept:", intercept)

# Predictions are produced on the same rows the model was fitted on, so this
# output is a sanity check rather than a held-out evaluation.
predictions = model.transform(data)
predictions.select(
    "features",
    "label",
    "prediction",
).show()

# KMeans
# Cluster the rows of points.csv into two groups using their x/y columns.
from pyspark.ml.clustering import KMeans

# Combine x and y into the "features" vector column.
vec_assembler = VectorAssembler(
    inputCols=["x", "y"],
    outputCol="features",
)
df = spark.read.csv(
    "points.csv",
    header=True,
    inferSchema=True,
)
data = vec_assembler.transform(df)

# Two clusters, with an explicit seed passed to the estimator.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(data)

centers = model.clusterCenters()
print("Cluster Centers:", centers)

# Add the "prediction" column to every input row and show it next to the
# original coordinates.
predictions = model.transform(data)
predictions.select(
    "x",
    "y",
    "prediction",
).show()

# Text Mining SVD
# Build a term-count matrix from lsi_docs.txt and compute its rank-2 SVD.
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.mllib.linalg import Vectors as MLlibVectors
from pyspark.mllib.linalg.distributed import RowMatrix

# Load the file and name its single column "text".
df = spark.read.text("lsi_docs.txt").toDF("text")
tokenizer = Tokenizer(inputCol="text", outputCol="words")
words_data = tokenizer.transform(df)

# Turn each token list into a term-count vector in the "features" column.
cv = CountVectorizer(inputCol="words", outputCol="features")
cv_model = cv.fit(words_data)
vectorized_data = cv_model.transform(words_data)

# RowMatrix belongs to the RDD-based pyspark.mllib API, while CountVectorizer
# emits pyspark.ml vectors. Vectors.fromML converts between the two without
# expanding each sparse count vector into a dense vocabulary-length array.
vectors_rdd = vectorized_data.select("features").rdd.map(
    lambda row: MLlibVectors.fromML(row[0])
)
row_matrix = RowMatrix(vectors_rdd)

# Request the top 2 singular values and also compute the U factor.
svd = row_matrix.computeSVD(2, computeU=True)

print("Singular Values:", svd.s)
print("First 2 rows of U:", svd.U.rows.take(2))