# Section 6

In [15]:
from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType
from pyspark.ml.recommendation import ALS
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml.feature import VectorAssembler
import sys
import codecs

In [16]:
# Create a SparkSession (Note, the config section is only for Windows!)
spark = SparkSession.builder.appName("SparkSQL").master("local[*]").getOrCreate()


In [17]:
DATA_ROOT = "../spark_course_resources"

## Popular Movies

In [9]:
def loadMovieNames():
    movieNames = {}
    with codecs.open(f"{DATA_ROOT}/ml-100k/u.item", "r", encoding='ISO-8859-1', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])
    
names = loadMovieNames()
    
ratings = spark.read.option("sep", "\t").schema(moviesSchema) \
    .csv(f"{DATA_ROOT}/ml-100k/u.data")
    
print("Training recommendation model...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \
    .setRatingCol("rating")
    
model = als.fit(ratings)

# Manually construct a dataframe of the user ID's we want recs for
userID = 5
userSchema = StructType([StructField("userID", IntegerType(), True)])
users = spark.createDataFrame([[userID,]], userSchema)

recommendations = model.recommendForUserSubset(users, 10).collect()

print("Top 10 recommendations for user ID " + str(userID))

for userRecs in recommendations:
    myRecs = userRecs[1]  #userRecs is (userID, [Row(movieId, rating), Row(movieID, rating)...])
    for rec in myRecs: #my Recs is just the column of recs for the user
        movie = rec[0] #For each rec in the list, extract the movie ID and rating
        rating = rec[1]
        movieName = names[movie]
        print(movieName + str(rating))
        



Training recommendation model...
Top 10 recommendations for user ID 5
Unzipped (1995)8.09460735321045
Roommates (1995)7.99899435043335
Dangerous Beauty (1998)7.425593376159668
Stalker (1979)7.196524620056152
Carried Away (1996)6.729679584503174
Wings of Desire (1987)6.182162761688232
Maya Lin: A Strong Clear Vision (1994)6.112681865692139
Magic Hour, The (1998)6.112635135650635
Kaspar Hauser (1993)6.046888828277588
Chungking Express (1994)5.958040237426758


### Height Regression

In [14]:
# Load up our data and convert it to the format MLLib expects.
inputLines = spark.sparkContext.textFile(f"{DATA_ROOT}/regression.txt")
data = inputLines.map(lambda x: x.split(",")).map(lambda x: (float(x[0]), Vectors.dense(float(x[1]))))

# Convert this RDD to a DataFrame
colNames = ["label", "features"]
df = data.toDF(colNames)

# Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
# Perhaps you're importing data from a real database. Or you are using structured streaming
# to get your data.

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create our linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Now see if we can predict values in our test data.
# Generate predictions using our linear regression model for all features in our
# test dataframe:
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
for prediction in predictionAndLabel:
  pass # print(prediction)

### Real State

In [13]:
# Load up data as dataframe
data = spark.read.option("header", "true").option("inferSchema", "true")\
    .csv(f"{DATA_ROOT}/realestate.csv")

assembler = VectorAssembler().setInputCols(["HouseAge", "DistanceToMRT", \
                           "NumberConvenienceStores"]).setOutputCol("features")

df = assembler.transform(data).select("PriceOfUnitArea", "features")

# Let's split our data into training data and testing data
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

# Now create our decision tree
dtr = DecisionTreeRegressor().setFeaturesCol("features").setLabelCol("PriceOfUnitArea")

# Train the model using our training data
model = dtr.fit(trainingDF)

# Now see if we can predict values in our test data.
# Generate predictions using our decision tree model for all features in our
# test dataframe:
fullPredictions = model.transform(testDF).cache()

# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("PriceOfUnitArea").rdd.map(lambda x: x[0])

# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

# Print out the predicted and actual values for each point
for prediction in predictionAndLabel:
  print(prediction)

(19.449999999999996, 11.6)
(19.449999999999996, 12.8)
(19.449999999999996, 12.9)
(12.800000000000011, 13.7)
(12.800000000000011, 13.8)
(19.449999999999996, 14.7)
(12.8, 15.0)
(17.8, 15.4)
(19.449999999999996, 15.5)
(12.8, 15.6)
(17.8, 15.9)
(12.800000000000011, 16.1)
(19.449999999999996, 16.7)
(12.8, 17.4)
(20.433333333333337, 18.3)
(19.449999999999996, 18.8)
(20.433333333333337, 19.0)
(15.8, 19.2)
(26.518518518518526, 21.3)
(30.599999999999994, 22.0)
(12.8, 22.1)
(20.433333333333337, 22.8)
(26.518518518518526, 22.9)
(26.518518518518526, 23.2)
(12.800000000000011, 23.5)
(26.518518518518526, 23.6)
(26.518518518518526, 23.7)
(26.518518518518526, 24.4)
(30.909090909090903, 24.6)
(26.518518518518526, 24.7)
(30.909090909090903, 25.3)
(26.518518518518526, 25.6)
(26.518518518518526, 25.6)
(23.6375, 25.7)
(20.433333333333337, 25.9)
(31.645454545454548, 26.5)
(26.518518518518526, 26.6)
(44.63157894736842, 26.9)
(26.518518518518526, 27.0)
(12.800000000000011, 27.0)
(26.518518518518526, 27.3)
(38

### Customer Orders