In [1]:
// Read the MovieLens 100k ratings file as an RDD of text lines.
// NOTE(review): `sc` is not defined in this notebook — presumably the SparkContext supplied by the kernel.
val rawData = sc.textFile("ml-100k/u.data")

In [2]:
rawData.first()

196	242	3	881250949

In [3]:
// Split each tab-separated line and keep only its first three fields.
val rawRatings = rawData.map(line => line.split("\t").take(3))

In [5]:
import org.apache.spark.mllib.recommendation.ALS

In [8]:
import org.apache.spark.mllib.recommendation.Rating

In [9]:
// Convert each three-field record into a typed MLlib Rating(user, product, rating).
// A record that does not have exactly three fields fails with a MatchError.
val ratings = rawRatings.map { fields =>
  val Array(user, movie, rating) = fields
  Rating(user.toInt, movie.toInt, rating.toDouble)
}

In [10]:
ratings.first()

Rating(196,242,3.0)

### Training a model on the MovieLens dataset

In [11]:
// Fit an ALS matrix-factorization model on the ratings; the hyper-parameters are
// spelled out by name so the three numeric literals are self-describing.
val model = ALS.train(ratings, rank = 50, iterations = 10, lambda = 0.01)

In [12]:
model.userFeatures

users MapPartitionsRDD[210] at mapValues at ALS.scala:255

In [13]:
model.userFeatures.count

943

In [14]:
model.productFeatures.count

1682

In [15]:
// Score a single (user, product) pair with the trained model.
val predictedRating = model.predict(user = 789, product = 123)

In [16]:
predictedRating

2.039842264056634

In [17]:
// Target user and the number of recommendations to request for them.
val userId = 789
val K = 10
// The K products the model recommends for `userId`.
val topKRecs = model.recommendProducts(user = userId, num = K)

In [18]:
println(topKRecs.mkString("\n"))

Rating(789,530,5.976424168642172)
Rating(789,641,5.87802968106169)
Rating(789,182,5.86964485232568)
Rating(789,199,5.643635699341024)
Rating(789,526,5.632784329713259)
Rating(789,511,5.578748776142128)
Rating(789,211,5.564848267553379)
Rating(789,134,5.535656295377457)
Rating(789,179,5.511331563535278)
Rating(789,156,5.492505422932971)


In [20]:
// Read the MovieLens item file as an RDD of text lines.
val movies = sc.textFile("ml-100k/u.item")

In [22]:
// Build a driver-side lookup table from movie id to title, taken from the first
// two pipe-delimited fields of every item line.
val titles = movies.map { line =>
  val fields = line.split("\\|")
  (fields(0).toInt, fields(1))
}.collectAsMap()

In [23]:
// All ratings submitted by the target user, looked up by user id.
// Uses `userId` (defined with `topKRecs`) rather than repeating the literal 789, so this
// cell cannot drift out of sync with the recommendations it is compared against.
val moviesForUser = ratings.keyBy(_.user).lookup(userId)

In [24]:
println(moviesForUser.size)

33


In [25]:
// Print the user's K highest-rated movies as (title, rating) pairs. `K` is the same
// constant used for `topKRecs`, so both lists being compared have the same length.
moviesForUser.sortBy(-_.rating).take(K).map { rated =>
  (titles(rated.product), rated.rating)
}.foreach(println)

(Godfather, The (1972),5.0)
(Trainspotting (1996),5.0)
(Dead Man Walking (1995),5.0)
(Star Wars (1977),5.0)
(Swingers (1996),5.0)
(Leaving Las Vegas (1995),5.0)
(Bound (1996),5.0)
(Fargo (1996),5.0)
(Last Supper, The (1995),5.0)
(Private Parts (1997),4.0)


In [26]:
// Print each recommendation as (movie title, predicted score).
topKRecs.map { case Rating(_, product, score) =>
  (titles(product), score)
}.foreach(println)

(Man Who Would Be King, The (1975),5.976424168642172)
(Paths of Glory (1957),5.87802968106169)
(GoodFellas (1990),5.86964485232568)
(Bridge on the River Kwai, The (1957),5.643635699341024)
(Ben-Hur (1959),5.632784329713259)
(Lawrence of Arabia (1962),5.578748776142128)
(M*A*S*H (1970),5.564848267553379)
(Citizen Kane (1941),5.535656295377457)
(Clockwork Orange, A (1971),5.511331563535278)
(Reservoir Dogs (1992),5.492505422932971)


In [27]:
import scala.io.Source

In [29]:
// Pair every rating's user with the product it rated, then group those pairs by user.
val userMovies = ratings.map(r => (r.user, r.product)).groupBy { case (user, _) => user }


In [36]:
// Read the classification training set as an RDD of text lines.
// Note: this rebinds `rawData`, which earlier held the MovieLens ratings lines.
val rawData = sc.textFile("train_noheader.tsv")

In [37]:
// Break every line into its tab-separated columns.
val records = rawData.map(_.split("\t"))

In [38]:
records.first()

Array("http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html", "4042", "{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees...

In [39]:
import org.apache.spark.mllib.regression.LabeledPoint

In [40]:
import org.apache.spark.mllib.linalg.Vectors

In [41]:
// Turn each raw record into a LabeledPoint:
//   - double quotes are stripped from every column,
//   - the last column is the integer class label,
//   - columns 4 up to (not including) the last are numeric features, with "?" read as 0.0.
val data = records.map { fields =>
  val cleaned = fields.map(_.replaceAll("\"", ""))
  val lastIdx = fields.size - 1
  val label = cleaned(lastIdx).toInt
  val features = cleaned.slice(4, lastIdx).map { v =>
    if (v == "?") 0.0 else v.toDouble
  }
  LabeledPoint(label, Vectors.dense(features))
}

In [42]:
// Mark the parsed dataset for caching; the following cells run several jobs over it.
data.cache

MapPartitionsRDD[227] at map at <console>:33

In [44]:
// Total number of labeled points in the dataset.
val numData = data.count

In [45]:
// Same record parsing as `data`, except that negative feature values are clamped to 0.0.
// This is the dataset handed to the Naive Bayes trainer.
// NOTE(review): presumably needed because Naive Bayes expects non-negative features — confirm.
val nbData = records.map { fields =>
  val cleaned = fields.map(_.replaceAll("\"", ""))
  val lastIdx = fields.size - 1
  val label = cleaned(lastIdx).toInt
  val features = cleaned.slice(4, lastIdx).map { v =>
    val parsed = if (v == "?") 0.0 else v.toDouble
    if (parsed < 0) 0.0 else parsed
  }
  LabeledPoint(label, Vectors.dense(features))
}

### Building Classification Model with Logistic regression

In [47]:
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes


In [48]:
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy

In [49]:
// Iteration count passed to the SGD-based trainers below.
val numIterations = 10
// Depth limit passed to the decision tree trainer below.
val maxTreeDepth = 5

### Training Logistic Regression

In [50]:
// Fit a logistic regression classifier on `data`, running `numIterations` SGD iterations.
val lrModel = LogisticRegressionWithSGD.train(data, numIterations)

### Train Support Vector Machine with Stochastic Gradient Descent

In [51]:
// Fit an SVM classifier on `data`, running `numIterations` SGD iterations.
val svmModel = SVMWithSGD.train(data, numIterations)

### Train Naive Bayes Model

In [52]:
// Fit a Naive Bayes classifier on `nbData` (the variant of the dataset with negatives clamped to 0.0).
val nbModel = NaiveBayes.train(nbData)

### Train Decision Tree

In [53]:
// Fit a classification decision tree on `data` using entropy impurity, limited to `maxTreeDepth` levels.
val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)

## Make Predictions

In [55]:
// Spot check on a single example: take the first labeled point and predict its class
// with the logistic regression model.
val dataPoint = data.first
val prediction = lrModel.predict(dataPoint.features)

In [56]:
// Actual label of that same point, for comparison with `prediction`.
val trueLabel = dataPoint.label

In [57]:
// Batch prediction: score the feature vector of every point, then peek at the first five results.
val predictions = lrModel.predict(data.map(_.features))
predictions.take(5)

Array(1.0, 1.0, 1.0, 1.0, 1.0)

### Evaluate the performance of the classifiers using ROC, F-measure, Accuracy, Precision and Recall

In [59]:
// Count the points whose predicted class equals the true label. `sum` yields a Double,
// so the division below is floating-point rather than integer division.
val lrTotalCorrect = data.map { point =>
  if (lrModel.predict(point.features) == point.label) 1 else 0
}.sum
// Accuracy = correct / total. Reuses `numData` (the count already computed for `data`)
// instead of launching another `data.count` job.
val lrAccuracy = lrTotalCorrect / numData

In [60]:
lrTotalCorrect

3806.0

In [61]:
lrAccuracy

0.5146720757268425