In [1]:
val rawData = sc.textFile("ml-100k/u.data")

In [2]:
rawData.first()

196	242	3	881250949

In [3]:
val rawRatings = rawData.map(_.split("\t").take(3))

In [5]:
import org.apache.spark.mllib.recommendation.ALS

In [8]:
import org.apache.spark.mllib.recommendation.Rating

In [9]:
val ratings = rawRatings.map { case Array(user, movie, rating) => 
Rating(user.toInt, movie.toInt, rating.toDouble)}

In [10]:
ratings.first()

Rating(196,242,3.0)

### Traing a model on the MovieLens

In [11]:
val model = ALS.train(ratings, 50, 10, 0.01)

In [12]:
model.userFeatures

users MapPartitionsRDD[210] at mapValues at ALS.scala:255

In [13]:
model.userFeatures.count

943

In [14]:
model.productFeatures.count

1682

In [15]:
val predictedRating = model.predict(789, 123)

In [16]:
predictedRating

2.039842264056634

In [17]:
val userId = 789
val K = 10
val topKRecs = model.recommendProducts(userId, K)

In [18]:
println(topKRecs.mkString("\n"))

Rating(789,530,5.976424168642172)
Rating(789,641,5.87802968106169)
Rating(789,182,5.86964485232568)
Rating(789,199,5.643635699341024)
Rating(789,526,5.632784329713259)
Rating(789,511,5.578748776142128)
Rating(789,211,5.564848267553379)
Rating(789,134,5.535656295377457)
Rating(789,179,5.511331563535278)
Rating(789,156,5.492505422932971)


In [20]:
val movies = sc.textFile("ml-100k/u.item")

In [22]:
val titles = movies.map(line => line.split("\\|").take(2)).map(array => (array(0).toInt,
array(1))).collectAsMap()

In [23]:
val moviesForUser = ratings.keyBy(_.user).lookup(789)

In [24]:
println(moviesForUser.size)

33


In [25]:
moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.
   product), rating.rating)).foreach(println)

(Godfather, The (1972),5.0)
(Trainspotting (1996),5.0)
(Dead Man Walking (1995),5.0)
(Star Wars (1977),5.0)
(Swingers (1996),5.0)
(Leaving Las Vegas (1995),5.0)
(Bound (1996),5.0)
(Fargo (1996),5.0)
(Last Supper, The (1995),5.0)
(Private Parts (1997),4.0)


In [26]:
topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println)

(Man Who Would Be King, The (1975),5.976424168642172)
(Paths of Glory (1957),5.87802968106169)
(GoodFellas (1990),5.86964485232568)
(Bridge on the River Kwai, The (1957),5.643635699341024)
(Ben-Hur (1959),5.632784329713259)
(Lawrence of Arabia (1962),5.578748776142128)
(M*A*S*H (1970),5.564848267553379)
(Citizen Kane (1941),5.535656295377457)
(Clockwork Orange, A (1971),5.511331563535278)
(Reservoir Dogs (1992),5.492505422932971)


In [27]:
import scala.io.Source

In [29]:
val userMovies = ratings.map{ case Rating(user, product, rating) =>
   (user, product) }.groupBy(_._1)


In [36]:
val rawData = sc.textFile("train_noheader.tsv")

In [37]:
val records = rawData.map(line => line.split("\t"))

In [38]:
records.first()

Array("http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html", "4042", "{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees...

In [39]:
import org.apache.spark.mllib.regression.LabeledPoint

In [40]:
import org.apache.spark.mllib.linalg.Vectors

In [41]:
val data = records.map { r => 
val trimmed = r.map(_.replaceAll("\"", ""))
val label = trimmed(r.size -1).toInt
val features = trimmed.slice(4, r.size -1).map(d => if (d ==
"?") 0.0 else d.toDouble)
LabeledPoint(label, Vectors.dense(features))}

In [42]:
data.cache

MapPartitionsRDD[227] at map at <console>:33

In [44]:
val numData = data.count

In [45]:
val nbData = records.map { r =>
val trimmed = r.map(_.replaceAll("\"", ""))
val label = trimmed(r.size - 1).toInt
val features = trimmed.slice(4, r.size -1).map(d => if (d == "?") 0.0 
else d.toDouble).map(d => if (d < 0) 0.0 else d)
LabeledPoint(label, Vectors.dense(features))}

### Building Classification Model with Logistic regression

In [47]:
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes


In [48]:
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy

In [49]:
val numIterations = 10
val maxTreeDepth = 5

### Traing LogisticRegression

In [50]:
val lrModel = LogisticRegressionWithSGD.train(data, numIterations)

### Train Support Vector Machine with Gradient D kernel

In [51]:
val svmModel = SVMWithSGD.train(data, numIterations)

### Train Naive Bayes Model

In [52]:
val nbModel = NaiveBayes.train(nbData)

### Train Decision Tree

In [53]:
val dtModel = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)

## Make Prediction 

In [55]:
val dataPoint = data.first
val prediction = lrModel.predict(dataPoint.features)

In [56]:
val trueLabel = dataPoint.label

In [57]:
val predictions = lrModel.predict(data.map(lp => lp.features))
predictions.take(5)

Array(1.0, 1.0, 1.0, 1.0, 1.0)

### Evaluate the performance of the classification using ROC, F, Accuracy, Precision and Recall

In [59]:
val lrTotalCorrect = data.map { point =>
if (lrModel.predict(point.features) == point.label) 1 else 0}.sum
val lrAccuracy = lrTotalCorrect / data.count

In [60]:
lrTotalCorrect

3806.0

In [61]:
lrAccuracy

0.5146720757268425

Gives 51 percent, not very impressive

In [62]:
val svmTotalCorrect = data.map { point =>
if (svmModel.predict(point.features) == point.label) 1 else 0}.sum
val nbTotalCorrect = nbData.map { point =>
if (nbModel.predict(point.features) == point.label) 1 else 0}.sum

In [63]:
val dtTotalCorrect = data.map { point =>
val score = dtModel.predict(point.features)
val predicted = if (score > 0.5) 1 else 0
if (predicted == point.label) 1 else 0}.sum

In [64]:
val svmAccuracy = svmTotalCorrect / numData

In [67]:
svmAccuracy

0.5146720757268425

In [65]:
val nbAccuracy = nbTotalCorrect / numData

In [68]:
nbAccuracy

0.5803921568627451

In [66]:
val dtAccuracy = dtTotalCorrect / numData

In [69]:
dtAccuracy

0.6482758620689655

### Precision and recall

In [71]:
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
   val metrics = Seq(lrModel, svmModel).map { model =>
     val scoreAndLabels = data.map { point =>
       (model.predict(point.features), point.label)
     }
     val metrics = new BinaryClassificationMetrics(scoreAndLabels)
     (model.getClass.getSimpleName, metrics.areaUnderPR, metrics.
   areaUnderROC)
   }

In [73]:
val nbMetrics = Seq(nbModel).map { model =>
val scoreAndLabels = nbData.map { point =>
val score = model.predict(point.features)
(if (score > 0.5) 1.0 else 0.0, point.label)}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)}

In [75]:
val dtMetrics = Seq(dtModel).map{ model =>
val scoreAndLabels = data.map { point =>
val score = model.predict(point.features)
(if (score > 0.5) 1.0 else 0.0, point.label)}
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
(model.getClass.getSimpleName, metrics.areaUnderPR, metrics.areaUnderROC)}


In [76]:
val allMetrics = metrics ++ nbMetrics ++ dtMetrics
allMetrics.foreach{case (m, pr, roc) => 
println(f"$m, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")}

LogisticRegressionModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
SVMModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
NaiveBayesModel, Area under PR: 68.0851%, Area under ROC: 58.3559%
DecisionTreeModel, Area under PR: 74.3081%, Area under ROC: 64.8837%


### Standardize Feature

In [77]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix

In [78]:
val vectors = data.map(lp => lp.features)

In [79]:
val matrix = new RowMatrix(vectors)

In [80]:
val matrixSummary = matrix.computeColumnSummaryStatistics()

In [81]:
println(matrixSummary.mean)

[0.41225805299526774,2.76182319198661,0.46823047328613876,0.21407992638350257,0.0920623607189991,0.04926216043908034,2.255103452212025,-0.10375042752143329,0.0,0.05642274498417848,0.02123056118999324,0.23377817665490225,0.2757090373659231,0.615551048005409,0.6603110209601082,30.077079107505178,0.03975659229208925,5716.598242055454,178.75456389452327,4.960649087221106,0.17286405047031753,0.10122079189276531]


In [83]:
println(matrixSummary.min)

[0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.045564223,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]


In [85]:
println(matrixSummary.max)

[0.999426,363.0,1.0,1.0,0.980392157,0.980392157,21.0,0.25,0.0,0.444444444,1.0,0.716883117,113.3333333,1.0,1.0,100.0,1.0,207952.0,4997.0,22.0,1.0,1.0]


In [86]:
println(matrixSummary.variance)

[0.10974244167559023,74.30082476809655,0.04126316989120245,0.021533436332001124,0.009211817450882448,0.005274933469767929,32.53918714591818,0.09396988697611537,0.0,0.001717741034662896,0.020782634824610638,0.0027548394224293023,3.6837889196744116,0.2366799607085986,0.22433071201674218,415.87855895438463,0.03818116876739597,7.877330081138441E7,32208.11624742624,10.453009045764313,0.03359363403832387,0.0062775328842146995]


In [87]:
println(matrixSummary.numNonzeros)

[5053.0,7354.0,7172.0,6821.0,6160.0,5128.0,7350.0,1257.0,0.0,7362.0,157.0,7395.0,7355.0,4552.0,4883.0,7347.0,294.0,7378.0,7395.0,6782.0,6868.0,7235.0]


In [88]:
import org.apache.spark.mllib.feature.StandardScaler

In [90]:
val scaler = new StandardScaler(withMean = true, withStd = true).fit(vectors)

In [92]:
val scaledData = data.map(lp => LabeledPoint(lp.label, scaler.transform(lp.features)))

In [93]:
println(data.first.features)

[0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]


In [94]:
println(scaledData.first.features)

[1.1376473364976751,-0.08193557169294784,1.0251398128933333,-0.05586356442541853,-0.4688932531289351,-0.35430532630793654,-0.3175352172363122,0.3384507982396541,0.0,0.8288221733153222,-0.14726894334628504,0.22963982357812907,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642484,-0.2034625779299476,-0.03296720969690467,-0.04878112975579767,0.9400699751165406,-0.10869848852526329,-0.27882078231369967]


In [95]:
println((0.789131 - 0.412258) / math.sqrt(0.1097))

1.1378675465938966


In [98]:
val lrModelScaled  = LogisticRegressionWithSGD.train(scaledData, numIterations)
val lrTotalCorrectScaled = scaledDAta.map { point => if (lrModelScaled.predict(point.features) == point.label) 1 else 0}.sum
val lrAccuracyScaled = lrTotalCorrectScaled / numData
val lrPredictionsVsTrue = scaledData.map { point => (lrModelScaled.predict(point.features), point.label)}


In [101]:
val lrMetricsScaled = new BinaryClassificationMetrics(lrPredictionsVsTrue)

In [102]:
val lrPr = lrMetricsScaled.areaUnderPR

In [109]:
lrModelScaled.getClass.getSimpleName

LogisticRegressionModel

In [110]:
lrAccuracyScaled * 100

62.04192021636241

In [116]:
println("Area under PR", lrPr * 100.0) 

(Area under PR,72.72540762713375)


In [117]:
println("Area under ROC:", lrRoc * 100)

(Area under ROC:,61.96629669112512)


In [118]:
val categories = records.map(r => r(3)).distinct.collect.zipWithIndex.toMap

In [119]:
val numCategories = categories.size

In [120]:
println(categories)

Map("weather" -> 0, "sports" -> 1, "unknown" -> 10, "computer_internet" -> 11, "?" -> 8, "culture_politics" -> 9, "religion" -> 4, "recreation" -> 7, "arts_entertainment" -> 5, "health" -> 12, "law_crime" -> 6, "gaming" -> 13, "business" -> 2, "science_technology" -> 3)


In [121]:
println(numCategories)

14


In [122]:
val dataCategories = records.map { r => val trimmed = r.map(_.replaceAll ("\"", ""))
val label = trimmed(r.size -1).toInt
val categoryIdx = categories(r(3))
val categoryFeatures = Array.ofDim[Double](numCategories)
categoryFeatures(categoryIdx) = 1.0
val otherFeatures = trimmed.slice(4, r.size -1).map(d => if (d == "?") 0.0 else d.toDouble)
val features = categoryFeatures ++ otherFeatures
LabeledPoint(label, Vectors.dense(features))}

In [123]:
println(dataCategories.first)

(0.0,[0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])


In [125]:
val scalerCats = new StandardScaler(withMean = true, withStd = true).fit(dataCategories.map(lp => lp.features))

In [126]:
val scaledDataCats = dataCategories.map(lp => LabeledPoint(lp.label, scalerCats.transform(lp.features)))

In [127]:
println(dataCategories.first.features)

[0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]


In [128]:
println(scaledDataCats.first.features)

[-0.02326210589837061,-0.23272797709480803,2.7207366564548514,-0.2016540523193296,-0.09914991930875496,-0.38181322324318134,-0.06487757239262681,-0.4464212047941535,-0.6807527904251456,-0.22052688457880879,-0.028494000387023734,-0.20418221057887365,-0.2709990696925828,-0.10189469097220732,1.1376473364976751,-0.08193557169294784,1.0251398128933333,-0.05586356442541853,-0.4688932531289351,-0.35430532630793654,-0.3175352172363122,0.3384507982396541,0.0,0.8288221733153222,-0.14726894334628504,0.22963982357812907,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642484,-0.2034625779299476,-0.03296720969690467,-0.04878112975579767,0.9400699751165406,-0.10869848852526329,-0.27882078231369967]


In [130]:
val lrModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIterations)

In [131]:
val lrTotalCorrectScaledCats = scaledDataCats.map { point => if (lrModelScaledCats.predict(point.features) == point.label) 1 else 0}.sum

In [132]:
val lrAccuracyScaledCats = lrTotalCorrectScaledCats / numData

In [133]:
val lrPredictionsVsTrueCats = scaledDataCats.map { point => (lrModelScaledCats.predict(point.features),
point.label)}

In [134]:
val lrMetricsScaledCats = new BinaryClassificationMetrics(lrPredictionsVsTrueCats)

In [136]:
val lrPrCats = lrMetricsScaledCats.areaUnderPR

In [137]:
val lrRocCats = lrMetricsScaledCats.areaUnderROC

In [140]:
lrModelScaledCats.getClass.getSimpleName


LogisticRegressionModel

In [141]:
lrAccuracyScaledCats * 100

66.57200811359026

In [142]:
lrPrCats * 100

75.79640787676577

In [143]:
lrRocCats * 100

66.54826844243996

By application a feature standardization transformation to the data, we improve both the accuracy and AUC measure from 50 percent to 62 percent

In [144]:
val dataNB = records.map { r => 
val trimmed = r.map(_.replaceAll("\"", ""))
val label = trimmed(r.size - 1).toInt
val categoryIdx = categories(r(3))
val categoryFeatures = Array.ofDim[Double](numCategories)
categoryFeatures(categoryIdx) = 1.0
LabeledPoint(label, Vectors.dense(categoryFeatures))}

In [146]:
val nbModelCats = NaiveBayes.train(dataNB)
val nbTotalCorrectCats = dataNB.map { point => 
if (nbModelCats.predict(point.features) == point.label) 1 else 0}.sum
val nbAccuracyCats = nbTotalCorrectCats / numData

In [148]:
val nbPredictionsVsTrueCats = dataNB.map {point => (nbModelCats.predict(point.features), point.label)}

In [149]:
val nbMetricsCats = new BinaryClassificationMetrics(nbPredictionsVsTrueCats)

In [152]:
val nbPrCats = nbMetricsCats.areaUnderPR

In [153]:
val nbRocCats = nbMetricsCats.areaUnderROC

In [154]:
nbModelCats.getClass.getSimpleName

NaiveBayesModel

In [155]:
nbAccuracyCats * 100

60.96010818120352

In [156]:
nbPrCats * 100

74.05222106704076

In [157]:
nbRocCats * 100

60.513849415494455

In [160]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization.Updater

In [166]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization.Updater
import org.apache.spark.mllib.optimization.SimpleUpdater
import org.apache.spark.mllib.optimization.L1Updater
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.classification.ClassificationModel

In [167]:
import org.apache.spark.mllib.tree.impurity.Impurity
   import org.apache.spark.mllib.tree.impurity.Entropy
   import org.apache.spark.mllib.tree.impurity.Gini
   def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int,
   impurity: Impurity) = {
     DecisionTree.train(input, Algo.Classification, impurity,
   maxDepth)
   }


## Make Decision Trees

In [168]:
val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20).map { param =>
     val model = trainDTWithParams(data, param, Entropy)
     val scoreAndLabels = data.map { point =>
       val score = model.predict(point.features)
       (if (score > 0.5) 1.0 else 0.0, point.label)
     }
     val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  (s"$param tree depth", metrics.areaUnderROC)
}


In [169]:
dtResultsEntropy.foreach{case (param, auc) => println(f"$param, AUC = ${auc * 100}%2.2f%%")}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 61.68%
3 tree depth, AUC = 62.61%
4 tree depth, AUC = 63.63%
5 tree depth, AUC = 64.88%
10 tree depth, AUC = 76.26%
20 tree depth, AUC = 98.45%


In [171]:
def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) = {
     val nb = new NaiveBayes
     nb.setLambda(lambda)
     nb.run(input)
   }
   


60.513849415494455
60.513849415494455
60.513849415494455
60.513849415494455
60.513849415494455


In [172]:
val nbResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
     val model = trainNBWithParams(dataNB, param)
     val scoreAndLabels = dataNB.map { point =>
       (model.predict(point.features), point.label)
     }
     val metrics = new BinaryClassificationMetrics(scoreAndLabels)
     (s"$param lambda", metrics.areaUnderROC)
   }

In [179]:
nbResults

List((0.001 lambda,0.6051384941549446), (0.01 lambda,0.6051384941549446), (0.1 lambda,0.6051384941549446), (1.0 lambda,0.6051384941549446), (10.0 lambda,0.6051384941549446))

### Split the data set into 80 and 20

In [180]:
val trainTestSplit = scaledDataCats.randomSplit(Array(0.8, 0.2), 123)

In [181]:
val train = trainTestSplit(0)

In [182]:
val test = trainTestSplit(1)

In [183]:
train

MapPartitionsRDD[1076] at randomSplit at <console>:73

In [184]:
test

MapPartitionsRDD[1077] at randomSplit at <console>:73

### Unsupervised Leraning with Scala

In [189]:
val movies = sc.textFile("ml-100k/u.item")

In [190]:
println(movies.first)

1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0


In [191]:
movies.take(3)

Array(1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0, 2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0, 3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0)

In [192]:
val genres = sc.textFile("ml-100k/u.genre")

In [193]:
genres.take(5).foreach(println)

unknown|0
Action|1
Adventure|2
Animation|3
Children's|4


In [194]:
val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")).map(array => (array(1), array(0))).collectAsMap

In [195]:
println(genreMap)

Map(2 -> Adventure, 5 -> Comedy, 12 -> Musical, 15 -> Sci-Fi, 8 -> Drama, 18 -> Western, 7 -> Documentary, 17 -> War, 1 -> Action, 4 -> Children's, 11 -> Horror, 14 -> Romance, 6 -> Crime, 0 -> unknown, 9 -> Fantasy, 16 -> Thriller, 3 -> Animation, 10 -> Film-Noir, 13 -> Mystery)


In [197]:
val titlesAndGenres = movies.map(_.split("\\|")).map { array =>
     val genres = array.toSeq.slice(5, array.size)
     val genresAssigned = genres.zipWithIndex.filter { case (g, idx)
     =>
       g == "1"
     }.map { case (g, idx) =>
       genreMap(idx.toString)
     }
     (array(0).toInt, (array(1), genresAssigned))
   }
   println(titlesAndGenres.first)


(1,(Toy Story (1995),ArrayBuffer(Animation, Children's, Comedy)))


In [199]:
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

In [200]:
val rawData = sc.textFile("ml-100k/u.data")

In [201]:
val rawRatings = rawData.map(_.split("\t").take(3))

In [203]:
val ratings = rawRatings.map { case Array(user, movie, rating) => 
Rating(user.toInt, movie.toInt, rating.toDouble)}

In [204]:
ratings.cache

MapPartitionsRDD[1106] at map at <console>:67

In [205]:
val alsModel = ALS.train(ratings, 50, 10, 0.1)

In [206]:
import org.apache.spark.mllib.linalg.Vectors

In [207]:
val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor))}

In [208]:
val movieVectors = movieFactors.map(_._2)

In [209]:
val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor))}

In [210]:
val userVectors = userFactors.map(_._2)

In [211]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix

In [212]:
val movieMatrix = new RowMatrix(movieVectors)

In [214]:

val movieMatrixSummary = movieMatrix.computeColumnSummaryStatistics()

In [215]:
val userMatrix = new RowMatrix(userVectors)

In [216]:
val userMatrixSummary = userMatrix.computeColumnSummaryStatistics()

In [218]:
import org.apache.spark.mllib.clustering.KMeans

In [219]:
val numClusters = 5
val numIterations = 10
val numRuns = 3

In [221]:
val movieClusterModel = KMeans.train(movieVectors, numClusters, numIterations, numRuns)

In [222]:
val movieClusterModelConverged = KMeans.train(movieVectors, numClusters, 100)

In [223]:
val userClusterModel = KMeans.train(userVectors, numClusters, numIterations, numRuns)

In [224]:
val movie1 = movieVectors.first

In [225]:
val movieCluster = movieClusterModel.predict(movie1)

In [226]:
println(movieCluster)

2


In [227]:
val predictions = movieClusterModel.predict(movieVectors)

In [229]:
println(predictions.take(10).mkString(","))

2,0,1,2,0,4,1,2,4,4


In [230]:
import breeze.linalg._

In [231]:
import breeze.numerics.pow

In [232]:
def computeDistance(v1: DenseVector[Double], v2: DenseVector[Double]) = pow(v1 - v2, 2).sum

In [233]:
val titlesWithFactors = titlesAndGenres.join(movieFactors)

In [234]:
val moviesAssigned = titlesWithFactors.map { case (id, ((title, genres), vector)) =>
val pred = movieClusterModel.predict(vector)
val clusterCentre = movieClusterModel.clusterCenters(pred)
val dist = computeDistance(DenseVector(clusterCentre.toArray),
DenseVector(vector.toArray))
(id, title, genres.mkString(" "), pred, dist)}

In [235]:
val clusterAssignments = moviesAssigned.groupBy { case (id, title, genres, cluster, dist) => cluster }.collectAsMap

In [236]:
 for ( (k, v) <- clusterAssignments.toSeq.sortBy(_._1)) {
     println(s"Cluster $k:")
     val m = v.toSeq.sortBy(_._5)
     println(m.take(20).map { case (_, title, genres, _, d) =>
     (title, genres, d) }.mkString("\n"))
     println("=====\n")
   }

Cluster 0:
(Angela (1995),Drama,0.2964825418494167)
(Outlaw, The (1943),Western,0.3515174877581781)
(Moonlight and Valentino (1995),Drama Romance,0.3630744748381558)
(Blue Chips (1994),Drama,0.3916148786616742)
(River Wild, The (1994),Action Thriller,0.4053186058085177)
(Outbreak (1995),Action Drama Thriller,0.41848519313578264)
(Mr. Wonderful (1993),Comedy Romance,0.4194677646646254)
(Johns (1996),Drama,0.439075559358961)
(Intimate Relations (1996),Comedy,0.46355817571061037)
(Commandments (1997),Romance,0.4672255558676437)
(Mr. Jones (1993),Drama Romance,0.47394725312880887)
(Prefontaine (1997),Drama,0.5307310846464643)
(Target (1995),Action Drama,0.5502954860406104)
(Touch (1997),Romance,0.562571343768202)
(City of Angels (1998),Romance,0.5631642408060491)
(Wedding Gift, The (1994),Drama,0.5640987670177126)
(Air Up There, The (1994),Comedy,0.5644833209351477)
(Courage Under Fire (1996),Drama War,0.5646097501325259)
(Sword in the Stone, The (1963),Animation Children's,0.5670074431516

In [238]:
val movieCost = movieClusterModel.computeCost(movieVectors)

In [239]:
val userCost = userClusterModel.computeCost(userVectors)

In [240]:
println("WCSS for movies: " + movieCost)

WCSS for movies: 2316.6197817001803


In [241]:
println("WCSS for users: " + userCost)

WCSS for users: 1481.0798402299226


In [245]:
 val trainTestSplitMovies = movieVectors.randomSplit(Array(0.8, 0.2),
   123)
   val trainMovies = trainTestSplitMovies(0)
   val testMovies = trainTestSplitMovies(1)
   val costsMovies = Seq(2, 3, 4, 5, 10, 20).map { k => (k, KMeans.
   train(trainMovies, numIterations, k, numRuns).computeCost(testMovies))
   }
   


In [246]:
costsMovies

List((2,436.9016801785817), (3,434.70775633366065), (4,430.4785556945895), (5,428.15798107816664), (10,425.6960115549409), (20,422.5257448838401))

In [247]:
val trainTestSplitUsers = userVectors.randomSplit(Array(0.6, 0.4),
   123)
   val trainUsers = trainTestSplitUsers(0)
   val testUsers = trainTestSplitUsers(1)
   val costsUsers = Seq(2, 3, 4, 5, 10, 20).map { k => (k,
   KMeans.train(trainUsers, numIterations, k,
   numRuns).computeCost(testUsers)) }

In [248]:
costsUsers

List((2,585.3008957929751), (3,578.8051306355458), (4,584.4271012316947), (5,581.4294362145508), (10,576.5521953743792), (20,576.8725315356476))

In [1]:
val path = "20news-bydate-train/*"
val rdd = sc.wholeTextFiles(path)
val text = rdd.map { case (file, text) => text }
println(text.count)

11314


In [2]:
val newsgroups = rdd.map { case (file, text) => file.split("/").takeRight(2).head}

In [3]:
val countByGroup = newsgroups.map(n => (n, 1)).reduceByKey(_ + _).collect.sortBy(-_._2).mkString("\n")

In [5]:
println(countByGroup)

(rec.sport.hockey,600)
(soc.religion.christian,599)
(rec.motorcycles,598)
(rec.sport.baseball,597)
(sci.crypt,595)
(sci.med,594)
(rec.autos,594)
(sci.space,593)
(comp.windows.x,593)
(sci.electronics,591)
(comp.os.ms-windows.misc,591)
(comp.sys.ibm.pc.hardware,590)
(misc.forsale,585)
(comp.graphics,584)
(comp.sys.mac.hardware,578)
(talk.politics.mideast,564)
(talk.politics.guns,546)
(alt.atheism,480)
(talk.politics.misc,465)
(talk.religion.misc,377)


In [6]:
val text = rdd.map {case (file, text) => text}

In [7]:
val whiteSpaceSplit = text.flatMap(t => t.split(" ").map(_.toLowerCase))

In [8]:
println(whiteSpaceSplit.distinct.count)

402978


In [9]:
println(whiteSpaceSplit.sample(true, 0.3, 42).take(100).mkString(","))

modems
organization:,amazing,know,i,they,were,while,several,but,i,now.,wright

--
,,gene@jackatak.raider.net,john.m.chung@dartmouth.edu,pb's
organization:,the,the,i
was,,if,and
compatability,i,interested,boards,a,opinion,,in,upgrade,and,<jas.93apr16125049@tigger.isi.edu>,jas@isi.edu,(jeff,writes:
>if,writes:
>if,teh,teh,iivx,iivx,does,it,a,new,new,the,iivx,with,->,c650,c650,c650,ordered,lc,lc,any,the,out,stick,them,the,and,you've,got
an,got
an,->,thing.

it,be,be,to,look,for,those,those,unhappy,of,quite,trade-in...

(-brian
,cherkaue@ee.rochester.edu
,lewallen)
subject:,68040,for,:,never,state,have,sell,still,its,a,offer,,will,use,replace,can,can,at,the
demand,the
demand,i


In [10]:
val nonWordSplit = text.flatMap(t =>
   t.split("""\W+""").map(_.toLowerCase))
   println(nonWordSplit.distinct.count)


130126


In [11]:
println(nonWordSplit.distinct.sample(true, 0.3, 42).take(10).mkString(","))

glorifying,valuemask,6611,tervio,afterward,125215,6hy,nls,isgal,donnalyn


In [12]:
val regex = """[^0-9]*""".r
val filterNumbers = nonWordSplit.filter(token => regex.pattern.matcher(token).matches)
println(filterNumbers.distinct.count)

84912


In [13]:
println(filterNumbers.distinct.sample(true, 0.3, 42).take(10).mkString(","))

ratifi,valuemask,fred,fowl,execute,relieves,bluffing,artur,entitlements,hcq


In [14]:
val tokenCounts = filterNumbers.map(t => (t, 1)).reduceByKey(_ + _)

In [16]:
val oreringDesc = Ordering.by[(String, Int), Int](_._2)

In [17]:
println(tokenCounts.top(20)(oreringDesc).mkString("\n"))

(the,146532)
(to,75064)
(of,69034)
(a,64195)
(ax,62406)
(and,57957)
(i,53036)
(in,49402)
(is,43480)
(that,39264)
(it,33638)
(for,28600)
(you,26682)
(from,22670)
(s,22337)
(edu,21321)
(on,20493)
(this,20121)
(be,19285)
(t,18728)


In [18]:
val stopwords = Set(
     "the","a","an","of","or","in","for","by","on","but", "is", "not",
   "with", "as", "was", "if",
     "they", "are", "this", "and", "it", "have", "from", "at", "my",
   "be", "that", "to"
   )
   val tokenCountsFilteredStopwords = tokenCounts.filter { case
   (k, v) => !stopwords.contains(k) }
   println(tokenCountsFilteredStopwords.top(20)(oreringDesc).mkString
   ("\n"))

(ax,62406)
(i,53036)
(you,26682)
(s,22337)
(edu,21321)
(t,18728)
(m,12756)
(subject,12264)
(com,12133)
(lines,11835)
(can,11355)
(organization,11233)
(re,10534)
(what,9861)
(there,9689)
(x,9332)
(all,9310)
(will,9279)
(we,9227)
(one,9008)


In [27]:
import breeze.linalg._

# Scala Data analysis Cookbook

In [28]:
val dense = DenseVector(1, 2, 3, 4, 5)

In [29]:
println(dense)

DenseVector(1, 2, 3, 4, 5)


In [30]:
val sparse = SparseVector(0.0, 1.0, 0.0, 2.0, 0.0)

In [31]:
println(sparse)

SparseVector((0,0.0), (1,1.0), (2,0.0), (3,2.0), (4,0.0))


In [32]:
val denseZeros = DenseVector.zeros[Double](5)

In [33]:
denseZeros

DenseVector(0.0, 0.0, 0.0, 0.0, 0.0)

In [34]:
val sparseZeros = SparseVector.zeros[Double](5)

In [35]:
sparseZeros

SparseVector()

In [36]:
val denseTabulate = DenseVector.tabulate[Double](5)(index=> index*index)

In [37]:
denseTabulate

DenseVector(0.0, 1.0, 4.0, 9.0, 16.0)

In [38]:
val spaceVector = breeze.linalg.linspace(2, 10, 5)

In [39]:
spaceVector

DenseVector(2.0, 4.0, 6.0, 8.0, 10.0)

### Creating a vector with values in a specific range

In [40]:
val allNosTill10 = DenseVector.range(0, 10)

In [41]:
val evenNosTill20 = DenseVector.range(0, 20, 2)

In [42]:
val rangeD = DenseVector.rangeD(0.5, 20, 2.5)

In [43]:
rangeD

DenseVector(0.5, 3.0, 5.5, 8.0, 10.5, 13.0, 15.5)

In [44]:
val denseJust2s = DenseVector.fill(10, 2)

In [46]:
denseJust2s

DenseVector(2, 2, 2, 2, 2, 2, 2, 2, 2, 2)

In [47]:
val allNosTill10 = DenseVector.range(0, 10)

In [48]:
allNosTill10

DenseVector(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

In [49]:
val fourThroughSevenIndexVector = allNosTill10.slice(4, 7)

In [50]:
fourThroughSevenIndexVector

DenseVector(4, 5, 6)

In [51]:
val twoThroughNineSkip2IndexVector = allNosTill10.slice(2, 9, 2)

In [53]:
twoThroughNineSkip2IndexVector

DenseVector(2, 4, 6)

## Scala Data Analysis Cookbook (page 25 or 254)

In [54]:
val spare = SparseVector(0.0, 1.0, 0.0, 2.0, 0.0)

In [55]:
println(sparse)

SparseVector((0,0.0), (1,1.0), (2,0.0), (3,2.0), (4,0.0))


In [56]:
val denseZeros = DenseVector.zeros[Double](5)

In [57]:
val sparseZeros = SparseVector.zeros[Double](5)

In [59]:
val denseTabulate = DenseVector.tabulate[Double](5)(index=> index*index)

In [60]:
denseTabulate

DenseVector(0.0, 1.0, 4.0, 9.0, 16.0)

In [61]:
val spaceVector = breeze.linalg.linspace(2, 10, 5)

In [62]:
val spaceVector = breeze.linalg.linspace(2, 10, 5)

In [63]:
val allNosTill10 = DenseVector.range(0, 10)

In [64]:
val evenNosTill20 = DenseVector.range(0, 20, 2)

In [66]:
val rangeD = DenseVector.rangeD(0.5, 20, 2.5)

In [67]:
val denseJust2s = DenseVector.fill(10, 2)

In [68]:
val allNosTill10 = DenseVector.range(0, 10)

In [70]:
val fourThroughSevenIndexVector = allNosTill10.slice(4, 6)

In [72]:
val twoThroughNineSkip2IndexVector = allNosTill10.slice(2, 9, 2)

In [73]:
val vectFromArray = DenseVector(collection.immutable.Vector(1, 2, 3, 4))

In [74]:
vectFromArray

DenseVector(Vector(1, 2, 3, 4))

#### Scalar operations

In [76]:
val inPlaceValueAddition = evenNosTill20 + 2

In [77]:
evenNosTill20

DenseVector(0, 2, 4, 6, 8, 10, 12, 14, 16, 18)

In [79]:
evenNosTill20 + 2

DenseVector(2, 4, 6, 8, 10, 12, 14, 16, 18, 20)

In [80]:
//Scalar subtraction
val inPlaceValueSubtraction = evenNosTill20 - 2

In [81]:
//Scalar multiplcation
val inPlaceValueMultiplication = evenNosTill20 * 2

In [82]:
//Scalar division
val inPlaceValueDivision = evenNosTill20 / 2

In [84]:
val justFive2s = DenseVector.fill(5, 2)

In [85]:
val zeroThrough4 = DenseVector.range(0, 5, 1)

In [86]:
val dotVector = zeroThrough4.dot(justFive2s)

In [87]:
val evenNosTill20 = DenseVector.range(0, 20, 2)

In [88]:
val denseJust2s = DenseVector.fill(10, 2)

In [89]:
denseJust2s

DenseVector(2, 2, 2, 2, 2, 2, 2, 2, 2, 2)

In [90]:
val additionVector = evenNosTill20 + denseJust2s

In [91]:
additionVector

DenseVector(2, 4, 6, 8, 10, 12, 14, 16, 18, 20)

In [92]:
val fiveLength = DenseVector(1, 2, 3, 4, 5)

In [93]:
fiveLength

DenseVector(1, 2, 3, 4, 5)

In [94]:
val tenLength = DenseVector.fill(10, 20)

In [95]:
fiveLength + tenLength

DenseVector(21, 22, 23, 24, 25)

In [96]:
tenLength + fiveLength

Name: java.lang.ArrayIndexOutOfBoundsException
Message: 5
StackTrace: breeze.linalg.operators.DenseVectorOps$$anon$142.apply(DenseVectorOps.scala:143)
breeze.linalg.operators.DenseVectorOps$$anon$142.apply(DenseVectorOps.scala:132)
breeze.linalg.NumericOps$class.$plus(NumericOps.scala:167)
breeze.linalg.DenseVector.$plus(DenseVector.scala:50)
$line131.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:29)
$line131.$read$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:34)
$line131.$read$$iwC$$iwC$$iwC$$iwC.<init>(<console>:36)
$line131.$read$$iwC$$iwC$$iwC.<init>(<console>:38)
$line131.$read$$iwC$$iwC.<init>(<console>:40)
$line131.$read$$iwC.<init>(<console>:42)
$line131.$read.<init>(<console>:44)
$line131.$read$.<init>(<console>:48)
$line131.$read$.<clinit>(<console>)
$line131.$eval$.<init>(<console>:7)
$line131.$eval$.<clinit>(<console>)
$line131.$eval.$print(<console>)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAcce

#### Binding 2 vectors

In [97]:
val justFive2s = DenseVector.fill(5, 2)

In [98]:
val zeroThrough4 = DenseVector.range(0, 5, 1)

In [99]:
val concatVector = DenseVector.vertcat(zeroThrough4, justFive2s)

In [100]:
concatVector

DenseVector(0, 1, 2, 3, 4, 2, 2, 2, 2, 2)

In [101]:
val evenNosTill20Double = breeze.linalg.convert(evenNosTill20, Double)

In [102]:
evenNosTill20Double

DenseVector(0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0)

In [110]:
import breeze.linalg._
import breeze.numerics._
import breeze.stats._
meanAndVariance(evenNosTill20Double)

MeanAndVariance(9.0,36.666666666666664,10)

In [111]:
stddev(evenNosTill20Double)

6.0553007081949835

In [112]:
val intMaxOfVectorVals = max(evenNosTill20)

In [113]:
intMaxOfVectorVals

18

In [114]:
val intSumOfVectorVals = sum(evenNosTill20)

In [115]:
intSumOfVectorVals

90

In [116]:
val sqrtOfVectorVals = sqrt(evenNosTill20)

In [117]:
sqrtOfVectorVals

DenseVector(0.0, 1.4142135623730951, 2.0, 2.449489742783178, 2.8284271247461903, 3.1622776601683795, 3.4641016151377544, 3.7416573867739413, 4.0, 4.242640687119285)

In [118]:
val log2VectorVals = log(evenNosTill20)

In [119]:
log2VectorVals

DenseVector(-Infinity, 0.6931471805599453, 1.3862943611198906, 1.791759469228055, 2.0794415416798357, 2.302585092994046, 2.4849066497880004, 2.6390573296152584, 2.772588722239781, 2.8903717578961645)

In [120]:
val simpleMatrix = DenseMatrix((1, 2, 3), (11, 12, 13), (21, 22, 23))

In [121]:
simpleMatrix

1   2   3   
11  12  13  
21  22  23

In [122]:
val sparseMatrix = CSCMatrix((1, 0, 0), (11, 0, 0), (0, 0, 23))

In [124]:
sparseMatrix

3 x 3 CSCMatrix
(0,0) 1
(1,0) 11
(2,2) 23

In [125]:
val denseZeros = DenseMatrix.zeros[Double](5, 4)

In [127]:
denseZeros

0.0  0.0  0.0  0.0  
0.0  0.0  0.0  0.0  
0.0  0.0  0.0  0.0  
0.0  0.0  0.0  0.0  
0.0  0.0  0.0  0.0

In [128]:
val compressedSparseMatrix = CSCMatrix.zeros[Double](5, 4)

In [129]:
compressedSparseMatrix

5 x 4 CSCMatrix

In [130]:
val denseTabulate = DenseMatrix.tabulate[Double](5, 4)((firstIdx, secondIdx) => firstIdx*secondIdx)

In [131]:
denseTabulate

0.0  0.0  0.0  0.0   
0.0  1.0  2.0  3.0   
0.0  2.0  4.0  6.0   
0.0  3.0  6.0  9.0   
0.0  4.0  8.0  12.0

In [132]:
val denseTabulate = DenseMatrix.tabulate(5,4)((firstIdx, secondIdx) => firstIdx*secondIdx)

In [133]:
denseTabulate

0  0  0  0   
0  1  2  3   
0  2  4  6   
0  3  6  9   
0  4  8  12

In [136]:
val identityMatrix = DenseMatrix.eye[Int](3)

In [137]:
identityMatrix

1  0  0  
0  1  0  
0  0  1

### Creating a matrix from random numbers

In [138]:
val randomMatrix = DenseMatrix.rand(4, 4)

In [139]:
randomMatrix

0.13501313258836567  0.5514072929798093   0.8423752700734157   0.28572302572419606  
0.7021385786409526   0.43342650782634506  0.19461402867562838  0.8304155183273774   
0.4430021859699995   0.9944828099441547   0.2656838486036104   0.2956269229443813   
0.22286685297725106  0.18029565999949337  0.44423129462028443  0.24429591493357217

In [140]:
val vectFromArray = new DenseMatrix(2, 2, Array(2, 3, 4, 5))

In [141]:
vectFromArray

2  4  
3  5

In [143]:
val vectFromArray = new DenseMatrix(2, 2, Array(2, 3, 4, 5, 6, 7))

In [145]:
vectFromArray

2  4  
3  5

In [147]:
val VectoFromArrayIobe = new DenseMatrix(2, 2, Array(2, 3, 4))

Name: java.lang.ArrayIndexOutOfBoundsException
Message: 3
StackTrace: breeze.linalg.DenseMatrix$mcI$sp.apply$mcI$sp(DenseMatrix.scala:87)
breeze.linalg.DenseMatrix$mcI$sp.apply(DenseMatrix.scala:82)
breeze.linalg.DenseMatrix$mcI$sp.apply(DenseMatrix.scala:53)
breeze.linalg.Matrix$$anonfun$colWidth$1$1.apply$mcII$sp(Matrix.scala:71)
breeze.linalg.Matrix$$anonfun$colWidth$1$1.apply(Matrix.scala:71)
breeze.linalg.Matrix$$anonfun$colWidth$1$1.apply(Matrix.scala:71)
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
scala.collection.immutable.Range.foreach(Range.scala:141)
scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
scala.collection.AbstractTraversable.map(Traversable.scala:105)
breeze.linalg.Matrix$class.colWidth$1(Matrix.scala:71)
breeze.linalg.Matrix$class.toString(Matrix.scala:76)
breeze.linalg.DenseMatrix.toString(DenseMatrix.scala:53)
breeze.linalg.M

### Matrix arithmetic

In [149]:
val simpleMatrix = DenseMatrix((1, 2, 3), (11, 12, 13), (21, 22, 23))

In [150]:
simpleMatrix

1   2   3   
11  12  13  
21  22  23

In [151]:
val identityMatrix = DenseMatrix.eye[Int](3)

In [152]:
identityMatrix

1  0  0  
0  1  0  
0  0  1

In [154]:
val additionMatrix = identityMatrix + simpleMatrix

In [155]:
additionMatrix

2   2   3   
11  13  13  
21  22  24

In [156]:
val simpleTimesIdentity = simpleMatrix * identityMatrix

In [157]:
simpleTimesIdentity

1   2   3   
11  12  13  
21  22  23

In [160]:
val elementWiseMulti = identityMatrix :* simpleMatrix

In [161]:
elementWiseMulti

1  0   0   
0  12  0   
0  0   23

### Appending and conversion 
#### Concatenating matrix - vertically

In [162]:
val vertConcatMatrix = DenseMatrix.vertcat(identityMatrix, simpleMatrix)

In [163]:
vertConcatMatrix

1   0   0   
0   1   0   
0   0   1   
1   2   3   
11  12  13  
21  22  23

### Concetenating matrices - horizontally

In [164]:
val horzConcatMatrix = DenseMatrix.horzcat(identityMatrix, simpleMatrix)

In [165]:
horzConcatMatrix

1  0  0  1   2   3   
0  1  0  11  12  13  
0  0  1  21  22  23

In [166]:
3 * 3

9

In [167]:
import breeze.linalg.convert

In [168]:
val simpleMatrixAsDouble = convert(simpleMatrix, Double)

In [169]:
simpleMatrixAsDouble

1.0   2.0   3.0   
11.0  12.0  13.0  
21.0  22.0  23.0

In [171]:
val simpleMatrix=DenseMatrix((4.0,7.0),(3.0,-5.0))

In [172]:
simpleMatrix

4.0  7.0   
3.0  -5.0

In [173]:
val firstVector = simpleMatrix(::, 0)

In [174]:
val secondVector = simpleMatrix(::, 1)

In [175]:
val firstVectorByCols = simpleMatrix(0 to 1, 0)

In [176]:
firstVectorByCols

DenseVector(4.0, 3.0)

In [177]:
val errorTryingToSelect3ColumnsOn2By2Matrix = simpleMatrix(0, 0 to 2)

Name: java.lang.ArrayIndexOutOfBoundsException
Message: 4
StackTrace: scala.runtime.ScalaRunTime$.array_apply(ScalaRunTime.scala:74)
breeze.linalg.DenseVector.apply$mcI$sp(DenseVector.scala:73)
breeze.linalg.DenseVector.apply(DenseVector.scala:70)
breeze.linalg.DenseVector.apply(DenseVector.scala:50)
breeze.linalg.Vector$$anonfun$valuesIterator$1.apply(Vector.scala:62)
breeze.linalg.Vector$$anonfun$valuesIterator$1.apply(Vector.scala:62)
scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
scala.collection.Iterator$class.foreach(Iterator.scala:727)
scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
scala.collection.TraversableOnce$class.addString(TraversableOnce.scala:320)
scala.collection.AbstractIterator.addString(Iterator.scala:1157)
scala.collection.TraversableOnce$class.mkString(TraversableOnce.scala:286)
scala.collection.AbstractIterator.mkString(Iterator.scala:1157)
breeze.linalg.DenseVector.toString(DenseVector.scala:107)
java.lang.String.valueOf(String.java:

In [178]:
val firstRowStatingCols = simpleMatrix(0, 0 to 1)

In [179]:
val firstRowAllCols = simpleMatrix(0, ::)

In [180]:
val secondrow = simpleMatrix(1, ::)

In [181]:
val firstRowFirstCol = simpleMatrix(0, 0)

In [182]:
val simpleMatrix = DenseMatrix((4.0, 7.0), (3.0, -5.0))

In [183]:
val transpose = simpleMatrix.t

In [185]:
transpose

4.0  3.0   
7.0  -5.0

In [186]:
val inverse = inv(simpleMatrix)

In [187]:
inverse

0.12195121951219512  0.17073170731707318  
0.07317073170731708  -0.0975609756097561

In [188]:
simpleMatrix * inverse

1.0                     0.0  
-5.551115123125783E-17  1.0

In [189]:
import breeze.linalg._

In [190]:
import breeze.numerics._

In [None]:
import breeze.stats._