In [40]:
#spark MLlib
#spark sql

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.ml import feature 
from pyspark.ml import classification

spark = SparkSession\
    .builder\
    .appName("example-spark")\
    .config("spark.sql.crossJoin.enabled","true")\
    .getOrCreate()

In [2]:
heroes = spark.createDataFrame([("Luke", 53, True), ("Leia", 53, False), ("Han", 64, False),("Rei", 19,True)],schema=["name","age","jedi"])

In [3]:
heroes.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- jedi: boolean (nullable = true)



In [4]:
heroes.show()

+----+---+-----+
|name|age| jedi|
+----+---+-----+
|Luke| 53| true|
|Leia| 53|false|
| Han| 64|false|
| Rei| 19| true|
+----+---+-----+



In [5]:
heroes.filter(heroes.jedi == True).show()

+----+---+----+
|name|age|jedi|
+----+---+----+
|Luke| 53|true|
| Rei| 19|true|
+----+---+----+



In [6]:
heroes.filter(heroes.jedi == False).filter(heroes.age>60).show()

+----+---+-----+
|name|age| jedi|
+----+---+-----+
| Han| 64|false|
+----+---+-----+



In [7]:
heroes.select("name", "age").orderBy("age").show()

+----+---+
|name|age|
+----+---+
| Rei| 19|
|Leia| 53|
|Luke| 53|
| Han| 64|
+----+---+



In [8]:
heroes.select("name", "age").orderBy("age").limit(3).show()

+----+---+
|name|age|
+----+---+
| Rei| 19|
|Luke| 53|
|Leia| 53|
+----+---+



In [9]:
#creation vu pour requete sql
heroes.createTempView("heroes_view")
spark.sql("SELECT * FROM heroes_view ORDER BY name LIMIT 2").show()

+----+---+-----+
|name|age| jedi|
+----+---+-----+
| Han| 64|false|
|Leia| 53|false|
+----+---+-----+



In [10]:
#Passage DF => RDD
heroes.rdd

MapPartitionsRDD[42] at javaToPython at NativeMethodAccessorImpl.java:0

In [11]:
row = heroes.rdd.first()
print( row.name, row.age, row.jedi)

Luke 53 True


In [12]:
Row(name="Jyn", jedi=False)

Row(jedi=False, name='Jyn')

In [11]:
####avec l'iliad chargement des mots
sc = spark.sparkContext
iliad_rdd = sc.textFile('..\data\iliad.mb.txt').flatMap(lambda line: line.split()).map(lambda word: word.strip(',.;:?!-"'))
#creation des rows 
iliad_rows = iliad_rdd.map(lambda word: Row(word=word))
iliad_rows
## RDD => DF
iliad = spark.createDataFrame(iliad_rows)


In [12]:
iliad.orderBy("word", ascending=False).distinct().show(10)

+----------+
|      word|
+----------+
|      zeal|
|    youths|
|     youth|
| youselves|
|yourselves|
|  yourself|
|     yours|
|   yourelf|
|      your|
|  youngest|
+----------+
only showing top 10 rows



In [23]:
####Iliad odyssee
iliad = sc.textFile('..\data\iliad.mb.txt').\
    map(lambda line: line.split()).\
    map(lambda words: [w.strip(',.;:?!-"') for w in words])

In [24]:
for line in iliad.takeSample(False, 10):
    print(line)

['Thus', 'did', 'he', 'speak', 'and', 'his', 'brother', 'was', 'persuaded', 'by', 'him', 'for', 'his', 'words']
['clouds', 'of', 'dust', 'they', 'raised', 'and', 'the', 'horses', 'strained', 'every', 'nerve', 'in']
['Achilles', 'answered', 'Ulysses', 'noble', 'son', 'of', 'Laertes', 'I', 'should', 'give', 'you']
['On', 'this', 'the', 'mighty', 'monster', 'hobbled', 'off', 'from', 'his', 'anvil', 'his', 'thin', 'legs']
['he', 'would', 'give', 'me', 'the', 'horses', 'of', 'the', 'noble', 'son', 'of', 'Peleus', 'and', 'his', 'bronze-bedizened']
['of', 'Achilles', 'under', 'the', 'walls', 'of', 'Troy', 'Therefore', 'be', 'staunch', 'and', 'urge']
['and', 'as', 'many', 'flocks', 'of', 'sheep', 'fifty', 'droves', 'also', 'of', 'pigs', 'and', 'as', 'many']
['Thus', 'did', 'he', 'stand', 'and', 'ponder', 'but', 'Achilles', 'came', 'up', 'to', 'him', 'as', 'it', 'were']
['your', 'neck', 'nor', 'back', 'but', 'the', 'weapon', 'would', 'hit', 'you', 'in', 'the', 'chest', 'or', 'belly']
['Thus', '

In [25]:
odyssey = sc.textFile('..\data\odyssey.mb.txt').\
    map(lambda line: line.split()).\
    map(lambda words: [w.strip(',.;:?!-"') for w in words])

In [26]:
for line in iliad.takeSample(False, 10):
    print(line)

['could', 'neither', 'spring', 'forward', 'to', 'recover', 'his', 'own', 'weapon', 'nor', 'swerve']
['With', 'these', 'words', 'he', 'moved', 'the', 'heart', 'of', 'Aeneas', 'and', 'he', 'went', 'in', 'pursuit']
['others', 'shall', 'not', 'be', 'of', 'a', 'mind', 'with', 'you']
['Antilochus', 'was', 'more', 'angry', 'than', 'any', 'one', 'but', 'grief', 'did', 'not', 'make', 'him']
['still', 'bid', 'both', 'come', 'for', 'it', 'will', 'be', 'all', 'over', 'with', 'us', 'here', 'directly']
['of', 'Polydamas', 'but', 'Asius', 'son', 'of', 'Hyrtacus', 'would', 'not', 'leave', 'his', 'horses']
['smoke', 'Then', 'he', 'stood', 'up', 'and', 'spoke', 'among', 'the', 'Argives', 'saying']
['the', 'other', 'gods', 'for', 'they', 'were', 'of', 'divided', 'counsels', 'They', 'fell', 'on', 'one']
['of', 'Hades']
['sacrifices', 'of', 'bulls', 'and', 'rams', 'These', 'were', 'commanded', 'by', 'Menestheus']


In [27]:
iliad = iliad.map(lambda words: Row(label=0,words=words))
odyssey = odyssey.map(lambda words: Row(label=1,words=words))

In [29]:
df = spark.createDataFrame(iliad.union(odyssey))

In [38]:
#mot => floatant // sac de mots
vectorizer = feature.CountVectorizer(inputCol="words", outputCol="bag_of_words").fit(df)
features = vectorizer.transform(df)
features.printSchema()

root
 |-- label: long (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- bag_of_words: vector (nullable = true)



In [39]:
#training /test set
train, test = features.randomSplit([0.75, 0.25])

In [42]:
#apprentissage bayesien naif
classifier = classification.NaiveBayes(labelCol="label", featuresCol="bag_of_words", predictionCol="label_predicted").fit(train)

In [43]:
#application
predicted = classifier.transform(test)

In [45]:
#precision
accuracy = predicted.filter(predicted.label_predicted == predicted.label).count()/ float(predicted.count())
print(accuracy)

0.783252871649742
