In [5]:
from pyspark.sql import SparkSession
import os 

spark = SparkSession.\
        builder.\
        appName("AuthorshipIdentificationWithPyspark").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "1g").\
        getOrCreate()

sc = spark.sparkContext

In [6]:
sc.setLogLevel("WARN")

##### Lines for each book

In [7]:
BASH_PATH = './data/'

folders = [i for i in os.listdir(BASH_PATH) if not i.startswith('.') ] 
authors, texts, books = [], [], []
for folder in folders :
    files = [f for f in os.listdir(BASH_PATH+folder) if not f.startswith(".")]
    for file in files:
        with open(BASH_PATH+folder+"/"+file, 'r') as io_file :
            lines = io_file.readlines()
            print(file, len(lines))

lafille2.txt 645
elixir2.txt 203
chef2.txt 334
chabert3.txt 883
uncoeur3.txt 503
bovary3.txt 6047
bouvard2.txt 6569
educati1.txt 10149
salammb1.txt 4109


In [8]:
# install dependencies
# !pip install pandas 
# !pip install numpy
# !pip install scikit-learn

In [9]:
import os 
import pandas as pd 
import numpy as np
import re

In [10]:
# Create Dataset
def create_dataset():
    # base path for data 
    BASH_PATH = './data/'

    folders = [i for i in os.listdir(BASH_PATH) if not i.startswith('.') ] 
    authors, texts, books = [], [], []
    for folder in folders :
        files = [f for f in os.listdir(BASH_PATH+folder) if not f.startswith(".")]
        for file in files:
            with open(BASH_PATH+folder+"/"+file, 'r') as io_file :
                lines = io_file.readlines()
                i=0
                while i<len(lines) :
                    if i+50> len(lines):
                        text = lines[i:]
                        break
                    text = "\n".join(lines[i: i+50])
                    i+=50
                    text = re.sub('\W+|_', ' ', text)
                    texts.append(text)
                    books.append(file.split(".")[0])
                    authors.append(folder)
                      
                
    pd.DataFrame({"text":texts, "author":authors, "book":books}).to_csv("data.csv", index=False)
                        
# create dataset
create_dataset()


In [11]:
df = spark.read\
    .format("csv")\
    .option("header", "true")\
    .load("data.csv")

df.show(n=5)

                                                                                

+--------------------+------+--------+
|                text|author|    book|
+--------------------+------+--------+
|CHAPITRE I PHYSIO...|Balzec|lafille2|
| Je m étonne mon ...|Balzec|lafille2|
| Ah dit le facteu...|Balzec|lafille2|
| Drôle répondit H...|Balzec|lafille2|
| Pantoufle il y e...|Balzec|lafille2|
+--------------------+------+--------+
only showing top 5 rows



In [12]:
df.columns

['text', 'author', 'book']

In [13]:
df = df.select("text", "author")

In [14]:
df.show(n=3)

+--------------------+------+
|                text|author|
+--------------------+------+
|CHAPITRE I PHYSIO...|Balzec|
| Je m étonne mon ...|Balzec|
| Ah dit le facteu...|Balzec|
+--------------------+------+
only showing top 3 rows



In [15]:
df.groupBy("author").count().show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------+-----+
|  author|count|
+--------+-----+
|  Balzec|   39|
|Flaubert|  545|
+--------+-----+



                                                                                

In [16]:
import pyspark.ml.feature

In [17]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, IDF, CountVectorizer
from pyspark.ml.feature import StringIndexer 

In [18]:
tokenizer = Tokenizer(inputCol="text", outputCol="mytokens")
stopWordsRemover = StopWordsRemover(inputCol="mytokens", outputCol="filtered_tokens")
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="vectorizedFeatures")

In [19]:
labelEncoder = StringIndexer(inputCol="author", outputCol="label").fit(df)

                                                                                

In [20]:
df = labelEncoder.transform(df)

In [21]:
(trainDF, testDF) = df.randomSplit((0.6, 0.3), seed=42)

In [22]:
trainDF.show() 

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+--------+-----+
|                text|  author|label|
+--------------------+--------+-----+
| A bas les assomm...|Flaubert|  0.0|
| A côté s écria m...|Flaubert|  0.0|
| Adieu Elle ne lu...|Flaubert|  0.0|
| Ah ah dit don Ju...|  Balzec|  1.0|
| Ah c est trop fo...|Flaubert|  0.0|
| Ah dit le facteu...|  Balzec|  1.0|
| Ah la Révolution...|Flaubert|  0.0|
| Ah qu un négocia...|Flaubert|  0.0|
| Ah vous trouvez ...|Flaubert|  0.0|
| Ainsi je puis di...|Flaubert|  0.0|
| Allons bon dit H...|Flaubert|  0.0|
| Allons une prise...|Flaubert|  0.0|
| Alors je vais fa...|  Balzec|  1.0|
| Alors tu comptes...|Flaubert|  0.0|
| Approchez vous d...|Flaubert|  0.0|
| Au bout de dix m...|Flaubert|  0.0|
| Au moment de par...|Flaubert|  0.0|
| Aux vôtres C éta...|Flaubert|  0.0|
| Bah l avenir est...|Flaubert|  0.0|
| Barca nous venon...|Flaubert|  0.0|
+--------------------+--------+-----+
only showing top 20 rows



                                                                                

In [23]:
testDF.show()

+--------------------+--------+-----+
|                text|  author|label|
+--------------------+--------+-----+
| Acide de sucre f...|Flaubert|  0.0|
| Adieu dit Frédér...|Flaubert|  0.0|
| Ah bah il en a d...|Flaubert|  0.0|
| Ah c est vrai Et...|Flaubert|  0.0|
| Ah c est vrai re...|Flaubert|  0.0|
| Ah je n ai jamai...|Flaubert|  0.0|
| Ah les affiches ...|Flaubert|  0.0|
| Ah pardon monsie...|Flaubert|  0.0|
| Ah pas encore re...|Flaubert|  0.0|
| Ajoute que les v...|  Balzec|  1.0|
| Allons mon bon a...|Flaubert|  0.0|
| Allons à son ate...|  Balzec|  1.0|
| Allévy transform...|Flaubert|  0.0|
| Après trois heur...|Flaubert|  0.0|
| Arrêtez s écria ...|Flaubert|  0.0|
| Asseyez vous dit...|Flaubert|  0.0|
| Au bout de la se...|Flaubert|  0.0|
| Autre sujet d ét...|Flaubert|  0.0|
| Aux approches du...|Flaubert|  0.0|
| Bien bien monsie...|Flaubert|  0.0|
+--------------------+--------+-----+
only showing top 20 rows



### Building Logistic Regression Pipeline 
* tokenizer
* stop words removing
* vectorize words 
* IF-IDF

In [24]:
from pyspark.ml.classification import LogisticRegression

In [25]:
lr = LogisticRegression(featuresCol="vectorizedFeatures", labelCol="label")

In [30]:
from pyspark.ml import Pipeline

In [31]:
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, vectorizer, idf, lr])

In [32]:
pipeline

Pipeline_09c79452f06a

In [33]:
pipeline.stages

Param(parent='Pipeline_09c79452f06a', name='stages', doc='a list of pipeline stages')

In [34]:
lr_model = pipeline.fit(trainDF)

[Stage 17:>                                                         (0 + 1) / 1]

23/04/28 22:34:36 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/28 22:34:36 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/04/28 22:34:36 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/04/28 22:34:36 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

In [36]:
lr_model

PipelineModel_56fbb19f9550

### Evaluate Model

In [37]:
predictions = lr_model.transform(testDF)

In [38]:
predictions.columns

['text',
 'author',
 'label',
 'mytokens',
 'filtered_tokens',
 'rawFeatures',
 'vectorizedFeatures',
 'rawPrediction',
 'probability',
 'prediction']

In [39]:
predictions.select("text", "author", "label","probability", "prediction").show()

23/04/28 22:34:47 WARN DAGScheduler: Broadcasting large task binary with size 1024.8 KiB
+--------------------+--------+-----+--------------------+----------+
|                text|  author|label|         probability|prediction|
+--------------------+--------+-----+--------------------+----------+
| Acide de sucre f...|Flaubert|  0.0|[0.99999999926659...|       0.0|
| Adieu dit Frédér...|Flaubert|  0.0|[0.99999999999246...|       0.0|
| Ah bah il en a d...|Flaubert|  0.0|[2.30360532082317...|       1.0|
| Ah c est vrai Et...|Flaubert|  0.0|[0.99999999998176...|       0.0|
| Ah c est vrai re...|Flaubert|  0.0|[0.99999999995752...|       0.0|
| Ah je n ai jamai...|Flaubert|  0.0|[0.99999986978351...|       0.0|
| Ah les affiches ...|Flaubert|  0.0|[0.99999999032948...|       0.0|
| Ah pardon monsie...|Flaubert|  0.0|[0.99999787857289...|       0.0|
| Ah pas encore re...|Flaubert|  0.0|[0.99999999916960...|       0.0|
| Ajoute que les v...|  Balzec|  1.0|[9.10402763909350...|       1.0|
|

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [41]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [42]:
accuracy = evaluator.evaluate(predictions)

23/04/28 22:34:52 WARN DAGScheduler: Broadcasting large task binary with size 1032.6 KiB


                                                                                

In [43]:
accuracy

0.9763313609467456

In [44]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [45]:
lr_metric = MulticlassMetrics(predictions["label", "prediction"].rdd)



23/04/28 22:35:00 WARN DAGScheduler: Broadcasting large task binary with size 1021.1 KiB


                                                                                

In [46]:
# accuracy
print("Accuracy: ", lr_metric.accuracy)
# precision
print("Precision: ", lr_metric.precision(1.0))
# recall
print("Recall: ", lr_metric.recall(1.0))
# f1socre
print("F1Score: ", lr_metric.fMeasure(1.0))

23/04/28 22:35:02 WARN DAGScheduler: Broadcasting large task binary with size 1032.4 KiB
Accuracy:  0.9763313609467456
Precision:  1.0
Recall:  0.6666666666666666
F1Score:  0.8


                                                                                

In [47]:
y_true = predictions.select("label")
y_true = y_true.toPandas()
y_pred = predictions.select("prediction")
y_pred = y_pred.toPandas()

23/04/28 22:35:06 WARN DAGScheduler: Broadcasting large task binary with size 1016.5 KiB


In [48]:
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_true, y_pred)

In [50]:
cm

array([[157,   4],
       [  0,   8]])

### Create Pipeline For Naive Bayes Model 

In [52]:
from pyspark.ml.classification import 

In [53]:
nb = NaiveBayes(featuresCol="vectorizedFeatures", labelCol="label")

In [56]:
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, vectorizer, idf, nb])

In [57]:
nb_model = pipeline.fit(trainDF)

                                                                                

In [63]:
predictions = nb_model.transform(testDF)

In [64]:
predictions.select("text", "author", "label","probability", "prediction").show()

23/04/28 22:40:41 WARN DAGScheduler: Broadcasting large task binary with size 1230.4 KiB
+--------------------+--------+-----+--------------------+----------+
|                text|  author|label|         probability|prediction|
+--------------------+--------+-----+--------------------+----------+
| Acide de sucre f...|Flaubert|  0.0|           [1.0,0.0]|       0.0|
| Adieu dit Frédér...|Flaubert|  0.0|[1.0,8.0397233671...|       0.0|
| Ah bah il en a d...|Flaubert|  0.0|[1.0,4.4884203975...|       0.0|
| Ah c est vrai Et...|Flaubert|  0.0|[1.0,2.7080395496...|       0.0|
| Ah c est vrai re...|Flaubert|  0.0|[1.0,5.7633008111...|       0.0|
| Ah je n ai jamai...|Flaubert|  0.0|[1.0,1.8936641263...|       0.0|
| Ah les affiches ...|Flaubert|  0.0|[1.0,3.2910625617...|       0.0|
| Ah pardon monsie...|Flaubert|  0.0|[1.0,1.2074475252...|       0.0|
| Ah pas encore re...|Flaubert|  0.0|[1.0,3.6320289396...|       0.0|
| Ajoute que les v...|  Balzec|  1.0|           [0.0,1.0]|       1.0|
|

In [65]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [66]:
accuracy = evaluator.evaluate(predictions)

23/04/28 22:40:48 WARN DAGScheduler: Broadcasting large task binary with size 1238.2 KiB


                                                                                

In [67]:
accuracy

1.0