In [27]:
from pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("AuthorshipIdentificationWithPyspark").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "1g").\
        getOrCreate()

sc = spark.sparkContext

In [82]:
sc.setLogLevel("WARN")

In [80]:
import os 
import pandas as pd 
import numpy as np
import re

# Create Dataset
def create_dataset():
    

    # base path for data 
    BASH_PATH = './data/'

    folders = [i for i in os.listdir(BASH_PATH) if not i.startswith('.') ] 
    authors, texts, books = [], [], []
    for folder in folders :
        files = os.listdir(BASH_PATH+folder)
        for file in files:
            with open(BASH_PATH+folder+"/"+file, 'r') as io_file :
                text = io_file.read()
                text = re.sub('\W+|_', ' ', text)
                texts.append(f'{text}')
                books.append(file.split(".")[0])
                authors.append(folder)
    
    pd.DataFrame({"text":texts, "author":authors, "book":books}).to_csv("data.csv", index=False)
                        
# create dataset
create_dataset()


In [83]:
df = spark.read\
    .format("csv")\
    .option("header", "true")\
    .load("data.csv")
#     .option("escapeQuotes", "true")\

df.show(n=5)

+--------------------+--------+--------+
|                text|  author|    book|
+--------------------+--------+--------+
|CHAPITRE I PHYSIO...|  Balzec|lafille2|
|Dans un somptueux...|  Balzec| elixir2|
|A UN LORD 1845 I ...|  Balzec|   chef2|
|A MADAME LA COMTE...|  Balzec|chabert3|
| I Pendant un dem...|Flaubert|uncoeur3|
+--------------------+--------+--------+
only showing top 5 rows



In [85]:
df.select("author").show(n=3)

+------+
|author|
+------+
|Balzec|
|Balzec|
|Balzec|
+------+
only showing top 3 rows



In [87]:
df.columns

['text', 'author', 'book']

In [88]:
df = df.select("text", "author")

In [90]:
df.show(n=3)

+--------------------+------+
|                text|author|
+--------------------+------+
|CHAPITRE I PHYSIO...|Balzec|
|Dans un somptueux...|Balzec|
|A UN LORD 1845 I ...|Balzec|
+--------------------+------+
only showing top 3 rows



In [91]:
df.groupBy("author").count().show()

[Stage 43:>                                                         (0 + 1) / 1]

+--------+-----+
|  author|count|
+--------+-----+
|  Balzec|    4|
|Flaubert|    5|
+--------+-----+



                                                                                

In [38]:
import pyspark.ml.feature

In [96]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, IDF, CountVectorizer
from pyspark.ml.feature import StringIndexer 

In [131]:
tokenizer = Tokenizer(inputCol="text", outputCol="mytokens")
stopWordsRemover = StopWordsRemover(inputCol="mytokens", outputCol="filtered_tokens")
vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="vectorizedFeatures")

In [100]:
labelEncoder = StringIndexer(inputCol="author", outputCol="label").fit(df)

In [140]:
df = labelEncoder.transform(df)

In [142]:
(trainDF, testDF) = df.randomSplit((0.6, 0.3), seed=42)

In [143]:
trainDF.show() 

+--------------------+--------+-----+
|                text|  author|label|
+--------------------+--------+-----+
| I Pendant un dem...|Flaubert|  0.0|
|A MADAME LA COMTE...|  Balzec|  1.0|
|CHAPITRE I Comme ...|Flaubert|  0.0|
|Dans un somptueux...|  Balzec|  1.0|
|Gustave Flaubert ...|Flaubert|  0.0|
+--------------------+--------+-----+



In [145]:
testDF.show()

+--------------------+--------+-----+
|                text|  author|label|
+--------------------+--------+-----+
|A UN LORD 1845 I ...|  Balzec|  1.0|
|CHAPITRE I PHYSIO...|  Balzec|  1.0|
|Flaubert L Educat...|Flaubert|  0.0|
|MADAME BOVARY GUS...|Flaubert|  0.0|
+--------------------+--------+-----+



In [146]:
from pyspark.ml.classification import LogisticRegression

In [147]:
lr = LogisticRegression(featuresCol="vectorizedFeatures", labelCol="label")

### Building Pipeline 

In [148]:
from pyspark.ml import Pipeline

In [149]:
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, vectorizer, idf, lr])

In [150]:
pipeline

Pipeline_76305083ee1c

In [151]:
pipeline.stages

Param(parent='Pipeline_76305083ee1c', name='stages', doc='a list of pipeline stages')

In [161]:
lr_model = pipeline.fit(trainDF)

In [162]:
lr_model

PipelineModel_3c1d1ce201ee

In [163]:
predictions = lr_model.transform(testDF) 

In [164]:
predictions.columns

['text',
 'author',
 'label',
 'mytokens',
 'filtered_tokens',
 'rawFeatures',
 'vectorizedFeatures',
 'rawPrediction',
 'probability',
 'prediction']

In [165]:
predictions.select("text", "author", "probability", "prediction").show()

+--------------------+--------+--------------------+----------+
|                text|  author|         probability|prediction|
+--------------------+--------+--------------------+----------+
|A UN LORD 1845 I ...|  Balzec|[2.24095997696129...|       1.0|
|CHAPITRE I PHYSIO...|  Balzec|[2.66802909851786...|       1.0|
|Flaubert L Educat...|Flaubert|[0.99999997701150...|       0.0|
|MADAME BOVARY GUS...|Flaubert|[0.99999993819271...|       0.0|
+--------------------+--------+--------------------+----------+

