
#  <font color='blue'> Text Preprocessing and Subject Extraction Using Pyspark </font>

# I hope you find this kernel useful
# Your <font color='red'> UPVOTES </font> would be highly appreciated

In [None]:
!pip install pyspark

In [None]:
import os
import json
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd
import os
import nltk
import re
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
import string
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id 


# <font color='red'> Create Spark Session </font>

In [None]:
# Reuse an already-running session if there is one, otherwise start a new one
# registered under the application name below.
sparkSession = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .getOrCreate()
)


<font color='black'> Load CSV to pandas Dataframe </font>

In [None]:
# Load the raw reviews CSV into a pandas DataFrame (path is relative to the
# notebook's working directory).
df = pd.read_csv('../input/insurance-reviews-france/Comments.csv')


<font color='black'> Drop  Unnamed: 0 column </font>

In [None]:
# Remove the 'Unnamed: 0' column. errors='ignore' keeps this cell re-runnable:
# without it, running the cell a second time raises KeyError because the
# column has already been dropped from `df`.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')



<font color='black'> Convert pandas DataFrame To Pyspark DataFrame </font>

In [None]:
# Explicit Spark schema used when converting the pandas DataFrame.
# Every field is declared nullable (third argument True).
# NOTE(review): the last field is named "year" (lowercase) and typed as a
# string, while "Month" is an integer and later cells refer to "Year" —
# confirm both the casing and the type are intended.
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Comment", StringType(), True),
    StructField("Month", IntegerType(), True), 
    StructField("year", StringType(), True),
])

In [None]:
# Convert the pandas frame to a Spark DataFrame using the explicit schema.
df_sp = sparkSession.createDataFrame(data=df, schema=schema)


In [None]:
# Sanity check: print the first rows of the converted Spark DataFrame.
df_sp.show()

# <font color='red'> Data Preprocessing </font>

<font color='black'> Drop NaN values </font>

In [None]:
# Keep only rows whose comment is not the literal string 'NaN'.
# NOTE(review): presumably missing comments arrive as the text 'NaN' after the
# pandas -> Spark conversion — confirm against the data.
df_sp = df_sp.where(df_sp['Comment'] != 'NaN')

 
 <font color='black'> Add index column </font>

In [None]:
# Pair every row with its position. toDF() then yields two columns:
# `_1` (the original row as a struct) and `_2` (the positional index).
rdd_df = df_sp.rdd.zipWithIndex()
indexed = rdd_df.toDF()

# Flatten the struct back into plain columns in a single select.
# The schema names the last field "year" (lowercase); read it under that exact
# name and alias it to "Year" instead of relying on Spark's case-insensitive
# field resolution.
df_sp = indexed.select(
    indexed['_2'].alias('Index'),
    indexed['_1'].getItem('Name').alias('Name'),
    indexed['_1'].getItem('Comment').alias('Comment'),
    indexed['_1'].getItem('Month').alias('Month'),
    indexed['_1'].getItem('year').alias('Year'),
)


In [None]:
# Preview five rows to confirm the Index column was added.
df_sp.show(5)

<font color='black'> Select the Comments feature </font>

In [None]:
# Flatten the single-column rows so the RDD holds the comment values
# themselves rather than Row objects.
comments_rdd = (
    df_sp.select("Comment")
    .rdd
    .flatMap(lambda row: row)
)

<font color='black'> Convert the data into lowercase. </font>

In [None]:
# Lower-case every comment before tokenization.
comments_rdd_lower = comments_rdd.map(lambda comment: comment.lower())


In [None]:
# Preview a few records only; collect() would pull the entire RDD to the
# driver and flood the cell output.
comments_rdd_lower.take(5)


<font color='black'> Sentence tokenization </font>

In [None]:
def sentence_tokenization(x, language="french"):
    """Split one comment into a list of sentences.

    Args:
        x: The comment text.
        language: Name of the NLTK Punkt model to use. Defaults to "french"
            because the reviews are French; NLTK's own default is English,
            whose abbreviation list does not fit French text.

    Returns:
        A list of sentence strings.
    """
    return nltk.sent_tokenize(x, language=language)


In [None]:
# Each record becomes the list of sentences found in one comment.
comments_rdd_tok = comments_rdd_lower.map(sentence_tokenization)


In [None]:
# Preview a few records only; collect() would pull the entire RDD to the
# driver and flood the cell output.
comments_rdd_tok.take(5)


<font color='black'> Word tokenization </font>

In [None]:
def word_TokenizeFunctSentence(x):
    """Split every sentence of a comment into word tokens.

    Args:
        x: Iterable of sentence strings.

    Returns:
        A list with one entry per sentence; each entry is the list of words
        left after every non-word character has been replaced by a space.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence, which
    # recent Python versions warn about and plan to reject.
    return [re.sub(r"\W", " ", line).split() for line in x]
# Each record becomes a list of sentences, each sentence a list of word tokens.
comments_rdd_word_tok_sentence = comments_rdd_tok.map(word_TokenizeFunctSentence)


In [None]:
# Preview a few records only; collect() would pull the entire RDD to the
# driver and flood the cell output.
comments_rdd_word_tok_sentence.take(5)


<font color='black'> Build the stop-word set from spaCy's default French stop words, removing the negation words from it so that they are kept in the text </font>

In [None]:
# Negation markers that must stay in the text, so they are taken out of the
# stop-word set below.
deselect_stop_words = ['n\'', 'ne','pas','plus','personne','aucun','ni','aucune','rien']

# Work on a private, mutable copy of spaCy's French stop-word list.
stop_words = set(STOP_WORDS)
for w in deselect_stop_words:
    # discard() does nothing when the word is absent, so no membership test
    # is needed first.
    stop_words.discard(w)

In [None]:
# Display the stop-word set that remains after removing the negation words.
stop_words

In [None]:
def removeStopWordsSentencesFunct(x):
    """Strip stop words and one-character tokens from a tokenized comment.

    Args:
        x: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A list with one string per sentence: the surviving tokens joined by
        single spaces. A sentence with no surviving token becomes ''.
    """
    cleaned_sentences = []
    for tokens in x:
        # Keep a token only if it is not a stop word and is not a single character.
        kept = [tok for tok in tokens if tok not in stop_words and len(tok) != 1]
        cleaned_sentences.append(' '.join(kept))
    return cleaned_sentences


stopwordRDDSen = comments_rdd_word_tok_sentence.map(removeStopWordsSentencesFunct)


In [None]:
# Preview a few records only; collect() would pull the entire RDD to the
# driver and flood the cell output.
stopwordRDDSen.take(5)


<font color='black'> Join Tokens </font>

In [None]:
def joinTokensFunct(x):
    """Merge the cleaned sentences of one comment into a single string.

    Args:
        x: Iterable of sentence strings.

    Returns:
        A one-element list holding the sentences joined by single spaces,
        with every non-word character replaced by a space.
    """
    # Raw string: "\W" in a plain literal is an invalid escape sequence, which
    # recent Python versions warn about and plan to reject.
    return [re.sub(r"\W", " ", " ".join(x))]
# Each record becomes a one-element list containing the fully cleaned comment.
joinedTokens = stopwordRDDSen.map(joinTokensFunct)

In [None]:
# Preview a few records only; collect() would pull the entire RDD to the
# driver and flood the cell output.
joinedTokens.take(5)

# <font color='red'> Subject Extraction </font>

In [None]:
# Subject keywords looked for in each cleaned sentence. TopicsSentences tests
# them with the `in` operator on the sentence string, i.e. as substrings, so a
# keyword also matches inside longer words.
my_words = ["sécurité","prix", "sociale" , "remboursement" , "dentaire", "aide" , "pack" , "optique" , "soins" ,
"enfant","hospitalisation" , "handicap" , "document" , "retraite" , "carte" , "médicament" , "lunettes" ,
"appareil" , "changement" , "accident" , "intervention","garantie","augmentation","implant", "pharmacie" ,"attente", "formule" ,
"maternité" , "cotisation", "cpam" , "diabète", "auditif",
"commercial", "opticien" , "euros" , "retard" , "contrat", "prestation", "dossier" , "chirurgie" , "résiliation" ]

In [None]:
def TopicsSentences(x):
    """Collect the subject keywords mentioned in one comment.

    Args:
        x: Iterable of cleaned sentence strings.

    Returns:
        A list of the keywords from `my_words` that occur as substrings of
        the sentences, ordered by sentence and then by position in `my_words`.
        A keyword found in several sentences appears once per sentence.
    """
    # The former unused local `topics` is gone: it did nothing and shared its
    # name with the module-level RDD assigned just below.
    return [ext for sentence in x for ext in my_words if ext in sentence]


topics = stopwordRDDSen.map(TopicsSentences)


In [None]:
# Preview a few records only; collect() would pull the entire RDD to the
# driver and flood the cell output.
topics.take(5)

Add the comments after preprocessing and the topics to our Dataframe.

In [None]:
# Build both frames directly from the RDDs, with explicit schemas, instead of
# collecting to the driver and re-creating DataFrames. The earlier approach
# copied the whole `_1` struct into the result column, so the values ended up
# wrapped in a struct rather than being a plain string / array.

# Cleaned comment text: joinTokensFunct returns a one-element list, so unwrap
# it before pairing each record with its position.
comments_schema = StructType([
    StructField('comments_after_preproc', StringType(), True),
    StructField('Index', LongType(), True),
])
rdd_df2 = joinedTokens.map(lambda joined: joined[0]).zipWithIndex()
comments_after_preproc = rdd_df2.toDF(comments_schema).select('Index', 'comments_after_preproc')

# Extracted topics: declaring the array type up front means records with an
# empty topic list need no type inference.
topics_schema = StructType([
    StructField('Topics', ArrayType(StringType()), True),
    StructField('Index', LongType(), True),
])
topics_df = topics.zipWithIndex()
Topics = topics_df.toDF(topics_schema).select('Index', 'Topics')


In [None]:
# Attach the cleaned text and the extracted topics to each review by joining
# on the shared positional Index column.
df_spark4 = (
    df_sp
    .join(comments_after_preproc, on=['Index'])
    .join(Topics, on=['Index'])
)


In [None]:
# Preview five rows of the final joined DataFrame.
df_spark4.show(5)