In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
# Lift pandas' display limits so rows and columns are never truncated in output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)


In [None]:
# Read the raw comments file and preview its first rows.
comments_csv = '../input/insurance-reviews-france/Comments.csv'
data = pd.read_csv(comments_csv)
data.head()

In [None]:
data.isna().sum()  # number of missing (NaN) values in each column

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): this runs before 'Unnamed: 0', 'Name', 'Month' and 'Year' are
# dropped, so a row with a usable comment is discarded when only one of those
# columns is missing — confirm this is intended.
data = data.dropna()

In [None]:
# Re-count missing values per column after the dropna step.
data.isna().sum()

We run a global sentiment analysis that ignores the company name, the month and the year.

In [None]:
# Remove the index copy and the company/date columns, which this analysis does not use.
unused_columns = ['Unnamed: 0', 'Name', 'Month', 'Year']
data = data.drop(columns=unused_columns)

**CLEAN TEXT**

In [None]:
# Maps every ASCII punctuation character to a space so that words separated
# only by punctuation ("bien,merci", "l'assurance") are not fused together.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def clean_text(text):
    """Normalize a raw review string for downstream tokenization.

    Steps, in order:
      1. remove @mentions (with or without a space after the ``@``),
      2. remove ``#`` symbols (the hashtag word itself is kept),
      3. remove the retweet marker ``RT`` when it is a standalone word,
      4. remove http/https hyperlinks,
      5. replace ASCII punctuation with spaces,
      6. drop tokens made only of digits and collapse all whitespace runs,
      7. lowercase the result.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned, lowercased text with single spaces between words and no
        leading or trailing whitespace.
    """
    # The optional space keeps matching the "@ name" form while also catching
    # the usual "@name" handle.
    text = re.sub(r'@ ?[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b prevents eating the tail of ordinary words that end in "RT".
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = text.translate(_PUNCT_TO_SPACE)
    # str.split() with no argument collapses any run of whitespace.
    words = [word for word in text.split() if not word.isdigit()]
    return ' '.join(words).lower()

In [None]:
# Make sure every comment is a str so clean_text can run its string operations.
data['Comment'] = data['Comment'].map(str)

In [None]:
# Clean each comment in place.
data['Comment'] = data['Comment'].apply(clean_text)

In [None]:
# Preview the comments after text cleaning.
data.head()

**REMOVE STOP WORDS**

In [None]:
!pip install spacy

In [None]:
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.fr.examples import sentences #example

In [None]:
import spacy.cli
# Download the small French spaCy pipeline that is loaded in the next cell.
spacy.cli.download("fr_core_news_sm")

In [None]:
# Load the French pipeline; `nlp` is used later to lemmatize the comments.
nlp = spacy.load('fr_core_news_sm')

In [None]:
def tokenizer(text):
    """Split text into word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore, Unicode-aware). Text that is empty, or that starts or ends
    with a separator, yields no empty-string tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in their original order; ``[]`` when the text
        contains no word characters.
    """
    # A raw string avoids the invalid "\W" escape; findall never returns the
    # empty strings that re.split produces at the edges of the text.
    return re.findall(r'\w+', text)

In [None]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``fr_stop``.

    Args:
        text: An iterable of tokens (not a raw string).

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for token in text:
        if token in fr_stop:
            continue
        kept.append(token)
    return kept

In [None]:
# Replace each cleaned comment with its list of tokens.
data['Comment'] = data['Comment'].apply(tokenizer)

In [None]:
# Filter the stop words out of every token list.
data['Comment'] = data['Comment'].apply(remove_stopwords)

In [None]:
# Preview the token lists left in 'Comment' after stop-word removal.
data.head()

**Lemmatization**

In [None]:
# Re-join the tokens with single spaces before lemmatization. Joining with
# ', ' would put back commas, which spaCy then emits as separate tokens.
data['Comment'] = data['Comment'].apply(' '.join)

In [None]:
# Lemmatize every comment. nlp.pipe processes the texts in batches, and the
# result is one space-joined string of lemmas per comment, in row order.
data['reviews'] = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(data['Comment'])
]

In [None]:
# Preview the new 'reviews' column holding the lemmatized text.
data.head()

In [None]:
# 'reviews' now carries the processed text, so the 'Comment' column is removed.
data = data.drop(columns='Comment')

**Test Vader Sentiment & TextBlob**

**Vader sentiment**

In [None]:
!pip install vaderSentiment-fr==1.2

In [None]:
from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Analyzer instance used to score each review below.
analyzer_fr = SentimentIntensityAnalyzer()
# Example (uncomment to score a single word and print its scores):
#phrase = "vilain"
#vs = analyzer_fr.polarity_scores(phrase)
#print("{:-<50} {}".format(phrase, str(vs)))

In [None]:
# Score every review; each 'scores' entry is the value returned by polarity_scores.
data['scores'] = data['reviews'].apply(analyzer_fr.polarity_scores)

In [None]:
# Preview the reviews with their 'scores' column.
data.head()

In [None]:
# Pull the 'compound' value out of each score and label the review by its
# sign: > 0 POSITIVE, == 0 NEUTRAL, < 0 NEGATIVE ('' when none applies).
compound = data['scores'].apply(lambda score: score['compound'])
data['compound'] = compound
data['sentiment_vader'] = np.select(
    [compound > 0, compound == 0, compound < 0],
    ['POSITIVE', 'NEUTRAL', 'NEGATIVE'],
    default='',
)

In [None]:
# Preview the 'compound' and 'sentiment_vader' columns.
data.head()

In [None]:
# Only the label is kept; the intermediate score columns are removed.
data = data.drop(columns=['scores', 'compound'])

In [None]:
# Preview the frame after removing the intermediate score columns.
data.head()

In [None]:
# Colour palette for the sentiment bar charts
# (presumably positive / negative / neutral — TODO confirm).
colors = ['green','red','blue']

In [None]:
# value_counts() orders the bars by frequency, so colours are looked up per
# label rather than applied by position; each sentiment keeps one fixed colour.
vader_counts = data.sentiment_vader.value_counts()
vader_palette = dict(zip(['POSITIVE', 'NEGATIVE', 'NEUTRAL'], colors))
vader_counts.plot(
    kind='bar',
    title="sentiment analysis",
    figsize=(20, 10),
    # Any unexpected label falls back to gray.
    color=[vader_palette.get(label, 'gray') for label in vader_counts.index],
)

In [None]:
# Share of each VADER label, as a percentage of all rows.
print(data.sentiment_vader.value_counts().div(len(data)).mul(100))

**TextBlob**

In [None]:
pip install textblob-fr

In [None]:
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
# Blob factory configured with the textblob_fr tagger and analyzer; tb(text) builds a blob.
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())

In [None]:
# First element of each review's TextBlob sentiment, labelled by its sign:
# > 0 POSITIVE, < 0 NEGATIVE, otherwise NEUTRAL.
polarities = (tb(review).sentiment[0] for review in data['reviews'])
sentiment_blob = [
    'POSITIVE' if polarity > 0 else 'NEGATIVE' if polarity < 0 else 'NEUTRAL'
    for polarity in polarities
]

data['sentiment_blob'] = sentiment_blob

In [None]:
# Preview the frame with the new 'sentiment_blob' column.
data.head()

In [None]:
# value_counts() orders the bars by frequency, so colours are looked up per
# label rather than applied by position; each sentiment keeps one fixed colour.
blob_counts = data.sentiment_blob.value_counts()
blob_palette = dict(zip(['POSITIVE', 'NEGATIVE', 'NEUTRAL'], colors))
blob_counts.plot(
    kind='bar',
    title="sentiment analysis",
    figsize=(20, 10),
    # Any unexpected label falls back to gray.
    color=[blob_palette.get(label, 'gray') for label in blob_counts.index],
)

In [None]:
# Share of each TextBlob label, as a percentage of all rows.
print(data.sentiment_blob.value_counts().div(len(data)).mul(100))

**Difference between TextBlob and Vader**

In [None]:
# Rows where both tools assign the same label, one frame per label.
vader_label, blob_label = data['sentiment_vader'], data['sentiment_blob']
vader_blob_positive = data.loc[vader_label.eq('POSITIVE') & blob_label.eq('POSITIVE')]
vader_blob_negative = data.loc[vader_label.eq('NEGATIVE') & blob_label.eq('NEGATIVE')]
vader_blob_neutral = data.loc[vader_label.eq('NEUTRAL') & blob_label.eq('NEUTRAL')]

In [None]:
# Number of rows on which the two tools agree, per label.
pos, neg, neu = (
    len(agreeing_rows)
    for agreeing_rows in (vader_blob_positive, vader_blob_negative, vader_blob_neutral)
)

In [None]:
# Fraction of all rows that receive the same label from both tools.
agreeing_total = sum((pos, neg, neu))
similar_vader_blob = agreeing_total / len(data)
similar_vader_blob

TextBlob and Vader assign the same label to about 60% of the reviews in this dataset.

We also see that Vader and TextBlob have roughly the same rate of negative sentiment (Vader = 30.51%, TextBlob = 30.84%).

The disagreement therefore lies mainly in reviews that Vader labels neutral while TextBlob labels them positive, and vice versa.

