#### =====================================================================================
#### Problem statement : Flag the comments which have opposite sentiments with respect to app rating 

In [1]:
#loading libraries 
import numpy as np
import pandas as pd 

In [2]:
import warnings 

warnings.filterwarnings("ignore")

In [3]:
# Reading the collated file 
collated_data = pd.read_csv("collated_app_comments_2gud.csv")

In [4]:
collated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4308 entries, 0 to 4307
Data columns (total 29 columns):
Package Name                             4308 non-null object
App Version Code                         2931 non-null float64
App Version Name                         2931 non-null float64
Reviewer Language                        4308 non-null object
Device                                   4308 non-null object
Review Submit Date and Time              4308 non-null object
Review Submit Millis Since Epoch         4308 non-null int64
Review Last Update Date and              2520 non-null object
Week                                     2520 non-null float64
ime                                      2520 non-null object
Review Last Update Millis Since Epoch    4308 non-null int64
Star Rating                              4308 non-null int64
Review Title                             0 non-null float64
Review Text                              2068 non-null object
Developer Reply Date and Ti

In [5]:
collated_data['Month'].value_counts()

july      1216
june       906
march      719
august     572
feb        535
may        360
Name: Month, dtype: int64

In [6]:
collated_data[collated_data['Star Rating'] <= 3]['Month'].value_counts()

july      570
june      312
august    297
march     231
may       152
feb       147
Name: Month, dtype: int64

### Data Manipulation 

In [7]:
collated_data['Review Text'].fillna('Blank', inplace = True)
#Removing all symbols etc from the comments 

# removing everything except alphabets`
collated_data['clean_doc'] = collated_data['Review Text'].str.replace("[^a-zA-Z2#]", " ")

# removing short words
collated_data['clean_doc'] = collated_data['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>1]))

# make all text lowercase
collated_data['clean_doc'] = collated_data['clean_doc'].apply(lambda x: x.lower())

In [8]:
import re

#Correcting mis splet words
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

collated_data['clean_doc'] = collated_data['clean_doc'].apply(lambda x: reduce_lengthening(x))

In [9]:
collated_data['clean_doc']

0                                   very good
1                                      so gud
2                                       wrost
3                                       blank
4       is app down showing only white screen
                        ...                  
4303                                      tky
4304                                    blank
4305                                         
4306                                very nice
4307                                    blank
Name: clean_doc, Length: 4308, dtype: object

In [None]:
# import nltk
# nltk.download('punkt')


# # Lemmatisation
# w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# lemmatizer = nltk.stem.WordNetLemmatizer()

# def lemmatize_text(text):
#     return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

# collated_data['text_lemmatized'] = collated_data.clean_doc.apply(lemmatize_text)

In [None]:
# ! pip install pyspellchecker

In [None]:
# from spellchecker import SpellChecker

# spell = SpellChecker()

# # find those words that may be misspelled
# misspelled = spell.unknown(['gud', 'wrost', 'hapenning', 'here'])

# for word in misspelled:
#     # Get the one `most likely` answer
#     print(spell.correction(word))

#     # Get a list of `likely` options
#     print(spell.candidates(word))

In [None]:
# # Text correction 
# from pattern.en import spelling


# # correct_word = 
# spelling(word_wlf) 
# # print correct_word

In [None]:
# output = TextBlob(word_wlf).correct()
# print(output)

In [None]:
# ! pip install autocorrect 

In [None]:
# from autocorrect import Speller
# spell = Speller(lang='en')

# word_wlf = 'gud'
# a = spell(word_wlf)
# a

In [10]:
# spell correction 
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
# Get the one `most likely` answer
# collated_data['clean_doc'] = np.where(collated_data['clean_doc'].str.contains(' gud'), 'good',collated_data['clean_doc'])
collated_data['clean_doc'] = collated_data['clean_doc'].apply(lambda x : spell.correction(x)) 

In [11]:
collated_data['clean_doc']

0                                   very good
1                                      so gud
2                                       worst
3                                       blank
4       is app down showing only white screen
                        ...                  
4303                                      try
4304                                    blank
4305                                        a
4306                                very nice
4307                                    blank
Name: clean_doc, Length: 4308, dtype: object

In [12]:
#replacing gud with good 
collated_data["clean_doc"] = collated_data.clean_doc.replace({'gud':'good'}, regex=True)

### Tagging Polarity to the existing comments 

In [13]:
from textblob import TextBlob #for polarity of comment 

df = collated_data.copy()

df['polarity_textblob'] = df['clean_doc'].map(lambda text: TextBlob(text).sentiment.polarity)
df['review_len'] = df['clean_doc'].astype(str).apply(len)
df['word_count'] = df['clean_doc'].apply(lambda x: len(str(x).split()))

In [14]:
df[['clean_doc','polarity_textblob']]

Unnamed: 0,clean_doc,polarity_textblob
0,very good,0.910000
1,so good,0.700000
2,worst,-1.000000
3,blank,0.000000
4,is app down showing only white screen,-0.051852
...,...,...
4303,try,0.000000
4304,blank,0.000000
4305,a,0.000000
4306,very nice,0.780000


In [None]:
#writing the polarity file using TEXT BLOB
# df.to_csv("App_comments_w_polarity.csv", index = False)

### VADER 

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [16]:
import nltk 
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shivani.singh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [17]:
df["polarity_vader"] = df["clean_doc"].apply(lambda x : sia.polarity_scores(x))

In [18]:
for row in range(len(df)):
    df.loc[row,"overall_vader"] = df.polarity_vader.loc[row]['compound']

In [19]:
df.overall_vader

0       0.4927
1       0.4927
2      -0.6249
3       0.0000
4       0.0000
         ...  
4303    0.0000
4304    0.0000
4305    0.0000
4306    0.4754
4307    0.0000
Name: overall_vader, Length: 4308, dtype: float64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4308 entries, 0 to 4307
Data columns (total 35 columns):
Package Name                             4308 non-null object
App Version Code                         2931 non-null float64
App Version Name                         2931 non-null float64
Reviewer Language                        4308 non-null object
Device                                   4308 non-null object
Review Submit Date and Time              4308 non-null object
Review Submit Millis Since Epoch         4308 non-null int64
Review Last Update Date and              2520 non-null object
Week                                     2520 non-null float64
ime                                      2520 non-null object
Review Last Update Millis Since Epoch    4308 non-null int64
Star Rating                              4308 non-null int64
Review Title                             0 non-null float64
Review Text                              4308 non-null object
Developer Reply Date and Ti

In [21]:
df["Wrong_rating_flag"] = np.where((df["Star Rating"].isin([4,5])) & (df["overall_vader"] <= -0.1), 'neg_com',
                                   np.where((df["Star Rating"].isin([1,2,3])) & (df["overall_vader"] >= 0.25), 'pos_com','0'))

In [22]:
df["Wrong_rating_flag"].unique()

array(['0', 'pos_com', 'neg_com'], dtype=object)

In [23]:
df["False_positive_comments"] = np.where((df["Wrong_rating_flag"] == 'pos_com') & (df["Developer Reply Text"].str.contains('Sorry')),1,0)

In [24]:
# Creating new star ratig columns based on polarity 


for row in range(len(df)) :

    if ((df.loc[row,"Wrong_rating_flag"] == "pos_com") & (df.loc[row,"Star Rating"] == 1) & (df.loc[row,"False_positive_comments"] ==0)):
        df.loc[row,"New Star Rating"] = 4

    elif ((df.loc[row,"Wrong_rating_flag"] == "pos_com") & (df.loc[row,"False_positive_comments"] ==0) & ((df.loc[row,"Star Rating"] == 2) | (df.loc[row,"Star Rating"] == 3))):
        df.loc[row,"New Star Rating"] = 5

    elif ((df.loc[row,"Wrong_rating_flag"] == "neg_com") & ((df.loc[row,"Star Rating"] == 3) | (df.loc[row,"Star Rating"] == 4))):
        df.loc[row,"New Star Rating"] = 1

    elif  ((df.loc[row,"Wrong_rating_flag"] == "neg_com") & (df.loc[row,"Star Rating"] == 5 )):
        df.loc[row,"New Star Rating"] = 2

    else :
        df.loc[row,"New Star Rating"] = df.loc[row,"Star Rating"]


In [25]:
df[["Star Rating","New Star Rating"]]

Unnamed: 0,Star Rating,New Star Rating
0,4,4.0
1,3,5.0
2,1,1.0
3,4,4.0
4,2,2.0
...,...,...
4303,1,1.0
4304,1,1.0
4305,1,1.0
4306,5,5.0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4308 entries, 0 to 4307
Data columns (total 38 columns):
Package Name                             4308 non-null object
App Version Code                         2931 non-null float64
App Version Name                         2931 non-null float64
Reviewer Language                        4308 non-null object
Device                                   4308 non-null object
Review Submit Date and Time              4308 non-null object
Review Submit Millis Since Epoch         4308 non-null int64
Review Last Update Date and              2520 non-null object
Week                                     2520 non-null float64
ime                                      2520 non-null object
Review Last Update Millis Since Epoch    4308 non-null int64
Star Rating                              4308 non-null int64
Review Title                             0 non-null float64
Review Text                              4308 non-null object
Developer Reply Date and Ti

In [28]:
df['Month'].value_counts()

july      1216
june       906
march      719
august     572
feb        535
may        360
Name: Month, dtype: int64

In [27]:
df["False_positive_comments"].value_counts()

0    4265
1      43
Name: False_positive_comments, dtype: int64

In [29]:
df.to_csv("polarity_textblob_vader.csv", index = False)

In [30]:
sentence = "many issue in this app"

In [31]:
TextBlob(sentence).sentiment.polarity

0.5

In [32]:
ss = sia.polarity_scores(sentence)

for k in sorted(ss):
    print('{0}: {1}, '.format(k, ss[k]), end='')
    print()

compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 


### Stanford core nlp

In [None]:
# ! pip install pycorenlp

In [None]:
# from pycorenlp import StanfordCoreNLP

# nlp = StanfordCoreNLP()
# res = nlp.annotate("I love you. I hate him. You are nice. He is dumb",
#                    properties={
#                        'annotators': 'sentiment',
#                        'outputFormat': 'json',
#                        'timeout': 1000,
#                    })
# for s in res["sentences"]:
#     print("%d: '%s': %s %s" % (
#         s["index"],
#         " ".join([t["word"] for t in s["tokens"]]),
#         s["sentimentValue"], s["sentiment"]))