<a href="https://colab.research.google.com/github/saragamilmohamed/NLP/blob/main/Amazon%20Reviews/Polarity%20and%20%20cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
df=pd.read_csv('/content/amazon_alexa.tsv', sep='\t')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
from textblob import TextBlob

In [None]:
df['length'] = df['verified_reviews'].astype(str).apply(len)

In [None]:
def get_polarity(text):
    """Return the TextBlob sentiment polarity score for ``text``.

    Args:
        text: Review text. Non-string values are converted with ``str``.

    Returns:
        The ``sentiment.polarity`` value reported by TextBlob for the text.
    """
    # Pass the text through unchanged. In Python 3, str(text.encode('utf_8'))
    # is the *repr* of a bytes object ("b'...'"), which prepends b' to the
    # first word and turns non-ASCII characters into \x.. escape sequences.
    blob = TextBlob(str(text))
    return blob.sentiment.polarity
df['polarity']=df['verified_reviews'].astype(str).apply(get_polarity)

In [None]:
df.head()

In [None]:
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity score for ``text``.

    Args:
        text: Review text. Non-string values are converted with ``str``.

    Returns:
        The ``sentiment.subjectivity`` value reported by TextBlob for the text.
    """
    # Pass the text through unchanged. In Python 3, str(text.encode('utf_8'))
    # is the *repr* of a bytes object ("b'...'"), which prepends b' to the
    # first word and turns non-ASCII characters into \x.. escape sequences.
    blob = TextBlob(str(text))
    return blob.sentiment.subjectivity
df['subjectivity']=df['verified_reviews'].astype(str).apply(get_subjectivity)

In [None]:
df.head()

In [None]:
df['char_count']=df['verified_reviews'].astype(str).apply(len)

In [None]:
df['word_count']=df['verified_reviews'].astype(str).apply(lambda x: len(x.split()))

In [None]:
df['word_density']=df['char_count']/(df['word_count']+1)

In [None]:
df

In [None]:
import string
punctuation=string.punctuation

In [None]:
# Number of characters in each review that belong to string.punctuation
df['punctuation_count'] = df['verified_reviews'].astype(str).apply(
    lambda review: sum(1 for ch in review if ch in punctuation)
)

In [None]:
df

In [None]:
df[['char_count','word_count','word_density','punctuation_count']].describe()

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
!python -m textblob.download_corpora

In [None]:
# Part-of-speech tag names grouped into the coarse families counted below.
pos_dic = {
    'noun': ['NN', 'NNS', 'NNP', 'NNPS'],
    'pron': ['PRP', 'PRP$', 'WP', 'WP$'],
    'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'adj': ['JJ', 'JJR', 'JJS'],
    'adv': ['RB', 'RBR', 'RBS', 'WRB']
}

def pos_check(x, flag):
    """Count the tokens of ``x`` whose POS tag belongs to the ``flag`` family.

    Args:
        x: Text to tag with TextBlob.
        flag: Key of ``pos_dic`` naming the tag family to count
            ('noun', 'pron', 'verb', 'adj' or 'adv').

    Returns:
        The number of ``(word, tag)`` pairs in ``TextBlob(x).tags`` whose tag
        is listed under ``pos_dic[flag]``; 0 if tagging the text fails.

    Raises:
        KeyError: If ``flag`` is not a key of ``pos_dic``.
    """
    # Resolve the family outside the try block: a misspelled flag is a
    # programming error and must fail loudly instead of being reported once
    # per row and silently producing a column of zeros.
    wanted_tags = set(pos_dic[flag])
    try:
        return sum(1 for pair in TextBlob(x).tags if pair[1] in wanted_tags)
    except Exception as e:
        # Best effort: report the text that could not be tagged and count it as 0.
        print(f"Error processing text: {x}")
        print(f"Exception: {e}")
        return 0

df['noun_count'] = df['verified_reviews'].astype(str).apply(lambda x: pos_check(x, 'noun'))
# Preview a few rows with the notebook's rich display instead of dumping the
# whole frame as plain text with print().
df.head()

In [None]:
df['verified_reviews'][3]

In [None]:
df['pron_count'] = df['verified_reviews'].astype(str).apply(lambda x: pos_check(x, 'pron'))

In [None]:
df['verb_count'] = df['verified_reviews'].astype(str).apply(lambda x: pos_check(x, 'verb'))

In [None]:
df['adj_count'] = df['verified_reviews'].astype(str).apply(lambda x: pos_check(x, 'adj'))

In [None]:
df['adv_count'] = df['verified_reviews'].astype(str).apply(lambda x: pos_check(x, 'adv'))

In [None]:
df

In [None]:
def punctuation_removal(messy_str):
    """Strip every ``string.punctuation`` character from a string.

    Values that are not ``str`` instances are returned unchanged.
    """
    if not isinstance(messy_str, str):
        return messy_str
    # A translation table with only a "delete" set removes those characters.
    strip_table = str.maketrans('', '', string.punctuation)
    return messy_str.translate(strip_table)


df['verified_reviews'] = df['verified_reviews'].apply(punctuation_removal)

In [None]:
# Let's make a function to remove digits from the reviews
import re
def drop_numbers(list_text):
    """Join the items of ``list_text`` that contain no digit.

    Iterating a string yields its characters, so for a ``str`` argument this
    removes every digit character and returns the rest of the text. For any
    other iterable of strings, items containing a digit are dropped and the
    remaining items are concatenated.

    Args:
        list_text: A string, or an iterable of strings.

    Returns:
        The concatenation of the digit-free items.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return ''.join(item for item in list_text if not re.search(r'\d', item))

df['verified_reviews'] = df['verified_reviews'].astype(str).apply(drop_numbers)

In [None]:
# Let's look at the first 10 reviews after removing punctuation and digits
df['verified_reviews'].head(10)

In [None]:
# Let's create a function to remove accented characters
import unicodedata # Import the unicodedata module        é

def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII, keeping base letters of accented ones.

    The text is NFKD-normalized, encoded to ASCII with unencodable characters
    ignored, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# lets apply the function
# Transform the column directly; a row-wise df.apply(axis=1) builds a Series
# for every row just to read this one field.
df['verified_reviews'] = df['verified_reviews'].apply(remove_accented_chars)

In [None]:

# Create a function to remove special characters
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter or digit with a space.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each non-alphanumeric character replaced by one space.
    """
    # The upper-case range must end at 'Z'. The previous 'A-z' range also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # were never replaced.
    pat = r'[^a-zA-Z0-9]'
    return re.sub(pat, ' ', text)

# lets apply this function
# Transform the column directly; a row-wise df.apply(axis=1) builds a Series
# for every row just to read this one field.
df['verified_reviews'] = df['verified_reviews'].apply(remove_special_characters)

In [None]:
df['verified_reviews'][:5]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size for this plot and any created later in the session.
plt.rcParams['figure.figsize'] = (10, 4)

# One histogram per sentiment score, placed side by side.
for panel, column in enumerate(('polarity', 'subjectivity'), start=1):
    plt.subplot(1, 2, panel)
    sns.histplot(df[column])

plt.suptitle('Distribution of Polarity and Subjectivity')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (10, 4)

# sns.distplot is deprecated and scheduled for removal from seaborn. A
# histplot on a density scale with a KDE overlay is the documented replacement
# for the same histogram-plus-curve view.
plt.subplot(1, 2, 1)
sns.histplot(df['polarity'], kde=True, stat='density')

plt.subplot(1, 2, 2)
sns.histplot(df['subjectivity'], kde=True, stat='density')

plt.suptitle('Distribution of Polarity and Subjectivity')
plt.show()

In [None]:
# Scatter each review's polarity against its subjectivity

sns.scatterplot(data=df, x='polarity', y='subjectivity')
plt.title('Polarity vs Subjectivity')
plt.show()

In [None]:


from sklearn.feature_extraction.text import CountVectorizer

# Term counts over all reviews, with English stop words excluded.
cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(df.verified_reviews)
sum_words = words.sum(axis=0)  # per-term totals across the whole corpus

# (term, total count) pairs ordered from most to least frequent.
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])

plt.style.use('fivethirtyeight')
color = plt.cm.ocean(np.linspace(0, 1, 20))
# With the descending sort above, the last 20 rows are the rarest terms.
frequency.tail(20).plot(x='word', y='freq', kind='bar', figsize=(15, 6), color=color)
plt.title("Least Frequently Occuring Words - Top 20")
plt.show()

In [None]:
words.sum(axis=0)

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Term counts over all reviews, with English stop words excluded.
cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(df.verified_reviews)
sum_words = words.sum(axis=0)

# (term, total count) pairs ordered from most to least frequent.
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Size each word in the cloud by its corpus-wide count.
cloud = WordCloud(background_color='lightcyan', width=2000, height=2000)
wordcloud = cloud.generate_from_frequencies(dict(words_freq))

plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.title("Vocabulary from Reviews", fontsize=20)
plt.show()

In [None]:
words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]

In [None]:
words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)

In [None]:
words_freq[1]

In [None]:
import collections

# function for making ngrams
from nltk.util import ngrams

In [None]:
# Use every review in the column. str(Series) only returns pandas' display
# repr (a truncated preview with index labels and a dtype footer), so counting
# n-grams over it does not describe the corpus.
reviews = df['verified_reviews'].astype(str)
text = ' '.join(reviews)
tokenized = text.split()  # flat token list, kept available for later cells

# Build the bi-grams one review at a time so no pair straddles two reviews
esBigrams = (pair for review in reviews for pair in ngrams(review.split(), 2))

# get the frequency of each bi-gram in our corpus
esBigramFreq = collections.Counter(esBigrams)

# what are the ten most common bi-grams in the review corpus?
esBigramFreq.most_common(10)

In [None]:
# Counting the tri-grams

# Use every review in the column. str(Series) only returns pandas' display
# repr (a truncated preview with index labels and a dtype footer), so counting
# n-grams over it does not describe the corpus.
reviews = df['verified_reviews'].astype(str)
text = ' '.join(reviews)
tokenized = text.split()  # flat token list, kept available for later cells

# Build the tri-grams one review at a time so no triple straddles two reviews
esTrigrams = (triple for review in reviews for triple in ngrams(review.split(), 3))

# get the frequency of each tri-gram in our corpus
esTrigramFreq = collections.Counter(esTrigrams)

# what are the ten most common tri-grams in the review corpus?
esTrigramFreq.most_common(10)