IMPORTING REQUIRED LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import string
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Kaggle location of the dataset; switch to the local path when running offline.
reviews_path = '../input/amazon-alexa-reviews/amazon_alexa.tsv'
# reviews_path = 'amazon_alexa.tsv'  # for local storage

# The file is tab-separated, so the default comma delimiter is overridden.
df = pd.read_csv(reviews_path, sep='\t')
df.head()

PLOTS

In [None]:
# Bar chart of how many reviews carry each `feedback` value.
fig = plt.figure(figsize=(7, 4))

feedback_counts = df['feedback'].value_counts()

sns.barplot(
    x=feedback_counts.index.to_list(),   # distinct feedback values
    y=feedback_counts.to_list(),         # number of reviews per value
)

In [None]:
# Plot the distribution of ratings.
# The former bins=5 / binrange=(0, 5) layout produced the edges 0, 1, ..., 5.
# The last histogram bin is closed on both sides, so ratings of 4 and 5 were
# counted together in [4, 5] while the [0, 1) bin stayed empty.
# discrete=True draws one unit-wide bar centred on each value instead
# (assumes ratings are whole-number scores -- TODO confirm against the data).
fig = plt.figure(figsize=(7, 4))
sns.histplot(data=df, x='rating', discrete=True, kde=True)

TEXT ANALYSIS STARTS HERE

In [None]:
# Keep only the text of the reviews by discarding the metadata columns.
metadata_columns = ['rating', 'date', 'variation', 'feedback']
review_text = df.drop(columns=metadata_columns)
review_text

LOWERCASING

In [None]:
# Lowercase the raw review text.
# The column is selected as a Series rather than applying over a whole
# DataFrame: assigning a DataFrame to a single column only works while that
# frame has exactly one column. Missing reviews are blanked out first so that
# astype(str) does not turn them into the literal token "nan".
df['review_lower'] = df['verified_reviews'].fillna('').astype(str).str.lower()
df['review_lower']

REMOVAL OF PUNCTUATION

In [None]:
# Helper that strips punctuation from a piece of text.
def remove_punctuation(text):
    """Return ``text`` without any element found in ``string.punctuation``.

    The input is walked element by element (character by character for a
    ``str``); everything that is not punctuation is kept in its original
    order and concatenated back into a single string.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)
# Store the punctuation-free version of every lowercased review.
df['review_nopunc'] = df['review_lower'].apply(remove_punctuation)

df['review_nopunc']

REMOVAL OF EMOJIS AND EMOTICONS

In [None]:
def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    string : str
        Text to clean. The parameter name shadows the imported ``string``
        module inside this function only; it is kept so existing callers
        keep working.

    Returns
    -------
    str
        The text with every run of matched code points deleted.

    Notes
    -----
    The previous character class contained the range U+24C2..U+1F251, which
    spans most of the Basic Multilingual Plane and therefore also deleted
    ordinary CJK, Hangul and other non-emoji text. It is replaced here by the
    specific symbol blocks that range was meant to reach. This is a
    pragmatic filter, not an exhaustive emoji matcher.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U00002600-\U000027BF"  # miscellaneous symbols + dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
        "\U000024C2"             # circled latin capital letter M
        "\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
        "\U0000FE0F"             # variation selector-16 left behind by emoji
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', string)

# Strip emoji from every punctuation-free review and keep the result in its own column.
df['review_noemoji'] = df['review_nopunc'].apply(remove_emoji)
df['review_noemoji']

In [None]:
# Text emoticons to strip from reviews; extend this list as needed.
EMOTICONS = [':-<', ':->']


def remove_emoticons(text):
    """Return ``text`` with every emoticon listed in ``EMOTICONS`` removed.

    ``EMOTICONS`` is read on each call, so entries added to the list later
    are honoured. Each emoticon is escaped before being placed in the regular
    expression, because characters such as ``(``, ``)``, ``|`` or ``*`` would
    otherwise be interpreted as regex syntax (``':)'`` used to raise
    ``re.error``). Longer emoticons are tried first so that one emoticon being
    a prefix of another does not leave stray characters behind.
    """
    if not EMOTICONS:
        # Nothing to remove; avoid compiling an empty alternation.
        return text
    alternatives = sorted(EMOTICONS, key=len, reverse=True)
    emoticon_pattern = re.compile('|'.join(re.escape(k) for k in alternatives))
    return emoticon_pattern.sub('', text)

# Remove the emoticons listed in EMOTICONS from each emoji-free review.
# NOTE(review): 'review_noemoji' is derived from the punctuation-stripped
# column, so emoticons made purely of punctuation are already gone here.
df['review_noemoticons'] = df['review_noemoji'].apply(remove_emoticons)

df['review_noemoticons']

TOKENIZATION

In [None]:
#defining function for tokenization
import re
def tokenization(text):
    """Split ``text`` into a list of word tokens.

    The text is split on runs of non-word characters (``\\W+``). The earlier
    pattern ``'W+'`` had no backslash, so it split on the literal capital
    letter "W" and returned lowercased reviews as a single unsplit token.
    Empty strings produced by leading or trailing separators are dropped, so
    an empty or separator-only input yields ``[]``.
    """
    return [token for token in re.split(r'\W+', text) if token]
# Tokenize every cleaned review and store the token lists in a new column.
df['msg_tokenized'] = df['review_noemoticons'].apply(tokenization)

df['msg_tokenized']

REMOVAL OF STOP WORDS

In [None]:
from nltk.corpus import stopwords
# Fetch the stopword corpus through NLTK's downloader, then preview the English list.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
", ".join(english_stopword_list)

In [None]:
# English stopwords collected into a set; remove_stopwords() tests words for membership in it.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with every word found in ``STOPWORDS`` removed.

    Parameters
    ----------
    text : str or list/tuple of str
        Either a plain string or a sequence of tokens. Every token is itself
        split on whitespace, so a sequence holding whole sentences is handled
        the same way as one holding single words. Any other value is
        converted with ``str()`` and split on whitespace, as before.

    Returns
    -------
    str
        The remaining words joined by single spaces.

    Notes
    -----
    Token lists used to be passed through ``str()``, which produced the list
    *repr* (``"['love', 'my', 'echo']"``). Brackets, quotes and commas then
    leaked into the output and kept words from matching the stopword set.
    """
    if isinstance(text, (list, tuple)):
        words = [word for token in text for word in str(token).split()]
    else:
        words = str(text).split()
    return " ".join(word for word in words if word not in STOPWORDS)

# Drop stopwords from every tokenized review and keep the cleaned text in its own column.
df["text_wo_stop"] = df["msg_tokenized"].apply(remove_stopwords)
df["text_wo_stop"]

STEMMING

In [None]:
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance; stem_words() calls its .stem() on each word.
stemmer = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated word of ``text``.

    Every word is passed through the module-level ``stemmer`` and the
    resulting stems are joined back together with single spaces.
    """
    stems = []
    for word in text.split():
        stems.append(stemmer.stem(word))
    return " ".join(stems)

# Stem every stopword-free review and store the result in a new column.
df["text_stemmed"] = df["text_wo_stop"].apply(stem_words)
df["text_stemmed"]

LEMMATIZATION

In [None]:
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus through NLTK's downloader, then create the lemmatizer
# instance that lemmatize_words() uses.
# NOTE(review): some NLTK releases reportedly also need nltk.download('omw-1.4')
# before lemmatizing -- confirm for the target environment.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is passed through the module-level ``lemmatizer`` and the
    results are joined back together with single spaces.
    """
    lemmas = []
    for word in text.split():
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

# Lemmatize every stopword-free review and store the result in a new column.
df["text_lemmatized"] = df["text_wo_stop"].apply(lemmatize_words)
df["text_lemmatized"]

In [None]:
# Preview the first rows of the frame, now including the columns added by the preprocessing cells.
df.head()

BAG OF WORDS VECTORIZER

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words term counts over the raw review text.
# Missing reviews are replaced with empty strings and all values are coerced
# to str first, because CountVectorizer raises on NaN / non-string documents.
bow = CountVectorizer()
data_bow = bow.fit_transform(df['verified_reviews'].fillna('').astype(str))
print("n_samples: %d, n_features: %d" % data_bow.shape)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases that
# do not provide it. The result is converted to a list so the printed output
# keeps its previous form.
if hasattr(bow, 'get_feature_names_out'):
    bow_vocabulary = bow.get_feature_names_out().tolist()
else:
    bow_vocabulary = bow.get_feature_names()
print(bow_vocabulary)

In [None]:
# Convert the sparse count matrix to a dense array so the values themselves are printed.
dense_bow = data_bow.toarray()
print(dense_bow)

TF-IDF VECTORIZER

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights over the raw review text.
# Missing reviews are replaced with empty strings and all values are coerced
# to str first, because TfidfVectorizer raises on NaN / non-string documents.
tf_idf = TfidfVectorizer()
data_tf = tf_idf.fit_transform(df['verified_reviews'].fillna('').astype(str))
print("n_samples: %d, n_features: %d" % data_tf.shape)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases that
# do not provide it. The result is converted to a list so the printed output
# keeps its previous form.
if hasattr(tf_idf, 'get_feature_names_out'):
    tfidf_vocabulary = tf_idf.get_feature_names_out().tolist()
else:
    tfidf_vocabulary = tf_idf.get_feature_names()
print(tfidf_vocabulary)

In [None]:
# Convert the sparse TF-IDF matrix to a dense array so the values themselves are printed.
dense_tf = data_tf.toarray()
print(dense_tf)