In [22]:
import spacy
import pandas as pd
import nltk

In [23]:
#Load Spacy model 
nlp= spacy.load("en_core_web_sm")

In [24]:
#creating DataFrame by loading DataSet and selecting the relevant column for data cleaning
df_amazon = pd.read_csv("amazon_product_reviews.csv", sep=',', low_memory=False)
df_amazon_reviews_column = df_amazon['reviews.text'].head()
print(df_amazon_reviews_column)

0    This product so far has not disappointed. My c...
1    great for beginner or experienced person. Boug...
2    Inexpensive tablet for him to use and learn on...
3    I've had my Fire HD 8 two weeks now and I love...
4    I bought this for my grand daughter when she c...
Name: reviews.text, dtype: object


In [25]:
#converting everything to lower case
a_row_selection = df_amazon_reviews_column.loc[:] #selecting all rows 
b_lower_case = a_row_selection.str.lower() #converting everything into lower case
print(b_lower_case) 

0    this product so far has not disappointed. my c...
1    great for beginner or experienced person. boug...
2    inexpensive tablet for him to use and learn on...
3    i've had my fire hd 8 two weeks now and i love...
4    i bought this for my grand daughter when she c...
Name: reviews.text, dtype: object


In [26]:
#removing all punctuation and null rows
new_df = b_lower_case.dropna()
y= df_amazon["reviews.text"].str.replace(r'[\.\,\?\!\$\(\)\/"\&\-]',"",regex=True)
clean_df= y.str.lower()
print(clean_df)

0        this product so far has not disappointed my ch...
1        great for beginner or experienced person bough...
2        inexpensive tablet for him to use and learn on...
3        i've had my fire hd 8 two weeks now and i love...
4        i bought this for my grand daughter when she c...
                               ...                        
34655    this is not appreciably faster than any other ...
34656    amazon should include this charger with the ki...
34657    love my kindle fire but i am really disappoint...
34658    i was surprised to find it did not come with a...
34659    to spite the fact that i have nothing but good...
Name: reviews.text, Length: 34660, dtype: object


In [27]:
# The code below works - Only doing it for the first 5 rows to check the code works
clean_df['tokenized']= clean_df.iloc[:5].apply(lambda x: ['"'+ token.text +'"' for token in nlp(x)])
tokenized_df=clean_df['tokenized']
print(tokenized_df)

0    ["this", "product", "so", "far", "has", "not",...
1    ["great", "for", "beginner", "or", "experience...
2    ["inexpensive", "tablet", "for", "him", "to", ...
3    ["i", "'ve", "had", "my", "fire", "hd", "8", "...
4    ["i", "bought", "this", "for", "my", "grand", ...
Name: reviews.text, dtype: object


In [28]:
#Lemmatizing 
def lemmatize_text(text):
    doc= nlp(" ".join(text))
    return [token.lemma_ for token in doc]

tokenized_df['lemmatized'] = tokenized_df.apply(lemmatize_text)
print (tokenized_df)

0             ["this", "product", "so", "far", "has", "not",...
1             ["great", "for", "beginner", "or", "experience...
2             ["inexpensive", "tablet", "for", "him", "to", ...
3             ["i", "'ve", "had", "my", "fire", "hd", "8", "...
4             ["i", "bought", "this", "for", "my", "grand", ...
lemmatized    0    [", this, ", ", product, ", ", so, ", ", ...
Name: reviews.text, dtype: object


In [29]:
#Sentiment analyses 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#using sentiment intesity analyzer function
sid= SentimentIntensityAnalyzer()

#function to get sentiment scores
def get_sentiment_scores(text):
    return sid.polarity_scores(text)

#applying sentiment analysis to the lemmatized text
tokenized_df['sentiment_scores'] = tokenized_df['lemmatized'].apply(lambda x: get_sentiment_scores(" ".join(x)))
print(tokenized_df)

0                   ["this", "product", "so", "far", "has", "not",...
1                   ["great", "for", "beginner", "or", "experience...
2                   ["inexpensive", "tablet", "for", "him", "to", ...
3                   ["i", "'ve", "had", "my", "fire", "hd", "8", "...
4                   ["i", "bought", "this", "for", "my", "grand", ...
lemmatized          0    [", this, ", ", product, ", ", so, ", ", ...
sentiment_scores    0    {'neg': 0.0, 'neu': 0.599, 'pos': 0.401, ...
Name: reviews.text, dtype: object
