In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import nltk
import string

In [None]:
# Create a SQL connection to our SQLite database file.
# NOTE(review): no con.close() appears in the cells of this notebook, so the
# connection is left open after the queries below have run.
con = sqlite3.connect('../input/amazon-customers-data/database.sqlite')

In [None]:
type(con)

**Reading data from the SQLite database**

In [None]:
pd.read_sql_query("SELECT * FROM Reviews", con)

In [None]:
pd.read_sql_query("SELECT * FROM Reviews LIMIT 3", con)

**Loading the data using pandas**

In [None]:
df=pd.read_csv("../input/amazon-customers-data/Reviews.csv")
df.head()

In [None]:
df.shape

# Perform Sentiment Analysis on Data

In [None]:
from textblob import TextBlob

In [None]:
text=df['Summary'][0]
text

In [None]:
TextBlob(text).sentiment.polarity

In [None]:
# Sentiment polarity for every review summary, in row order.
# Summaries that are not strings (e.g. NaN for a missing value) are scored as
# neutral (0). Unlike a bare `except:`, any other failure inside TextBlob is
# allowed to surface instead of being silently recorded as 0.
polarity = [
    TextBlob(summary).sentiment.polarity if isinstance(summary, str) else 0
    for summary in df['Summary']
]

In [None]:
len(polarity)

In [None]:
data=df.copy()

In [None]:
data['Polarity']=polarity

In [None]:
data.head()

# Perform EDA for the Positive Sentences

In [None]:
data_positive=data[data['Polarity']>0]

In [None]:
data_positive.shape

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
stopwards=set(STOPWORDS)

In [None]:
data_positive.head()

In [None]:
total_text= (' '.join(data_positive['Summary']))

In [None]:
len(total_text)

In [None]:
total_text[0:10000]

In [None]:
import re
total_text=re.sub('[^a-zA-Z]',' ',total_text)

In [None]:
total_text[0:20000]

In [None]:
## remove extra spaces
total_text=re.sub(' +',' ',total_text)

In [None]:
total_text[0:10000]

In [None]:
len(total_text)

In [None]:
# Build the word cloud from the cleaned positive-summary text, then draw it
# on a wide figure with the axes hidden.
wordcloud = WordCloud(height=500, width=1000)
wordcloud = wordcloud.generate(total_text)  # generate() hands back the same WordCloud

plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis('off')

# Perform EDA for the Negative Sentences

In [None]:
data_negative = data[data['Polarity']<0]
data_negative.shape

In [None]:
data_negative.head()

In [None]:
total_negative= (' '.join(data_negative['Summary']))

In [None]:
total_negative

In [None]:
total_negative=re.sub('[^a-zA-Z]',' ',total_negative)

In [None]:
len(total_negative)

In [None]:
total_negative

In [None]:
total_negative=re.sub(' +',' ',total_negative)

In [None]:
len(total_negative)

In [None]:
# Build the word cloud from the cleaned negative-summary text, then draw it
# on a wide figure with the axes hidden.
wordcloud = WordCloud(height=500, width=1000)
wordcloud = wordcloud.generate(total_negative)  # generate() hands back the same WordCloud

plt.figure(figsize=(15, 5))
plt.imshow(wordcloud)
plt.axis('off')

# Analyse which Amazon users we can recommend more products to

In [None]:
df.head()

In [None]:
df['UserId'].nunique()

In [None]:
# One row per UserId: non-null counts of Summary, Text and ProductId plus the
# mean Score, ordered so the users with the most review texts come first.
per_user_aggregations = {
    'Summary': 'count',
    'Text': 'count',
    'Score': 'mean',
    'ProductId': 'count',
}
raw = (
    df.groupby(['UserId'])
    .agg(per_user_aggregations)
    .sort_values(by='Text', ascending=False)
)
raw

In [None]:
raw.columns=['Number_of_summaries','num_text','Avg_score','Number_of_products_purchased']
raw

In [None]:
# Bar chart for the first ten rows of `raw` (one bar per user id).
user_10 = raw.index[:10]
# .iloc makes it explicit that the first ten rows are taken by position.
number_10 = raw['Number_of_products_purchased'].iloc[:10]

# No series label is passed: no legend is drawn on this figure, so a label
# would never be displayed.
plt.bar(user_10, number_10)
plt.xlabel('User_Id')
plt.ylabel('Number of Products Purchased')
plt.xticks(rotation='vertical')

**These are the top 10 users, so we can recommend more products to these user IDs, as there is a high probability that these people are going to buy more.**

In [None]:
## picking a random sample
# NOTE(review): no random_state is passed, so the 2000 rows drawn differ from
# run to run. The next cell reassigns `final`, so this sample is not what the
# rest of the notebook works on.
final=df.sample(n=2000)

In [None]:
# Work on the first 2000 rows. Take an explicit copy: later cells add and
# overwrite columns on `final`, and doing that on a plain slice of `df`
# triggers SettingWithCopyWarning and ambiguous view-vs-copy behaviour.
final = df[0:2000].copy()

### Checking missing values in dataset

In [None]:
final.isna().sum()

### Removing the Duplicates if any

In [None]:
final.duplicated().sum()

# Analyse the Length of Comments: do customers give lengthy comments or short ones?

In [None]:
final.head()

In [None]:
len(final['Text'][0].split(' '))

In [None]:
final['Text'][0]

In [None]:
def calc_len(text):
    """Return the number of whitespace-separated words in `text`.

    Splitting on any run of whitespace (instead of on single spaces) keeps
    doubled spaces from being counted as empty "words", treats tabs and
    newlines as separators, and gives 0 for an empty string.
    """
    return len(text.split())

In [None]:
final['Text_length']=final['Text'].apply(calc_len)

In [None]:
import plotly.express as px

In [None]:
px.box(final, y="Text_length")

**It seems that almost 50 percent of users limit their feedback to 50 words, whereas only a few users give lengthy feedback.**

# Analyzing Score

In [None]:
# Count of reviews per Score value. The column is passed by keyword: a bare
# positional argument is deprecated in seaborn 0.11 and is interpreted as
# `data` (not `x`) from seaborn 0.12 onwards.
sns.countplot(x='Score', data=final, palette="plasma")

# Text Pre-Processing

In [None]:
final.head()

In [None]:
final['Text'] =final['Text'].str.lower()
final.head(10)

In [None]:
final['Text'][164]

In [None]:
re.sub('[^a-zA-Z]',' ',final['Text'][164])

**The drawback of this re.sub in this use case is that it also removes numerical data, and those numerical values may matter a lot.
That's why I am going to create my own logic here that removes only the special characters.**

### Logic to remove punctuations or all the special characters

In [None]:
# define punctuation
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

data= final['Text'][164]

# remove punctuation from the string
no_punct = ""
for char in data:
    if char not in punctuations:
        no_punct = no_punct + char

# display the unpunctuated string
no_punct

### Create function to remove punctuations in your review

In [None]:
def remove_punc(review):
    """Return `review` with every character found in string.punctuation dropped."""
    import string

    kept_chars = [ch for ch in review if ch not in string.punctuation]
    return "".join(kept_chars)

In [None]:
final['Text'] =final['Text'].apply(remove_punc)

In [None]:
final.head()

In [None]:
final['Text'][164]

### Removal of Stopwords

In [None]:
import re
from nltk.corpus import stopwords

In [None]:
review='seriously this product was as tasteless as they come there are much better tasting products out there but at 100 calories its better than a special k bar or cookie snack pack you just have to season it or combine it with something else to share the flavor'

In [None]:
# Drop English stopwords from the sample review, then rebuild the sentence by
# hand. Results are kept in descriptive names so that neither the `re` module
# nor the builtin `str` is rebound, and the stopword set is built once
# rather than once per word.
english_stopwords = set(stopwords.words('english'))
kept_words = [word for word in review.split(' ') if word not in english_stopwords]
filtered_review = ''
for wd in kept_words:
    filtered_review = filtered_review + wd
    filtered_review = filtered_review + ' '
filtered_review

### Using join to convert list into string

In [None]:
# Same stopword filtering, joined back into a sentence with str.join.
# The word list gets its own name so the `re` module is not rebound, and the
# stopword set is built once rather than once per word.
english_stopwords = set(stopwords.words('english'))
kept_words = [word for word in review.split(' ') if word not in english_stopwords]
' '.join(kept_words)

### Let's find out whether there are any hyperlinks in the data

In [None]:
final.head()

In [None]:
final['Text'].str.contains('http').sum()

In [None]:
# Count reviews containing "http" or "https". As a regex, 'http?' means
# "htt" followed by an optional "p", so it also matched a bare "htt".
final['Text'].str.contains(r'https?', regex=True).sum()

In [None]:
# Raise the display limit so the boolean flag for every row of `final` is shown.
pd.set_option('display.max_rows',2000)
# 'https?' matches "http" or "https"; the regex 'http?' would instead match
# "htt" followed by an optional "p".
final['Text'].str.contains(r'https?', regex=True)

**We can observe that there are some URLs in the data. This is a kind of dirtiness in the data, so we have to clean it and make the data ready for analysis.**

In [None]:
review=final['Text'][21]
review

### Removal of urls

In [None]:
import re

In [None]:
# Try the URL-stripping pattern on the sample review: it removes the literal
# "href" and any "http" followed by one arbitrary character and word characters.
url_pattern = re.compile(r'href|http.\w+')
review_without_urls = url_pattern.sub(r'', review)
review_without_urls

In [None]:
def remove_urls(review):
    """Return `review` with URL remnants deleted.

    Removes every literal "href" and every "http" that is followed by one
    arbitrary character and a run of word characters.
    """
    return re.sub(r'href|http.\w+', r'', review)

In [None]:
final['Text'] = final['Text'].apply(remove_urls)

In [None]:
final.head()

In [None]:
final['Text'].str.contains('http').sum()

In [None]:
final['Text'][34]

**We can see that there are lots of "br" fragments in the data, so let me remove "br" wherever it occurs.**

In [None]:
final['Text'][34].replace('br','')

In [None]:
# Delete every literal "br" from the review text in one vectorised pass.
# This replaces a per-row loop that assigned through chained indexing
# (final['Text'][i] = ...), which raises SettingWithCopyWarning, is not
# guaranteed to write back to `final`, and only works with a 0..n-1 index.
# NOTE: as with the plain str.replace it mirrors, "br" inside ordinary words
# is removed as well.
final['Text'] = final['Text'].str.replace('br', '', regex=False)

In [None]:
data2=final.copy()

In [None]:
data2['Text'][34]

In [None]:
data2.shape

In [None]:
data2.dtypes

In [None]:
# Stopword set from the wordcloud package, passed to WordCloud below.
# NOTE(review): this rebinds `stopwords`, which the "Removal of Stopwords"
# section imported from nltk.corpus; cells that call stopwords.words(...)
# will fail if they are re-run after this one.
stopwords = set(STOPWORDS) 

In [None]:
data2.head()

In [None]:
# Lower-case every whitespace-separated token of every review and concatenate
# them into one string: tokens are separated by single spaces and each review
# is followed by a space. Built with one join instead of repeatedly appending
# to a growing string, which copies the whole string on every review.
comment_words = ''.join(
    ' '.join(token.lower() for token in val.split()) + ' '
    for val in data2['Text']
)

In [None]:
# Square, white-background word cloud over all cleaned review text, with the
# `stopwords` set excluded and very small words suppressed.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud = wordcloud.generate(comment_words)  # generate() hands back the same WordCloud

In [None]:
plt.figure(figsize = (10, 10)) 
plt.imshow(wordcloud) 
plt.axis("off") 

# Thanks to all