In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
#Get label encoder
#One shared LabelEncoder instance; the data preparation cell calls le.fit_transform
#to turn text categories into integer codes.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [None]:
#Import dataset
#Reads the Excel workbook into the DataFrame `useData`. The path is relative, so the
#file has to be in the working directory the notebook is run from.
useData=pd.read_excel('AdvancedBDAnalytics_2020_S1_A2_Data.xlsx')

In [None]:
##Function to check the balance of Helpfulness_Label
#Prints number and percentage of each Label
def check_help_balance(dataframe):
    """Print the HELPFUL / UNHELPFUL row counts and their share of all rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Helpfulness_Label' column; rows are counted where the
        value equals the string 'HELPFUL' or 'UNHELPFUL'.

    Returns
    -------
    None
        The summary is written to stdout only.

    Notes
    -----
    Percentages are relative to ``len(dataframe)``, so rows carrying any other
    value lower both figures. An empty frame reports 0.00% for both labels
    instead of raising ZeroDivisionError.
    """
    total_rows = len(dataframe)
    label_column = dataframe['Helpfulness_Label']

    # Count each label once and reuse the result for both the count and the ratio lines
    helpful_count = int((label_column == 'HELPFUL').sum())
    unhelpful_count = int((label_column == 'UNHELPFUL').sum())

    def as_percent(count):
        # Guard the division so an empty frame prints zeros rather than crashing
        return round((count / total_rows) * 100, 2) if total_rows else 0.0

    print('Number of HELPFUL: ', helpful_count)
    print('Number of UNHELPFUL: ', unhelpful_count)
    print('The ratio of the dataset is (HELPFUL / UNHELPFUL): %.2f%% / %.2f%%' %
          (as_percent(helpful_count), as_percent(unhelpful_count)))
    print("")

##Function to check the balance of Total_Reads
#Prints number and percentage of zero-read and greater-than-zero-read rows
def check_read_balance(dataframe):
    """Print how many rows have zero reads versus more than zero reads.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a numeric 'Total_Reads' column.

    Returns
    -------
    None
        The summary is written to stdout only.

    Notes
    -----
    Percentages are relative to ``len(dataframe)``; rows that are neither
    ``== 0`` nor ``> 0`` (negative or missing values) count towards the total
    only. An empty frame reports 0.00% for both groups instead of raising
    ZeroDivisionError.
    """
    total_rows = len(dataframe)
    reads = dataframe['Total_Reads']

    # Count each group once and reuse the result for both the count and the ratio lines
    zero_count = int((reads == 0).sum())
    positive_count = int((reads > 0).sum())

    def as_percent(count):
        # Guard the division so an empty frame prints zeros rather than crashing
        return round((count / total_rows) * 100, 2) if total_rows else 0.0

    print('Number of 0 reads: ', zero_count)
    print('Number of greater-than 0 reads: ', positive_count)
    print('The ratio of the dataset is (zero / greater-than zero): %.2f%% / %.2f%%' %
          (as_percent(zero_count), as_percent(positive_count)))
    print("")


In [None]:
##Data Exploration
#ANSI escape sequences used for every heading below: blue + bold + underline, then reset
heading_on = '\033[34m' + '\033[1m' + '\033[4m'
heading_off = '\033[0m'

#Check Dimensions of data
print(heading_on + "The shape of the inital dataset is (Rows, Columns):\n" + heading_off + str(useData.shape))
print("")

#Check data types
print(heading_on + "The data types of the inital dataset are:\n" + heading_off + str(useData.dtypes))
print("")

#Check for Null Values
#null_data keeps only the columns that contain at least one null, with their null counts
null_counts = useData.isnull().sum()
null_data = null_counts[null_counts > 0]
print(heading_on + "The columns and number of rows with null values in dataset:\n" + heading_off +
      str(list(null_data.index.values)) + '\n' + str(list(null_data)))

#Check for duplicates
#duplicated() flags every repeat of a Review_Text after its first occurrence
data_duplicates = useData[useData.duplicated('Review_Text')]
print(heading_on + "\nThe number of duplicate Review_Texts in dataset:\n" + heading_off +
      str(len(data_duplicates)))

#Check the balance of Class Target
print(heading_on + '\nBalance of classification target (Helpfulness_Label)' + heading_off)
check_help_balance(useData)

#Check the balance of Prediction Target
print(heading_on + '\nBalance of prediction target (Total_Reads)' + heading_off)
check_read_balance(useData)

In [None]:
#Explore noise in target label for prediction (Total_Reads)
import matplotlib.pyplot as plt
print('\033[34m' + '\033[1m' + '\033[4m' + '\nNoise of prediction target (Total_Reads)' + '\033[0m')

#Total_Reads by row position (blue) with a red reference line at 10 reads
plt.plot(range(0, len(useData)), useData['Total_Reads'], linewidth=2, linestyle="-", c="b")
plt.plot([0,len(useData)], [10,10], c="r")
plt.xlabel('Row Number')
plt.ylabel('Total Reads')

plt.show()
plt.close()
print('\033[1m' + 'Shape of Total_Reads >10 (Rows, Columns):' + '\033[0m' + \
      str(useData[useData['Total_Reads'] > 10].shape))
print('\033[1m' + 'Shape of Total_Reads <=10 (Rows, Columns):' + '\033[0m' + \
      str(useData[useData['Total_Reads'] <= 10].shape))

#Initialise Table
fig = plt.figure(dpi=80)
ax = fig.add_subplot(1,1,1)

#Setup Table_Data
table_data=[["Percentage of Data", "Highest Value in Percentage"]]
noise_data = useData['Total_Reads']

#Add one row per quantile from 0.90 to 1.00 inclusive.
#The steps come from integers (90..100) divided by 100 instead of repeatedly adding 0.01
#to a float: with accumulated rounding error the `<= 1` bound only includes the last row
#if the running sum lands on exactly 1.0, and quantile() is handed values such as
#0.9400000000000001 rather than 0.94.
for percent_step in range(90, 101):
    percents = percent_step / 100
    table_data.append([percents, round(noise_data.quantile(percents),2)])

#Plot table
table = ax.table(cellText=table_data, loc='center')
table.set_fontsize(14)
table.scale(1,4)
ax.axis('off')

In [None]:
#View correlation between the numeric columns of the dataset as loaded
#(this cell runs before the data preparation cell, so nothing has been cleaned yet).
#The numeric/bool columns are selected explicitly: from pandas 2.0 DataFrame.corr() no
#longer drops text columns silently and raises on them instead.
numeric_columns = useData.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True, fmt='.1g', vmin=-1, vmax=1, center= 0, cmap='Blues', cbar=False)

In [None]:
##Data preparation
#Rows with more reads than this are treated as noise and removed.
#NOTE(review): the exploration cell draws its reference line and prints counts for 10,
#while the filter uses 4 - confirm which threshold is intended.
MAX_TOTAL_READS = 4
#Seed for the shuffle so that row order (and everything derived from it) is reproducible
SHUFFLE_SEED = 42

#Remove columns that are not required
useData = useData.drop(["Review_ID", "Product_ID", "User_ID"], axis = 1)

#Remove Null Entries
useData = useData.dropna()

#Drop duplicates in dataset (the last occurrence of each Review_Text is kept)
useData = useData.drop_duplicates('Review_Text', keep='last')

#Transform Product_Category and Helpfulness_Label to numeric values (0=HELPFUL, 1=UNHELPFUL)
#The same encoder is refit for each column, so afterwards `le` only remembers the
#Product_Category classes. assign() returns a new frame rather than writing into a slice.
useData = useData.assign(
    Helpfulness_Label=le.fit_transform(useData['Helpfulness_Label']),
    Product_Category=le.fit_transform(useData['Product_Category']),
)

#Remove noise
useData = useData[useData['Total_Reads'] <= MAX_TOTAL_READS]

#Shuffle dataframe and reset indexs to match number of rows
#(later cells concatenate new columns by position and rely on this 0..n-1 index)
useData = useData.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
##Setup Sentimental Analysis
#Import and initialise VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

#Pull the two free-text columns out as plain Python lists
sentence_list = useData['Review_Text'].tolist()
summary_list = useData['Review_Summary'].tolist()

#Compound sentiment score for every review text and every review summary
sentimental_text_list = [
    analyser.polarity_scores(sentence)['compound'] for sentence in sentence_list
]
sentimental_summary_list = [
    analyser.polarity_scores(summary)['compound'] for summary in summary_list
]

#Convert to dataframe
sentimental_summary_df = pd.DataFrame(sentimental_summary_list, columns=['Sentimental_Summary_Score'])
sentimental_text_df = pd.DataFrame(sentimental_text_list, columns=['Sentimental_Text_Score'])

#Concatenate to main dataframe
#Column-wise concat aligns on the index: the score frames carry a default 0..n-1 index,
#which matches useData because it was re-indexed after shuffling
score_frames = [sentimental_summary_df, sentimental_text_df]
sentimentalData = pd.concat([useData] + score_frames, axis=1)

In [None]:
#Preview the first ten reviews side by side with their sentiment scores
preview_columns = ['Review_Summary','Sentimental_Summary_Score','Review_Text','Sentimental_Text_Score']
sentimentalData[preview_columns].head(10)

In [None]:
###Setup TF_IDF & Topic generation
#NOTE: word_tokenize and stopwords read NLTK data files (tokenizer models and the
#stopwords corpus); nothing in this notebook downloads them, so they must already be installed.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
stop_words = stopwords.words('english')
snowball_stemmer = SnowballStemmer('english')

#Set copy of the stop words: each token is tested for membership, and a set lookup
#avoids scanning the whole list every time
stop_word_set = set(stop_words)


def _without_stop_words(tokens):
    #Keep only the tokens that are not English stop words
    return [word for word in tokens if word not in stop_word_set]


def _stem_tokens(tokens):
    #Reduce every token to its Snowball stem
    return [snowball_stemmer.stem(word) for word in tokens]


def _keep_mid_length(tokens):
    #Keep tokens of 4 to 11 characters
    return [word for word in tokens if 3 < len(word) < 12]


##Bag of words
#Tokenize sentence on each row
tokenized_text=sentimentalData['Review_Text'].apply(word_tokenize)
tokenized_summary=sentimentalData['Review_Summary'].apply(word_tokenize)

#Convert tokens to lowercase
lower_text=tokenized_text.apply(lambda tokens: [word.lower() for word in tokens])
lower_summary=tokenized_summary.apply(lambda tokens: [word.lower() for word in tokens])
#The topic-model input starts from the same lowercased review tokens (it differs only in
#keeping the stop words), so the result is reused instead of being computed a second time
topic_text=lower_text

#Remove any stop words (not applied to the topic-model tokens)
filtered_text=lower_text.apply(_without_stop_words)
filtered_summary=lower_summary.apply(_without_stop_words)

#Implement stemming
filtered_stemized_text=filtered_text.apply(_stem_tokens)
topic_stemized_text=topic_text.apply(_stem_tokens)
filtered_stemized_summary=filtered_summary.apply(_stem_tokens)

#Remove words less than 4 chars or more than 11
filtered_length_text=filtered_stemized_text.apply(_keep_mid_length)
topic_length_text=topic_stemized_text.apply(_keep_mid_length)
filtered_length_summary=filtered_stemized_summary.apply(_keep_mid_length)

##Convert to token counts
#Initialise count vector
#The rows are already token lists, so the preprocessor and tokenizer are identity functions
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(lowercase=True, preprocessor = lambda x: x, tokenizer = lambda
x: x)

#Create term doc matrices
#The single vectorizer is refit on every call, so once this cell has run it holds the
#vocabulary of the LAST fit (the topic text). The topic-modelling cell reads its feature
#names from `vectorizer`, so keep the topic fit last.
text_vectorized=vectorizer.fit_transform(filtered_length_text).toarray()
summary_vectorized=vectorizer.fit_transform(filtered_length_summary).toarray()
topic_vectorized=vectorizer.fit_transform(topic_length_text).toarray()

In [None]:
##Find TF_IDF
#Initialise TF-TF_IDF converter
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()

#Perform TF_IDF on term doc matrices
#The converter is refit for each matrix in turn (review text first, then summary) and
#each weighted result is expanded to a dense array
TFIDF_text, TFIDF_summary = [
    tfidfconverter.fit_transform(term_counts).toarray()
    for term_counts in (text_vectorized, summary_vectorized)
]

In [None]:
#Show the (rows, columns) size of each TF-IDF matrix: review text first, then summary
for tfidf_matrix in (TFIDF_text, TFIDF_summary):
    print(tfidf_matrix.shape)

In [None]:
##Implement SVD
#Create titles for each SVD column ('SVD Text1'..'SVD TextN', 'SVD Summary1'..'SVD SummaryN')
n_components = 3
component_numbers = range(1, n_components + 1)
column_names = ['SVD Text' + str(number) for number in component_numbers]
summary_column_names = ['SVD Summary' + str(number) for number in component_numbers]

#Initialise SVD model
#random_state is fixed because the 'randomized' solver otherwise gives slightly different
#components on every run, which would change everything computed from them
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=n_components, algorithm='randomized', n_iter=100, random_state=42)

#Perform dimension reduction
#The same model object is refit for each matrix, so afterwards it holds the summary fit only
svd_text_vectorized=svd_model.fit_transform(TFIDF_text)
svd_summary_vectorized=svd_model.fit_transform(TFIDF_summary)

#Convert array to a dataframe
svd_text_df = pd.DataFrame(svd_text_vectorized, columns=column_names)
svd_summary_df = pd.DataFrame(svd_summary_vectorized, columns=summary_column_names)

#Add SVD to main dataframe
#Column-wise concat aligns on the index; the new frames carry a default 0..n-1 index
data_SVD= pd.concat([sentimentalData, svd_text_df, svd_summary_df], axis=1)

In [None]:
###Topic Modelling
#Initialise LDA model
from sklearn.decomposition import LatentDirichletAllocation
LDA_text = LatentDirichletAllocation(n_components=5, random_state=42)

##Fit term doc matrices into LDA
LDA_text.fit(topic_vectorized)

#Retrieve topic percentages from each row
text_topic_values = LDA_text.transform(topic_vectorized)

#Create new column and add topic number with highest percentage
#A copy is taken so that adding the column does not also modify data_SVD.
#Text_Topic is the 0-based argmax index, whereas the printout below numbers topics from 1.
data_topics = data_SVD.copy()
data_topics['Text_Topic'] = text_topic_values.argmax(axis=1)

#Look the vocabulary up once. `vectorizer` must still hold the fit on the topic text
#(the last fit_transform in the bag-of-words cell) for the indices to match.
#get_feature_names() was removed from newer scikit-learn releases in favour of
#get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_number, topic in enumerate(LDA_text.components_, start=1):
    print('Top 10 words for Text Topic #%s:' % topic_number)
    #argsort is ascending, so the last ten indices are the ten highest-weighted words
    print([str(feature_names[word_index]) for word_index in topic.argsort()[-10:]])
    print('\n')

In [None]:
def compare_clusters(cMethod, data, k_values=range(2, 11), **cluster_kwargs):
    """Print cluster-validation scores for a range of cluster counts.

    For every k in ``k_values`` a ``cMethod`` model is fitted on ``data`` (minus
    its two free-text columns) and scored with the Davies-Bouldin, Silhouette
    and Calinski-Harabasz metrics. The scores, rounded to 3 decimals, are
    printed as one table indexed by the number of clusters.

    Parameters
    ----------
    cMethod : callable
        Clustering class/factory that accepts ``n_clusters`` and ``max_iter``
        keyword arguments and exposes ``labels_`` after ``fit`` (for example
        ``sklearn.cluster.KMeans``).
    data : pandas.DataFrame
        Feature table. Must contain 'Review_Summary' and 'Review_Text', which
        are dropped before clustering; ``data`` itself is not modified.
    k_values : iterable of int, optional
        Cluster counts to evaluate. Defaults to 2..10.
    **cluster_kwargs
        Extra keyword arguments forwarded to ``cMethod`` (for example
        ``random_state=42``). Do not pass ``n_clusters`` or ``max_iter``.

    Returns
    -------
    None
        The score table is printed, not returned.
    """
    import warnings
    from sklearn import metrics

    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score and later
    # removed the old spelling; use whichever name this installation provides.
    calinski_score = getattr(metrics, "calinski_harabasz_score", None)
    if calinski_score is None:
        calinski_score = metrics.calinski_harabaz_score

    # Column titles are kept exactly as before so the printed table header is unchanged
    score_columns = ["Davies Bouldin Score", "Silhouette Score", "Calinski Harabaz Score"]

    #Remove Review Summary and Review_Text (No longer required)
    data_clustered = data.drop(["Review_Summary", "Review_Text"], axis = 1)

    # Materialise once: the values are used for the loop and again for the table index
    k_values = list(k_values)

    # One tuple of scores per k; the table is built once at the end
    # (DataFrame.append, used previously, no longer exists in pandas 2.x)
    score_rows = []
    for k in k_values:
        #Initialise clusters with n_clusters=loop value, fit, and read the labels
        clust = cMethod(n_clusters=k, max_iter=10000, **cluster_kwargs)
        labels = clust.fit(data_clustered).labels_

        # Davies-Bouldin can emit a runtime warning; silence warnings for the scoring
        # calls only. The context manager restores the caller's warning filters on exit.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            score_rows.append((
                round(metrics.davies_bouldin_score(data_clustered, labels), 3),
                round(metrics.silhouette_score(data_clustered, labels), 3),
                round(calinski_score(data_clustered, labels), 3),
            ))

    #Index the table by the number of clusters each row was computed for
    cluster_val = pd.DataFrame(score_rows, columns=score_columns,
                               index=pd.Index(k_values, name='Clusters'))
    print(cluster_val)

In [None]:
#Decide how many clusters to use
#Prints the validation scores for 2-10 KMeans clusters on the prepared data so that a
#cluster count can be chosen for the next cell.
from sklearn.cluster import KMeans
compare_clusters(KMeans, data_topics)

In [None]:
#Initialise cluster analysis
from sklearn.cluster import KMeans
chosen_clusters = 2
#random_state is fixed: KMeans starts from randomly chosen centroids, so without a seed
#the cluster numbering (and possibly the assignment itself) can change between runs
My_kmeans = KMeans(n_clusters=chosen_clusters, max_iter=10000, random_state=42)

#Remove Review Summary and Review_Text (No longer required and cannot be used in clustering)
data_clustered = data_topics.drop(["Review_Summary", "Review_Text"], axis = 1)

#Fit model
#fit() returns the estimator itself, so My_clustering and My_kmeans are the same object
My_clustering=My_kmeans.fit(data_clustered)
labels = My_clustering.labels_

#Set labels column
data_clustered['cluster_number']=labels

In [None]:
##Plot clusters
#NOTE: this figure illustrates two-cluster KMeans on synthetic make_blobs points; it does
#not plot the review data clustered in the previous cell.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

#Create blobs
data = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=1, random_state=1)
points = data[0]

#Cluster the blobs with a separate model. Calling fit_predict on My_clustering would
#refit the estimator that was trained on the review data and overwrite its cluster
#centres and labels with values learned from these synthetic points.
blob_kmeans = KMeans(n_clusters=2, max_iter=10000, random_state=1)
y_km = blob_kmeans.fit_predict(points)
centres = blob_kmeans.cluster_centers_

#Plot Cluster 0 (Red)
plt.scatter(points[y_km ==0,0], points[y_km == 0,1], s=100, c='r')
plotLabel2 = plt.annotate('c0', centres[0], horizontalalignment='center',
                 verticalalignment='center',
                 size=20, weight='bold',c='w')

#Plot Cluster 1(Black)
plt.scatter(points[y_km ==1,0], points[y_km == 1,1], s=100, c='k')
plotLabel1 = plt.annotate('c1', centres[1], horizontalalignment='center',
                 verticalalignment='center',
                 size=20, weight='bold',c='w')

In [None]:
#Send prepared data to excel
#Writes data_clustered (features plus the cluster_number column) to the working
#directory; index=False leaves the row index out of the sheet.
data_clustered.to_excel("BOYCE_Data_Preperation.xlsx", index=False)