In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix

import random
import string
import matplotlib.pyplot as plt
import seaborn as sns

# for suppressing warnings
import warnings
# No category, module or message filter is passed, so this hides every warning
# raised after this point in the session, not just a specific noisy one.
warnings.filterwarnings("ignore")

### Reading the Dataset

Reading and pre-processing the whole dataset was too computationally expensive, so only a random subset of the dataset is loaded.


In [None]:
# Probability of keeping each data row; the file is too large to load and pre-process in full.
SAMPLE_FRACTION = 0.01
# Seed for the row sampler so that "Restart & Run All" loads the same subset every time.
SAMPLE_SEED = 40

# The skiprows callable draws from the global `random` generator once per row,
# so seeding it here makes the subsample (and every score derived from it) repeatable.
random.seed(SAMPLE_SEED)

# Row 0 is the header and is always kept; every other row is skipped unless its
# random draw falls within SAMPLE_FRACTION.
bgg_reviews = pd.read_csv(
    "/kaggle/input/boardgamegeek-reviews/bgg-15m-reviews.csv",
    skiprows=lambda i: i > 0 and random.random() > SAMPLE_FRACTION,
    index_col=0,
)
bgg_reviews.head()

### Finding missing/NaN values and Dropping redundant rows and columns

In [None]:
# Keep only rows that have comment text and discard the columns not used below.
# The frame is rebound instead of mutated in place, and only columns that are
# still present are dropped, so re-running this cell does not raise a KeyError.
redundant_columns = [col for col in ('user', 'ID', 'name') if col in bgg_reviews.columns]
bgg_reviews = bgg_reviews.dropna(subset=['comment']).drop(columns=redundant_columns)
bgg_reviews.head()

### Pre-processing Data
1. Lower-Casing all text
2. Removing Punctuation
3. Removing Stopwords

In [None]:
# Lowercase every comment and strip ASCII punctuation with a translation table
# (one pass per string instead of a Python-level loop over every character).
punctuation_table = str.maketrans('', '', string.punctuation)
bgg_reviews['cleaned'] = bgg_reviews['comment'].str.lower().str.translate(punctuation_table)

# English stopwords from NLTK plus frequent domain words.
# Bound to `stop_words` so the imported `stopwords` corpus is not shadowed
# (the cell can be re-run), and stored as a set so each membership test is O(1).
stop_words = set(stopwords.words('english'))
stop_words.update(('game', 'play', 'played', 'players', 'player', 'people', 'really',
                   'board', 'games', 'one', 'plays', 'cards', 'would'))

# Drop stopwords and re-join the remaining tokens with single spaces.
bgg_reviews['cleaned'] = bgg_reviews['cleaned'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words)
)
bgg_reviews.head()

### Top 10 most common words post Pre-processing

In [None]:
# Ten most frequent tokens across all cleaned reviews. Tokens are counted
# review by review rather than by first joining the whole corpus into one string;
# most_common(10) already returns at most ten entries, so no extra slice is needed.
Counter(
    word
    for text in bgg_reviews["cleaned"]
    for word in text.split()
).most_common(10)

### Plotting Ratings vs respective Count

In [None]:
# Histogram of the raw ratings (matplotlib's default binning), drawn on an
# explicit Axes and titled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(bgg_reviews['rating'], facecolor='salmon', alpha=0.9)
ax.set(xlabel='Ratings', ylabel='Count', title='Distribution of Ratings')
plt.show()

It can be observed that the data is unbalanced: the number of reviews varies from rating to rating, with 7 being the most frequent rating.

### Visualisation of Top 100 most common words

In [None]:
# Reviews with a rating above 8 are treated as positive
pos_review_word = bgg_reviews[bgg_reviews['rating'] > 8]

# Token frequencies over the cleaned text of the positive reviews
words = Counter(
    token
    for text in pos_review_word['cleaned']
    for token in text.split()
)

# Render only the 100 most frequent tokens
top_words = dict(words.most_common(100))
wc = WordCloud(width=400, height=350).generate_from_frequencies(top_words)

plt.figure(figsize=(15, 10))
plt.imshow(wc, interpolation='bilinear')
plt.title('Positive Review Words', fontsize=20)
plt.axis('off')
plt.show()

In [None]:
# Reviews with a rating below 3 are treated as negative
neg_review_word = bgg_reviews[bgg_reviews['rating'] < 3]

# Token frequencies over the cleaned text of the negative reviews
words = Counter(
    token
    for text in neg_review_word['cleaned']
    for token in text.split()
)

# Render only the 100 most frequent tokens
top_words = dict(words.most_common(100))
wc = WordCloud(width=400, height=350).generate_from_frequencies(top_words)

plt.figure(figsize=(15, 10))
plt.imshow(wc, interpolation='bilinear')
plt.title('Negative Review Words', fontsize=20)
plt.axis('off')
plt.show()

### Splitting the Dataset into 80/20 Train/Test
`CountVectorizer` is used because it turns the text into a matrix in which each word (called a "token" in NLP) is a column and each value is the number of occurrences of that word in a review.
Also, for easier computation, the ratings are rounded to whole numbers.

In [None]:
# 80/20 split with a fixed random_state so the partition is repeatable
bgg_reviews_train, bgg_reviews_test = train_test_split(
    bgg_reviews,
    test_size=0.2,
    random_state=40,
)

# The vocabulary is learned from the training comments only; the test comments
# are merely projected onto it.
# NOTE(review): features are built from the raw 'comment' column, not the
# 'cleaned' column produced earlier — confirm this is intended.
count_vectorizer = CountVectorizer()
train_review = count_vectorizer.fit_transform(bgg_reviews_train['comment'])
test_review = count_vectorizer.transform(bgg_reviews_test['comment'])

# Ratings rounded with the built-in round() become the integer class labels
train_tag = list(map(round, bgg_reviews_train['rating']))
test_tag = list(map(round, bgg_reviews_test['rating']))

### Trying different Classifiers
#### Multinomial Naive Bayes
This classifier is used because the reviews have to be assigned to one of multiple rating classes.

In [None]:
# Train Multinomial Naive Bayes (smoothing parameter alpha=1) on the count features;
# fit() returns the fitted estimator, so it can be bound directly.
mnb = MultinomialNB(alpha=1).fit(train_review, train_tag)

# Score on the held-out split
score = mnb.score(test_review, test_tag)
print("Score for Multinomial Naive Bayes:", score)

#### Support Vector Machine
SVM is used because its decision function relies only on a subset of the training points (called support vectors), which also makes it memory efficient.

In [None]:
# Train a support vector classifier with scikit-learn's default settings;
# fit() returns the fitted estimator, so it can be bound directly.
support_vector = svm.SVC().fit(train_review, train_tag)

# Score on the held-out split
score = support_vector.score(test_review, test_tag)
print("Score for Support Vector Machine:", score)

#### K-Nearest Neighbour

In [None]:
# Train a 5-nearest-neighbour classifier on the count features;
# fit() returns the fitted estimator, so it can be bound directly.
knn = KNeighborsClassifier(n_neighbors=5).fit(train_review, train_tag)

# Score on the held-out split
score = knn.score(test_review, test_tag)
print("Score for K-Nearest Neighbour:", score)

#### Multinomial Naive Bayes gives the best score, with SVM a very close second
For now, we will move forward with MNB and tune its hyperparameter using cross-validation

In [None]:
# Candidate smoothing values: 1, 6, 11, 16, 21, 26
alpha_arr = list(np.arange(1, 30, 5))
scores_arr = []

# 5-fold cross-validated accuracy on the training split for each candidate alpha
for alpha in alpha_arr:
    fold_scores = cross_val_score(
        MultinomialNB(alpha=alpha),
        train_review,
        train_tag,
        cv=5,
        scoring='accuracy',
    )
    mean_score = fold_scores.mean()
    scores_arr.append(mean_score)
    print("Alpha value and respective score:", alpha, mean_score)

It can be observed that the cross-validation score is highest when alpha equals 1, which is the value whose test score we have already computed above

#### Visualising Predictions via Confusion Matrix

In [None]:
predictions = mnb.predict(test_review)

# Sorted set of every rating that occurs in the true or predicted labels. This is
# the same label set confusion_matrix infers by default; naming it lets the heatmap
# ticks show the real rating values instead of positional indices 0..n-1.
rating_labels = np.union1d(test_tag, predictions).tolist()
cmatrix = confusion_matrix(test_tag, predictions, labels=rating_labels)

# Rows of the matrix are the true ratings, columns are the predicted ratings.
ax = sns.heatmap(
    cmatrix,
    annot=True,
    fmt='d',
    xticklabels=rating_labels,
    yticklabels=rating_labels,
)
ax.set(xlabel='Predicted rating', ylabel='Actual rating',
       title='Confusion Matrix: Multinomial Naive Bayes')
plt.show()

### Testing Reviews

In [None]:
# A distinct name is used for the sample text so the sparse `test_review` matrix
# needed by the scoring and confusion-matrix cells is not overwritten by a string.
sample_review = "good game"

# Vectorise once and reuse the features for both the label and the probabilities.
sample_features = count_vectorizer.transform([sample_review])
print("Rating for the Review:", mnb.predict(sample_features))
print("Prediction Probability:\n", mnb.predict_proba(sample_features))