## Importing Data

In [None]:
import pandas as pd 
import random

# Full-dataset load (slow), kept for reference:
# data = pd.read_csv("/kaggle/input/boardgamegeek-reviews/bgg-13m-reviews.csv")

# Fraction (not a percentage) of rows to keep: 0.01 keeps roughly 1% of the rows.
# Used to reduce execution time; set it to 1 to load every row.
subset_percent = 0.01

# Seed the sampler so the same rows are drawn on every Restart & Run All.
# Without this, each run trains and evaluates on a different random subset.
random.seed(42)

# Row 0 is the header and is always kept; every other row is skipped with
# probability (1 - subset_percent).
data = pd.read_csv(
    "/kaggle/input/boardgamegeek-reviews/bgg-13m-reviews.csv",
    skiprows=lambda i: i > 0 and random.random() > subset_percent,
    header=0,
)
data.head()


## Finding count of missing values

In [None]:
# Per-column count of missing entries in the loaded frame
data.isna().sum()

### Dropping missing values and extra columns

In [None]:
# Keep only the rows that have review text, then remove the columns that are
# not used for modelling.
# The result is reassigned instead of mutated in place, and columns that are
# already gone are ignored, so this cell can be re-run safely. The original
# in-place version raised KeyError on a second execution because the columns
# had already been dropped.
data = (
    data
    .dropna(subset=['comment'])
    .drop(columns=['user', 'ID', 'name'], errors='ignore')
)
data.head()

# Preprocessing data

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split stable.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

# Plain bag-of-words counts. Stop-word removal and stemming are deliberately
# skipped because the reviews are written in several languages:
#   accuracy with stop-word removal + stemming: ~0.30
#   accuracy without either:                    ~0.32
vectorizer = CountVectorizer()

# Learn the vocabulary from the training comments only, then reuse it for the test set.
train_features = vectorizer.fit_transform(data_train['comment'])
test_features = vectorizer.transform(data_test['comment'])

# Ratings are rounded to whole numbers so that they can serve as class labels.
train_label = list(map(round, data_train['rating']))
test_label = list(map(round, data_test['rating']))


# The code below was used to reduce the number of classes. Accuracy was good when there
# were only 2 classes (positive and negative), but since that was not the goal, it was removed.

# for i in range(len(test_label)):
#     if test_label[i] > 5:
#         test_label[i] = 1
#     else:
#         test_label[i] = 0

# for i in range(len(train_label)):
#     if train_label[i] > 5:
#         train_label[i] = 1
#     else:
#         train_label[i] = 0

# Finding the right model

In [None]:
# Display names of the classifiers, in the order their scores are appended to
# `algo_scores` by the cells below; both lists feed the comparison bar chart.
# (Fixed the misspelled "MulitnomialNB" label that was shown on the chart.)
algos = ["MultinomialNB", "KNN", "MLP", "SVM"]

# Test-set accuracy of each classifier, filled in as each model is evaluated.
algo_scores = []

### Using Multinomial Naive Bayes
Naive Bayes is a strong baseline for text data, so it is my first choice. Of the different Naive Bayes variants, Multinomial Naive Bayes is designed for count features such as the word counts produced by `CountVectorizer`, and it handles multiple classes. Our dataset has 11 classes (ratings 0 - 10), so it is the most appropriate variant.

In [None]:
# Fit Multinomial Naive Bayes with smoothing alpha=1 on the training features
# (fit returns the estimator itself, so construction and fitting are chained).
nb = MultinomialNB(alpha=1).fit(train_features, train_label)

# Mean accuracy on the held-out test split, recorded for the model comparison.
score = nb.score(test_features, test_label)
algo_scores.append(score)

print(score)

### Using KNN Algorithm
KNN (K Nearest Neighbors) is a simple algorithm that classifies a sample based on the labels of its nearest neighbors.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-nearest-neighbours classifier on the training features.
neigh = KNeighborsClassifier(n_neighbors=5).fit(train_features, train_label)

# Mean accuracy on the held-out test split, recorded for the model comparison.
knnscore = neigh.score(test_features, test_label)
algo_scores.append(knnscore)

print(knnscore)


### Using MultiLayer Perceptron Classifier
Multilayer Perceptron Classifier is a type of artificial neural network. ANNs are used to solve complex problems. Since our dataset is complex, it is worth giving a try.

In [None]:
from sklearn.neural_network import MLPClassifier
# Multilayer perceptron trained with the Adam solver.
# max_iter was raised to 500 because 200 iterations was not enough.
clf = MLPClassifier(
    solver='adam',
    alpha=1e-4,
    random_state=1,
    max_iter=500,
).fit(train_features, train_label)

# Mean accuracy on the held-out test split, recorded for the model comparison.
score = clf.score(test_features, test_label)
algo_scores.append(score)

print(score)


### Using SVM
SVM is one of my favorite algorithms. It works by separating the classes with a hyperplane. It gives considerably good results in many use cases, so it is worth a try.

In [None]:
from sklearn import svm
# Support vector classifier with scikit-learn's default settings.
clf = svm.SVC().fit(train_features, train_label)

# Mean accuracy on the held-out test split, recorded for the model comparison.
score = clf.score(test_features, test_label)
algo_scores.append(score)

print(score)


In [None]:
# Imported here because this cell runs before the notebook's later
# `import matplotlib.pyplot as plt`; on a fresh kernel the original cell
# failed with "NameError: name 'plt' is not defined".
import matplotlib.pyplot as plt

# Bar chart comparing the test accuracy of each classifier. Axis labels and a
# title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(algos, algo_scores)
ax.set_xlabel('Classifier')
ax.set_ylabel('Test accuracy')
ax.set_title('Test accuracy by classifier')
plt.show()

Since MultinomialNB gives the best result, we proceed with it for hyperparameter tuning.

In [None]:

# Below accuracy was calculated by relaxing the output class by 1.
# For ex. if the predicted value was 4 but the actual value was 3, then too it was considered correct.
# Accuracy almost doubles when calculated in this way

# Predicted labels for the held-out test features, from the Naive Bayes model fitted earlier
predictions = nb.predict(test_features)

def custom_accuracy(preds, actual):
    """Return the fraction of predictions within one rating point of the truth.

    A prediction counts as correct when it differs from the actual label by
    at most 1 in either direction (e.g. predicting 4 when the actual rating
    is 3 or 5).

    The original condition was ``abs(preds[i] - actual[i] < 1)``: the closing
    parenthesis was misplaced, so ``abs`` was applied to a boolean and every
    under-prediction (however far off) was counted as correct.

    Args:
        preds: Indexable sequence of predicted integer labels.
        actual: Indexable sequence of true labels, at least as long as ``preds``.

    Returns:
        Share of the ``len(preds)`` predictions that fall within +-1 of the
        matching entry in ``actual``, as a float in [0, 1]. Returns 0.0 for
        empty input instead of dividing by zero.
    """
    n = len(preds)
    if n == 0:
        return 0.0
    hits = sum(1 for i in range(n) if abs(preds[i] - actual[i]) <= 1)
    return hits / n

# Report the relaxed ("custom") accuracy of the Naive Bayes predictions on the test split
print(f"accuracy s {custom_accuracy(predictions, test_label)}")


### Hyper parameter tuning for MultinomialNB

Finding out the appropriate alpha parameter (smoothing value)

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Candidate smoothing values: 1, 6, 11, ..., 46
alpha_range = list(np.arange(1, 50, 5))

# Mean 5-fold cross-validated accuracy for each candidate, in the same order as alpha_range
alpha_scores = []

for a in alpha_range:
    clf = MultinomialNB(alpha=a)
    scores = cross_val_score(clf, train_features, train_label, scoring='accuracy', cv=5)
    mean_accuracy = scores.mean()
    alpha_scores.append(mean_accuracy)
    print(a, mean_accuracy)


In [None]:
import matplotlib.pyplot as plt

# Misclassification error (1 - accuracy) for every alpha that was tried.
# Despite its name, MSE holds the misclassification error, not a mean squared error.
MSE = [1 - mean_accuracy for mean_accuracy in alpha_scores]

# Alpha with the lowest error; the first one wins if there is a tie.
best_index = MSE.index(min(MSE))
optimal_alpha_bnb = alpha_range[best_index]

# Plot misclassification error against alpha
plt.plot(alpha_range, MSE)
plt.xlabel('hyperparameter alpha')
plt.ylabel('Misclassification Error')
plt.show()


The error is least for alpha = 1.

We can conclude that for alpha = 1, it gives the best result for this particular dataset.

### Visuals and parameters of the model

Confusion Matrix

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Confusion matrix of true test labels (rows) against the predicted labels (columns)
cm_test = confusion_matrix(y_true=test_label, y_pred=predictions)

# Annotate every cell of the heatmap with its integer count
sns.heatmap(cm_test, fmt='d', annot=True)


Sample count in each class

In [None]:
# Number of training samples seen for each class label by the fitted Naive Bayes model
nb.class_count_

## Predicting rating for a sample review

In [None]:
# Vectorize the sample review once with the fitted vocabulary, then show the
# predicted rating followed by the per-class probabilities.
test_review = "with in this is it"
review_features = vectorizer.transform([test_review])

print(nb.predict(review_features))
print(nb.predict_proba(review_features))

## Training with 100% of data to get better results when deployed


In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# Refit the vectorizer and the model on every row of `data` (no train/test split).
# NOTE: `data` is the frame loaded in the first code cell; with subset_percent = 0.01
# it is only a ~1% sample of the file, so set subset_percent to 1 there for a true
# full-data fit.
# NOTE: this overwrites `vectorizer`, `train_features`, `train_label` and `nb` from the
# evaluation cells above, so do not re-run those cells after this one without
# restarting the kernel.
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(data['comment'])

# Ratings are rounded to whole numbers so that they can serve as class labels.
train_label = list(map(round, data['rating']))

nb = MultinomialNB(alpha=1).fit(train_features, train_label)

# Smoke test: predicted rating and per-class probabilities for one sample review
test_review = "This is a sample review"
sample_features = vectorizer.transform([test_review])
print(nb.predict(sample_features))
print(nb.predict_proba(sample_features))


## Exporting model and vectorizer to deploy

In [None]:
import pickle
# Serialize the fitted model and the fitted vectorizer so the deployed app can load both.
for path, artifact in (
    ('/kaggle/working/model.pk', nb),
    ('/kaggle/working/vect.pk', vectorizer),
):
    with open(path, 'wb') as file:
        pickle.dump(artifact, file)



### Challenges faced:

* Due to the large dataset, execution took too long. Solved it by writing a piece of code that reads only a random part of the data.
* The dataset contained many missing values, i.e. 79.9%. All of them were removed, which affected accuracy.
* The dataset contained reviews in different languages. Hence I neither removed stopwords nor used stemming, which improved accuracy by 2%.
* Accuracy was too low because there are many classes to predict from. For example, a "Good game" review can have a rating anywhere from 7 to 10. Hence, a "custom" accuracy was calculated which relaxed the rule a little (+- 1). Accuracy increased by ~30%.

## References:

* Official scikit learn documentation https://scikit-learn.org/ (Examples were referred of different classifiers)
* https://github.com/krsatyam1996/IMDB-sentiment-analysis-using-naive-bayes/blob/master/movie_review.ipynb (Used many different classifiers other than NB and many modifications with different parameters)
* https://blog.cambridgespark.com/deploying-a-machine-learning-model-to-the-web-725688b851c7 (Modified UI)


# Important links

The model is deployed at https://saeedjassani.pythonanywhere.com/

Blog post can be found at https://saeedjassani.uta.cloud/rating-predictor.html

Kaggle notebook link at https://www.kaggle.com/saeedjassani/ratings-predictor

Demo video at https://youtu.be/T4vVXt-W9vY

Please feel free to play with it. Any suggestions are welcomed at saeedjassani@gmail.com