# Loading important packages

In [None]:
import pandas as pd
import numpy as np
import pickle
import re
import nltk
import string
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize 
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from pandas import DataFrame

# Data Preprocessing

In [None]:
# Load the BoardGameGeek reviews CSV from the Kaggle input directory.
df2 = pd.read_csv('/kaggle/input/boardgamegeek-reviews/bgg-15m-reviews.csv')

## Dropping columns which are not required.

Use once

In [None]:
# Remove the columns that are not used further on in this notebook.
# `errors='ignore'` makes the cell safe to re-run: the original `del df2[...]`
# statements raised KeyError once the columns were already gone.
# `inplace=True` keeps the original in-place mutation of df2.
df2.drop(columns=['Unnamed: 0', 'ID', 'name', 'user'],
         errors='ignore', inplace=True)

## Dropping rows with NaN values.

In [None]:
# Drop every row that contains a NaN in any of the remaining columns.
df = df2.dropna()

In [None]:
# (rows, columns) left after dropping NaN rows.
df.shape

In [None]:
# Distinct rating values before any rounding.
df['rating'].unique()

In [None]:
# Round only the 'rating' column to zero decimal places; columns not named in
# `decimals` are left as they are.
# NOTE(review): pandas/NumPy rounding is expected to send exact .5 ties to the
# nearest even value — confirm this matches the intended bucketing.
decimals = pd.Series({'rating': 0})
df = df.round(decimals=decimals)

In [None]:
# Distinct rating values after rounding.
df['rating'].unique()

## Rounding the ratings to the nearest integer for better efficiency of the model

In [None]:
def rating_enhancement(rating):
    """Map a numeric rating to its nearest integer bucket (halves round up).

    Args:
        rating: Numeric rating value.

    Returns:
        An ``int`` from 1 to 10 when ``1 <= rating <= 10``; ``None`` for any
        other value (below 1, above 10, or NaN).
    """
    # Anything outside the closed range [1, 10] has no bucket. NaN fails
    # both comparisons and therefore also lands here.
    if not (1 <= rating <= 10):
        return None

    # Walk the buckets from the top; the first lower bound (bucket - 0.5)
    # that the rating reaches identifies its bucket.
    for bucket in range(10, 0, -1):
        if rating >= bucket - 0.5:
            return bucket
    return None

In [None]:
# Replace every rating with its integer bucket. rating_enhancement returns
# None for values outside [1, 10], which leaves a missing value in the column.
# The mapping is idempotent (a bucketed value maps to itself), so a single pass
# is enough; the original cell ran the identical pass twice.
df['rating'] = df['rating'].apply(rating_enhancement)

In [None]:
# Drop rows that contain a missing value after the rating mapping.
df = df.dropna()

In [None]:
# Distinct rating buckets that remain.
df['rating'].unique()

In [None]:
# (rows, columns) after removing rows without a rating bucket.
df.shape

In [None]:
# Display the DataFrame in its current state.
df

## Cleaning the comments

In [None]:
_COMMENT_STOP_WORDS = None  # built on first use by _comment_stop_words()


def _comment_stop_words():
    """Return the stop-word set used by clean_comments, building it once."""
    global _COMMENT_STOP_WORDS
    if _COMMENT_STOP_WORDS is None:
        words = set(stopwords.words('english'))
        # These words are deliberately kept in the text (presumably because
        # negations and intensifiers matter for the rating).
        words.difference_update({'no', 'not', 'nor', 'too', 'any'})
        # Extra tokens to discard: contraction fragments left by the
        # tokenizer and a few modal verbs.
        words.update({"'m", "n't", "'d", "'re", "'s",
                      'would', 'must', "'ve", "'ll", 'may'})
        _COMMENT_STOP_WORDS = frozenset(words)
    return _COMMENT_STOP_WORDS


def clean_comments(text):
    """Normalize a raw review comment into a space-separated token string.

    Steps: lowercase, strip HTML-like tags, expand common English
    contractions, blank out digits and hyphens, tokenize, remove stop words,
    replace punctuation with spaces and drop one-character tokens.

    Args:
        text: Raw comment as a ``str``.

    Returns:
        The cleaned comment: lowercase tokens joined by single spaces (an
        empty string when nothing survives the filtering).
    """
    # Lowercase first so the contraction patterns below also match
    # capitalised forms ("Won't", "Can't"). Previously the lowercasing ran
    # after the expansion, turning "Won't" into "wo not".
    text = text.lower()

    text = re.sub(r'<.*?>', ' ', text)
    # The specific contractions must be expanded before the generic "n't".
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'ll", " will", text)
    text = re.sub(r"'re", " are", text)

    text = re.sub(r"[0-9]+", ' ', text)
    text = re.sub(r"-", ' ', text)
    text = text.strip()

    # The stop-word set is cached: rebuilding it from the corpus on every
    # call is pure overhead when this runs once per row.
    stop_words = _comment_stop_words()
    text = ' '.join(w for w in word_tokenize(text) if w not in stop_words)

    # Turn every punctuation character (apostrophe included) into a space.
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans(filters, ' ' * len(filters)))

    # split()/join collapses whitespace runs; one-character tokens are dropped.
    return ' '.join(w for w in text.split() if len(w) > 1)

In [None]:
%%time
# Store the cleaned form of every comment in a new 'clean_comment' column.
df['clean_comment'] = df['comment'].apply(clean_comments)

In [None]:
# Count how often each token occurs across all cleaned comments.
freq_train1 = pd.Series(' '.join(df['clean_comment']).split()).value_counts()

# Tokens seen fewer than RARE_WORD_THRESHOLD times are treated as rare.
# The variable keeps its historical name ("less_five") because the cell that
# removes rare words refers to it; the cutoff actually applied is 10, and the
# printed message now states that value instead of claiming 5.
RARE_WORD_THRESHOLD = 10
less_five_freq_train1 = freq_train1[freq_train1 < RARE_WORD_THRESHOLD]
print('Words occurring fewer than {} times are: '.format(RARE_WORD_THRESHOLD))
print('')
print(less_five_freq_train1)

In [None]:
%%time
df['clean_comment'] = df['clean_comment'].apply(lambda x: ' '.join(x for x in x.split() if x not in less_five_freq_train1))

In [None]:
def NormalizeWithPOS(text):
    """Lemmatize each token according to its POS tag, then stem it.

    Args:
        text: Text to normalize, as a ``str``.

    Returns:
        The normalized tokens joined by single spaces.
    """
    # First letter of the tagger's tag -> POS code passed to the lemmatizer.
    # Tokens whose tag starts with any other letter skip lemmatization.
    pos_by_tag_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    normalized = []
    for token, tag in pos_tag(word_tokenize(text)):
        pos = pos_by_tag_prefix.get(tag[:1])
        lemma = lemmatizer.lemmatize(token, pos=pos) if pos else token
        # Every token is stemmed, lemmatized or not.
        normalized.append(stemmer.stem(lemma))
    return ' '.join(normalized)

In [None]:
%%time
# Overwrite 'clean_comment' with its lemmatized and stemmed form.
df['clean_comment'] = df['clean_comment'].apply(NormalizeWithPOS)

In [None]:
# Alias only: `dataset` refers to the same DataFrame object as `df` (no copy).
dataset =df

In [None]:
# Group the reviews by rating: one shuffled DataFrame per distinct rating.
# The loop iterates over the rating values themselves. The original iterated
# over `10 - rating`, so with ratings 1..10 the keys were 0..9: rating 10 was
# never bucketed and key 0 always produced an empty bucket.
rating_num_set = {}
for rating in sorted(dataset['rating'].unique()):
    # Same inclusive +/- 0.5 window around the rating as before.
    in_bucket = dataset['rating'].between(rating - 0.5, rating + 0.5)
    bucket = dataset.loc[in_bucket]
    # sample(frac=1) shuffles the rows; the index is then renumbered from 0.
    rating_num_set[rating] = bucket.sample(frac=1).reset_index(drop=True)

for rating, bucket in rating_num_set.items():
    print("rating: ", rating, "rating num:", len(bucket))

In [None]:
# Bar chart of the number of reviews in each rating bucket. Bars are drawn at
# the bucket's key so the x axis shows the rating rather than the position of
# the bucket in the dict.
rating_list = [len(bucket) for bucket in rating_num_set.values()]
plt.bar(list(rating_num_set), rating_list)
plt.xlabel('rating')
plt.ylabel('number of reviews')
plt.show()

In [None]:
# Show the first row's raw comment followed by its cleaned form.
first_row = dataset.iloc[0]

print("A review example of dataset before cleaning:")
print(first_row['comment'], end='\n\n')

print("clean_text:")
print(first_row['clean_comment'], end="\n\n")

# Train and Test Split

In [None]:
# Hold out 25% of the rows as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(dataset['clean_comment'], dataset['rating'], test_size=0.25, random_state=42)

In [None]:
# Give each split a fresh integer index running from 1 to its length.
for split_part in (X_train, X_test, y_train, y_test):
    split_part.index = list(range(1, len(split_part) + 1))

# Create Bag of Words

In [None]:
# Bag-of-words vectorizer configured with the built-in 'english' stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [None]:
%%time

# Fit the vocabulary on the training comments only, then encode the test
# comments with that same fitted vectorizer.
training_features = vectorizer.fit_transform(X_train)
testing_features = vectorizer.transform(X_test)

In [None]:
# Shape of the training feature matrix.
training_features.shape

In [None]:
# Shape of the test feature matrix.
testing_features.shape

# Multinomial Naive Bayes

# Model creation

In [None]:
# Multinomial Naive Bayes classifier created with its default parameters.
model = MultinomialNB()

In [None]:
%%time

# Train the classifier on the bag-of-words training features.
model.fit(training_features, y_train)

In [None]:
# Predicted ratings for the test set from the untuned model.
predict_total = model.predict(testing_features)

In [None]:
# Display the predicted ratings.
predict_total

In [None]:
# Display the true test ratings for comparison with the predictions.
y_test

# Hyperparameter Tuning

# Choosing mean squared error as the evaluation metric.

In [None]:
# Print the parameters of the current model.
print(model.get_params())

In [None]:
# 100 evenly spaced candidate alpha values from 0 to 1, both ends included.
# NOTE(review): the grid contains alpha = 0 — confirm that is intended.
alpha = np.linspace(0,1,100)

In [None]:
# Search space for the randomized search: every candidate alpha combined
# with both settings of fit_prior.
random_grid = {'alpha': alpha,
              'fit_prior': [True,False]}
print(random_grid)

In [None]:
# Randomized search over `random_grid`: 100 sampled candidates, 3-fold
# cross-validation, all CPU cores. `scoring` is set explicitly so candidates
# are ranked by (negated) mean squared error, the metric this notebook says it
# uses; without it the search ranks by the estimator's default score.
model_random = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training data.
model_random.fit(training_features, y_train)

In [None]:
# Display the best estimator found by the search.
model_random.best_estimator_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print error metrics for ``model`` on a labelled set and return accuracy.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Feature matrix passed to ``model.predict``.
        test_labels: True labels, compared against the predictions.

    Returns:
        The accuracy as a percentage (accuracy score multiplied by 100).
    """
    predictions = model.predict(test_features)

    mean_abs_error = np.mean(abs(predictions - test_labels))
    mse = mean_squared_error(test_labels, predictions)
    accuracy = accuracy_score(test_labels, predictions) * 100

    print('Model Performance')
    # The labels are ratings, so the error is reported in rating points
    # (the message previously said "degrees").
    print('Average Error: {:0.4f} rating points.'.format(mean_abs_error))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('MSE = {:0.2f}.'.format(mse))

    return accuracy

In [None]:
# Reference model with alpha = 0.5, trained and evaluated on the test set.
base_model = MultinomialNB(alpha = 0.5)
base_model.fit(training_features, y_train)
base_accuracy = evaluate(base_model, testing_features, y_test)

In [None]:
# Evaluate the best estimator from the randomized search on the test set.
best_random = model_random.best_estimator_
random_accuracy = evaluate(best_random, testing_features, y_test)

In [None]:
# Sweep alpha over 100 values in [0, 1]: fit one MultinomialNB per value on the
# training data and record its MSE on the test data.
# NOTE(review): each candidate is scored on the test split, so an alpha picked
# from this sweep has been tuned on the test data.
meanSquaredError1 = []
hyper_cond1 = []
alpha = np.linspace(0,1,100)
for i in alpha:
    nb_model = MultinomialNB(alpha = i)
    nb_model.fit(training_features, y_train)
    predictions = nb_model.predict(testing_features)
    MSE = mean_squared_error(y_test,predictions)
    meanSquaredError1.append(MSE)
    # The alpha is stored as a string, not as a number.
    hyper_cond1.append(''+str(i))

In [None]:
# Plot the recorded MSE against the Naive Bayes alpha value.
# hyper_cond1 holds the alphas as strings; converting them back to floats gives
# a numeric x axis instead of 100 categorical tick labels.
alpha_values = [float(a) for a in hyper_cond1]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.plot(alpha_values, meanSquaredError1, color='b', label='model')
ax.set_ylabel('MSE')
# The swept parameter is alpha (the label previously said "Random Forest").
ax.set_xlabel('alpha')
ax.set_title('MSE for Multinomial Naive Bayes with different alphas')
plt.grid(True)
plt.legend(loc='upper right')
plt.show()

# From the plot above, we can conclude that an alpha value of 0 gives the lowest mean squared error.

# Model post Hyperparameter tuning

In [None]:
# Final classifier with a hard-coded alpha.
# NOTE(review): this alpha (about 0.98) differs from the conclusion written
# above this section (alpha = 0) — confirm which value is intended.
final_model_multinomial = MultinomialNB(alpha = 0.9797979797979799)

In [None]:
%%time
# Train the final model on the training features.
final_model_multinomial.fit(training_features, y_train)

In [None]:
# Predicted ratings for the test set from the final model (rebinds predict_total).
predict_total = final_model_multinomial.predict(testing_features)

In [None]:
# Mean squared error of the final model's predictions on the test set.
mean_squared_error(y_test,predict_total)