In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the review data set in a single pass. The first line of the file is the
# header row, so pandas can take the column names from it directly instead of
# opening the file a second time just to fetch them.
REVIEWS_CSV = '/kaggle/input/boardgamegeek-reviews/bgg-15m-reviews.csv'
dataframe = pd.read_csv(REVIEWS_CSV, header=0)

# The first column is always renamed to 'index', whatever the file calls it
# (presumably it is the row index saved with the CSV and has no usable header
# of its own — TODO confirm). All other header names are kept as read.
dataframe.columns = ['index', *dataframe.columns[1:]]

# Fail early, with a readable message, if the columns the rest of the
# notebook relies on are not present.
assert {'rating', 'comment'} <= set(dataframe.columns), \
    "expected 'rating' and 'comment' columns in the reviews file"

In [None]:
# Print the first rows and then the summary statistics of the table, each
# under a title framed by 30 asterisks on either side.
banner = '*' * 30
for section_title, build_view in (
    ('Table Data', dataframe.head),
    ('Table Data summary', dataframe.describe),
):
    print(banner, section_title, banner)
    print(build_view())

In [None]:
# Drop the rows with an empty rating and keep only the two columns used by the
# rest of the notebook.
# dropna() returns a new frame rather than modifying the original, so its
# result has to be assigned; calling it as a bare statement leaves the
# unrated rows in place.
dataframe = dataframe.dropna(subset=['rating'])[['rating', 'comment']]

In [None]:
%%time
import nltk
# This cell uses two NLTK resources: the 'stopwords' corpus (read just below)
# and WordNet (used by the lemmatizer created further down in this cell).
# If either one is not available in the current environment, run the matching
# line once:
# nltk.download('wordnet')
# nltk.download('stopwords')
# English stop words as provided by nltk's stopwords corpus; read by cleanAndTokenize.
stop_words = nltk.corpus.stopwords.words('english')
import string
import re

# Basic cleaning
# Translation table that deletes every character in string.punctuation.
# Built once here so each call strips punctuation in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleanAndTokenize(review, stopwords=None):
    """Strip punctuation from a review, split it into words and drop stop words.

    Parameters
    ----------
    review : str
        Text to clean. Stop-word matching is an exact, case-sensitive
        comparison, so callers that want case-insensitive filtering must
        lower-case the text first.
    stopwords : collection of str, optional
        Words to discard. When omitted, the module-level ``stop_words`` is
        used. Passing a ``set`` makes the membership test cheaper.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Empty strings are never
        returned, so an empty or punctuation-only review yields ``[]``.
    """
    if stopwords is None:
        # Default to the notebook-wide stop-word list.
        stopwords = stop_words

    # removing punctuations and surrounding whitespace
    non_punc_words = review.translate(_PUNCTUATION_TABLE).strip()

    # tokenizing reviews; the raw string hands \W to the regex engine as-is
    # instead of relying on an invalid string escape
    list_of_token = re.split(r'\W+', non_punc_words)

    # removing stop words, plus the empty strings re.split produces when the
    # text is empty or starts/ends with a non-word character
    return [word for word in list_of_token if word and word not in stopwords]

# Turn every comment into text, lower-case it and replace it with the token
# list produced by cleanAndTokenize.
dataframe['comment'] = dataframe['comment'].map(
    lambda raw_comment: cleanAndTokenize(str(raw_comment).lower())
)

# nltk's WordNet lemmatizer, shared by lemmatize_data
word_net_lemma = nltk.WordNetLemmatizer()

def lemmatize_data(token_list):
    """Return a new list with each token replaced by its lemma, order preserved.

    Lemmas come from the module-level ``word_net_lemma`` lemmatizer.
    """
    lemmas = []
    for token in token_list:
        lemmas.append(word_net_lemma.lemmatize(token))
    return lemmas

# Replace each comment's token list with its lemmatized version.
dataframe['comment'] = dataframe['comment'].map(lemmatize_data)

In [None]:
# The count vectorizer takes plain strings, so glue every token list back
# together with single spaces.
dataframe['comment'] = dataframe['comment'].map(' '.join)

In [None]:
# Drop rows whose processed comment is exactly the text 'nan' — the value a
# missing comment ends up as after the str() conversion in the cleaning cell.
dataframe = dataframe[dataframe['comment'].ne('nan')]

# dividing the data set into train / development / test (60% / 20% / 20%).
# The rows are shuffled once with a fixed seed and then sliced by position.
# Slicing with iloc yields the same partitions as np.split, without depending
# on DataFrame.swapaxes, which newer pandas versions deprecate.
shuffled = dataframe.sample(frac=1, random_state=5)
train_end = int(.6 * len(shuffled))
development_end = int(.8 * len(shuffled))

train = shuffled.iloc[:train_end]
development = shuffled.iloc[train_end:development_end]
test = shuffled.iloc[development_end:]

# Every row must land in exactly one split.
assert len(train) + len(development) + len(test) == len(dataframe)

print('size of train data set: ',train.shape[0])
print(train.head())
print('*'*100)
print('size of development data set: ',development.shape[0])
print(development.head())
print('*'*100)
print('size of test data set: ',test.shape[0])
print(test.head())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Smoothing factors to compare on the development split.
# NOTE(review): alpha=0 means no smoothing; scikit-learn may warn about or
# adjust it depending on the installed version — confirm it belongs in the grid.
alpha_arr = list(range(0,10))

# Learn the vocabulary from the training comments only and count-encode them.
vectorizer_obj = CountVectorizer()
count_vector_obj = vectorizer_obj.fit_transform(train['comment'])

# Class labels are the ratings truncated to whole numbers (e.g. 7.5 -> 7),
# as a 1-D array.
class_data = train['rating'].astype(int).to_numpy()

# The development split does not depend on alpha, so encode it a single time
# here and predict it in one batch per model. This replaces the per-row
# transform()/predict() calls that were repeated for every alpha.
development_counts = vectorizer_obj.transform(development['comment'])
actual_rating_arr = development['rating'].astype(int).to_numpy()

# Mean squared error on the development split, one entry per alpha.
plot_NB = []

for alpha_in in alpha_arr:
    MNB_obj = MultinomialNB(alpha=alpha_in)
    MNB_obj.fit(count_vector_obj, class_data)

    NB_predicted_rating_arr = MNB_obj.predict(development_counts)

    mean_squared_error_NB = np.mean(np.square(actual_rating_arr - NB_predicted_rating_arr))
    plot_NB.append(mean_squared_error_NB)

# Plot the development-set M.S.E against the smoothing factor
plt.figure(figsize=(15,6))
plt.plot(alpha_arr,plot_NB,'b-o',label = 'M.S.E of each run')
plt.legend(loc='upper right')
plt.xlabel('smooting factor')
plt.ylabel('M.S.E')
plt.xticks(alpha_arr)
plt.grid()
plt.show()

In [None]:
# Smoothing factor of the model evaluated on the test split.
final_alpha = 1

# Class labels are the training ratings truncated to whole numbers, as a 1-D array.
class_data = train['rating'].astype(int).to_numpy()

# Fit a fresh model with the alpha that is reported below. Reusing the MNB_obj
# left over from the tuning loop would evaluate the last alpha tried there,
# not the one named in the printed message.
MNB_obj = MultinomialNB(alpha=final_alpha)
MNB_obj.fit(count_vector_obj, class_data)

# Encode and predict the whole test split in one batch instead of row by row.
test_counts = vectorizer_obj.transform(test['comment'])
actual_rating_arr = test['rating'].astype(int).to_numpy()
NB_predicted_rating_arr = MNB_obj.predict(test_counts)

# Keep predictions and true ratings side by side for inspection.
MSE = pd.DataFrame({
    'NB_predicted_rating_arr': NB_predicted_rating_arr,
    'actual_rating_arr': actual_rating_arr,
})
mean_squared_error_NB = np.square(MSE['actual_rating_arr'] - MSE['NB_predicted_rating_arr']).mean()
print('Naive Bayes MSE for alpha=%s : ' % final_alpha,mean_squared_error_NB)