# Preprocessing step

## Import of tools

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy import sparse

!pip install langdetect
from langdetect import detect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l[K     |▍                               | 10 kB 27.0 MB/s eta 0:00:01[K     |▊                               | 20 kB 31.0 MB/s eta 0:00:01[K     |█                               | 30 kB 19.8 MB/s eta 0:00:01[K     |█▍                              | 40 kB 13.5 MB/s eta 0:00:01[K     |█▊                              | 51 kB 11.3 MB/s eta 0:00:01[K     |██                              | 61 kB 13.1 MB/s eta 0:00:01[K     |██▍                             | 71 kB 13.3 MB/s eta 0:00:01[K     |██▊                             | 81 kB 14.4 MB/s eta 0:00:01[K     |███                             | 92 kB 13.1 MB/s eta 0:00:01[K     |███▍                            | 102 kB 14.2 MB/s eta 0:00:01[K     |███▊                            | 112 kB 14.2 MB/s eta 0:00:01[K     |████                            | 122 kB 14.2 MB/s eta 0:00:01[K     |████▍                           | 133 kB 14.2 MB/s eta 0:00:

## Import cleaned datasets

In [2]:
game_reviews = pd.read_csv('/content/drive/MyDrive/Springboard DS/data/cleaned_game_reviews.csv', index_col=0)
game_ratings = pd.read_csv('/content/drive/MyDrive/Springboard DS/data/cleaned_game_ratings.csv', index_col=0)

In [3]:
game_reviews.describe()

Unnamed: 0,rating,ID
count,2948623.0,2948623.0
mean,6.886088,89184.77
std,1.796896,90728.63
min,1.4013e-45,1.0
25%,6.0,6607.0
50%,7.0,42776.0
75%,8.0,163412.0
max,10.0,350992.0


## Sample 20% of the data to make the dataframe small enough to work with

In [4]:
reviews_subset_df = game_reviews.sample(frac=0.2, replace=False, random_state=42)

## Remove emoticons and digits

In [6]:
def emoticon_removal(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text) +
            ' '.join(emoticons).replace('-', ''))
    return text

reviews_subset_df['comment'] = reviews_subset_df['comment'].apply(emoticon_removal)

## Identify language and remove non-English reviews


In [7]:
reviews_subset_df.describe()

Unnamed: 0,rating,ID
count,589725.0,589725.0
mean,6.884386,89281.868015
std,1.796214,90794.354853
min,0.001,1.0
25%,6.0,6644.0
50%,7.0,42939.0
75%,8.0,163412.0
max,10.0,350992.0


In [None]:
for idx, row in reviews_subset_df.iterrows():
  try:
    lang = detect(row['comment'])
    if lang != 'en':
      reviews_subset_df.drop(idx, axis=0, inplace=True)
  except:
    language = 'error'
    reviews_subset_df.drop(idx, axis=0, inplace=True)

# Outputting a csv at this stage so that I do not have to run the lengthy 
# language process each time I run the worksheet
reviews_subset_df.to_csv(
    '/content/drive/MyDrive/Springboard DS/data/english_only_game_reviews.csv'
    )
reviews_subset_df.describe()

This row throws an error: rating                        7.3
comment    2016 5 8 6 7 8 7 7 7 8
ID                         176544
Name: 13285588, dtype: object
This row throws an error: rating                      5.0
comment    4 5 2 5 3 2 3 5 15 5
ID                       117914
Name: 15288017, dtype: object
This row throws an error: rating                         8.0
comment    3 5 3 5 3 5 4 3 5 18 25
ID                          258444
Name: 13660248, dtype: object
This row throws an error: rating                       7.0
comment    07 15 14 9 12 31 15 7
ID                         18905
Name: 17716328, dtype: object
This row throws an error: rating                         8.0
comment    4 3 5 4 3 5 4 5 19 5 25
ID                          136888
Name: 8078594, dtype: object
This row throws an error: rating                         7.0
comment    4 5 4 5 3 5 3 4 19 5 25
ID                          143986
Name: 10374456, dtype: object
This row throws an error: rating                     

Unnamed: 0,rating,ID
count,549455.0,549455.0
mean,6.868461,87768.55592
std,1.805443,90319.116182
min,0.001,1.0
25%,6.0,6249.0
50%,7.0,41002.0
75%,8.0,161936.0
max,10.0,350992.0


In [2]:
reviews_subset_df = pd.read_csv(
    '/content/drive/MyDrive/Springboard DS/data/english_only_game_reviews.csv',
    index_col=0)
reviews_subset_df = reviews_subset_df.sample(frac=0.25, replace=False, random_state=42)
reviews_subset_df.describe()

Unnamed: 0,rating,ID
count,137364.0,137364.0
mean,6.875884,88127.569742
std,1.801382,90262.010119
min,0.001,1.0
25%,6.0,6424.0
50%,7.0,41114.0
75%,8.0,161970.0
max,10.0,350736.0


## Identify corpus-specific stopwords (from EDA)

In [7]:
bgg_stop_words = ['game', 'play', 'like', 'fun', 'one', 'good', 'really',
                  'much', 'also', 'star', 'nostar', 'halfstar', 'boardgamegeek'
                  'know', 'want', 'games', 'played', 'just', 'need']

## Create stopwords set to use in CountVectorizer

In [8]:
full_set_stop_words = text.ENGLISH_STOP_WORDS.union(bgg_stop_words)

# Prepare test and training sets

In [9]:
X = reviews_subset_df.drop(['rating'], axis=1)
y = reviews_subset_df['rating']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
    )

## Create features

### Count vectorization, with stop words passed in


In [10]:
bigram_count = CountVectorizer(ngram_range=(2, 2),
                               decode_error='ignore',
                               binary=True, 
                               min_df=20, max_df=0.90,
                               token_pattern=r'\b[^\d\W]+\b',
                               stop_words=full_set_stop_words)

# The token pattern includes the removal of digits

bigram_count_mtx = bigram_count.fit_transform(X_train['comment'])
bigram_count_mtx_test = bigram_count.transform(X_test['comment'])

In [11]:
bigram_count_mtx[:10, :10]

<10x10 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [12]:
trigram_count = CountVectorizer(ngram_range=(3, 3),
                                decode_error='ignore',
                                min_df=20, max_df=0.90,
                                binary=True,
                                token_pattern=r'\b[^\d\W]+\b',
                                stop_words=full_set_stop_words)

trigram_count_mtx = trigram_count.fit_transform(X_train['comment'])
trigram_count_mtx_test = trigram_count.transform(X_test['comment'])

In [13]:
trigram_count_mtx[:10, :10]

<10x10 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [14]:
bigram_columns = bigram_count.get_feature_names_out()

bigram_word_counts = pd.DataFrame(bigram_count_mtx.toarray(), 
                                  columns=bigram_columns)

bigram_word_counts_test = pd.DataFrame(bigram_count_mtx_test.toarray(), 
                                       columns=bigram_columns)

bigram_word_counts.head(n=10)

Unnamed: 0,ability cards,able make,able use,able win,absolute blast,absolutely amazing,absolutely brilliant,absolutely fantastic,absolutely love,absolutely loved,...,yes s,young children,young kids,younger children,younger kids,younger players,youtube com,yr old,z man,zombie dice
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
trigram_columns = trigram_count.get_feature_names_out()

trigram_word_counts = pd.DataFrame(trigram_count_mtx.toarray(),
                                   columns=trigram_columns)

trigram_word_counts_test = pd.DataFrame(trigram_count_mtx_test.toarray(),
                                        columns=trigram_columns)

trigram_word_counts.head(n=10)

Unnamed: 0,action selection mechanism,ap prone players,aren t interesting,artwork graphic design,b pros b,b thing thing,b update b,bad dice rolls,bad didn t,bad don t,...,wouldn t recommend,wouldn t say,wouldn t turn,www boardgamegeek com,www vindjeu eu,www youtube com,year old daughter,year old loves,year old son,youtube com watch
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF vectorization, with stop words passed in

In [16]:
bigram_tfidf = TfidfVectorizer(ngram_range=(2, 2),
                               decode_error='ignore',
                               min_df=20, max_df=0.90,
                               token_pattern=r'\b[^\d\W]+\b',
                               stop_words=full_set_stop_words)

bigram_freq_mtx = bigram_tfidf.fit_transform(X_train['comment'])
bigram_freq_mtx_test = bigram_tfidf.transform(X_test['comment'])

In [17]:
bigram_freq_mtx[:10, :10]

<10x10 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [18]:
trigram_tfidf = TfidfVectorizer(ngram_range=(3, 3),
                                decode_error='ignore',
                                min_df=20, max_df=0.90,
                                token_pattern=r'\b[^\d\W]+\b',
                                stop_words=full_set_stop_words)

trigram_freq_mtx = trigram_tfidf.fit_transform(X_train['comment'])
trigram_freq_mtx_test = trigram_tfidf.transform(X_test['comment'])

In [19]:
trigram_freq_mtx[:10, :10]

<10x10 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [20]:
bigrams_columns = bigram_tfidf.get_feature_names_out()

bigram_word_counts = pd.DataFrame(bigram_freq_mtx.toarray(),
                                  columns=bigrams_columns)

bigram_word_counts_test = pd.DataFrame(bigram_freq_mtx_test.toarray(),
                                       columns=bigrams_columns)

bigram_word_counts.head(n=10)

Unnamed: 0,ability cards,able make,able use,able win,absolute blast,absolutely amazing,absolutely brilliant,absolutely fantastic,absolutely love,absolutely loved,...,yes s,young children,young kids,younger children,younger kids,younger players,youtube com,yr old,z man,zombie dice
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
trigrams_columns = trigram_tfidf.get_feature_names_out()

trigram_word_counts = pd.DataFrame(trigram_freq_mtx.toarray(),
                                   columns=trigrams_columns)

trigram_word_counts_test = pd.DataFrame(trigram_freq_mtx_test.toarray(),
                                        columns=trigrams_columns)

trigram_word_counts_test.head(n=10)

Unnamed: 0,action selection mechanism,ap prone players,aren t interesting,artwork graphic design,b pros b,b thing thing,b update b,bad dice rolls,bad didn t,bad don t,...,wouldn t recommend,wouldn t say,wouldn t turn,www boardgamegeek com,www vindjeu eu,www youtube com,year old daughter,year old loves,year old son,youtube com watch
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Export the vectorized output for use in the modeling step

Per SO, scipy has a way to save and load a sparse matrix: https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format

In [22]:
sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/bigram_count_mtx.npz", bigram_count_mtx)
sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/bigram_count_mtx_test.npz", bigram_count_mtx_test)

sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/trigram_count_mtx.npz", trigram_count_mtx)
sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/trigram_count_mtx_test.npz", trigram_count_mtx_test)

sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/bigram_freq_mtx.npz", bigram_freq_mtx)
sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/bigram_freq_mtx_test.npz", bigram_freq_mtx_test)

sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/trigram_freq_mtx.npz", trigram_freq_mtx)
sparse.save_npz("/content/drive/MyDrive/Springboard DS/data/trigram_freq_mtx_test.npz", trigram_freq_mtx_test)

### Export the columns for use in KMeans

In [23]:
np.save('/content/drive/MyDrive/Springboard DS/data/bigram_columns.npy', bigrams_columns)
np.save('/content/drive/MyDrive/Springboard DS/data/trigram_columns.npy', trigrams_columns)

Export the Y data

In [24]:
y_train.to_csv("/content/drive/MyDrive/Springboard DS/data/y_train.csv")
y_test.to_csv("/content/drive/MyDrive/Springboard DS/data/y_test.csv")