# Preprocessing step

## Import of tools

In [None]:
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

!pip install langdetect
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException



## Import cleaned datasets

In [None]:
# Load the cleaned review and rating tables; the first CSV column becomes the index.
game_reviews = pd.read_csv(
    '/content/drive/MyDrive/Springboard DS/data/cleaned_game_reviews.csv',
    index_col=0,
)
game_ratings = pd.read_csv(
    '/content/drive/MyDrive/Springboard DS/data/cleaned_game_ratings.csv',
    index_col=0,
)

In [None]:
# Summary statistics for the reviews table as loaded, before any sampling.
# NOTE(review): the recorded output shows a minimum rating of ~1.4e-45 (effectively zero)
# -- confirm that such ratings are expected in the cleaned data.
game_reviews.describe()

Unnamed: 0,rating,ID
count,2948623.0,2948623.0
mean,6.886088,89184.77
std,1.796896,90728.63
min,1.4013e-45,1.0
25%,6.0,6607.0
50%,7.0,42776.0
75%,8.0,163412.0
max,10.0,350992.0


## Clean comment text: strip HTML tags and punctuation, keep emoticons (digits are removed later by the vectorizer token pattern)

In [None]:
def emoticon_removal(text):
    """Strip markup and punctuation from a comment, keeping emoticons at the end.

    Despite its name, this function does not discard emoticons. It:

    1. removes HTML-like tags (anything of the form ``<...>``);
    2. collects emoticons made of eyes (``:``, ``;`` or ``=``), an optional
       nose (``-``) and a mouth (``)``, ``(``, ``D`` or ``P``);
    3. collapses every run of non-word characters into a single space;
    4. appends the collected emoticons (space separated, noses removed) to
       the cleaned text.

    The emoticons are concatenated directly onto the cleaned text, so a
    separating space is only present when the cleaned text already ends
    with one (e.g. ``'So fun :D'`` becomes ``'So fun D:D'``).

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (for example the NaN pandas
        uses for a missing comment) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; ``''`` for non-string input.
    """
    # Missing comments arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Raw strings keep the regex escapes (\), \(, \W) from being read as
    # invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub(r'[\W]+', ' ', text) +
            ' '.join(emoticons).replace('-', ''))
    return text

# Run every review comment through emoticon_removal and store the result back
# in the same column.
cleaned_comments = game_reviews['comment'].apply(emoticon_removal)
game_reviews['comment'] = cleaned_comments

## Sample 20% of the data to make the dataframe small enough to work with

In [None]:
# Draw a reproducible 20% random sample of the reviews, without replacement.
reviews_subset_df = game_reviews.sample(
    frac=0.2,
    replace=False,
    random_state=42,
)

## Identify language and remove non-English reviews


In [None]:
# Summary statistics of the 20% sample, before the language filter runs.
reviews_subset_df.describe()

Unnamed: 0,rating,ID
count,589725.0,589725.0
mean,6.884386,89281.868015
std,1.796214,90794.354853
min,0.001,1.0
25%,6.0,6644.0
50%,7.0,42939.0
75%,8.0,163412.0
max,10.0,350992.0


In [None]:
def detect_language(comment):
    """Return the langdetect language code for ``comment``.

    Returns ``None`` when the comment is not a string or when langdetect
    cannot find any usable features in it (for example a comment made up
    only of digits), which it signals with ``LangDetectException``.
    """
    if not isinstance(comment, str):
        return None
    try:
        return detect(comment)
    except LangDetectException:
        return None


# langdetect is non-deterministic unless seeded; fix the seed so the set of
# rows kept is the same on every run.
DetectorFactory.seed = 0

# Detect once per row, then filter with a single boolean mask instead of
# dropping rows one at a time while iterating over the frame.
detected_languages = reviews_subset_df['comment'].apply(detect_language)

# Report a count rather than printing every undetectable row.
undetectable_count = int(detected_languages.isna().sum())
print("Rows dropped because no language could be detected:", undetectable_count)

# Keep English reviews only; undetectable rows are dropped as well.
reviews_subset_df = reviews_subset_df[detected_languages == 'en']

# Outputting a csv at this stage so that I do not have to run the lengthy language process each time I run the worksheet
reviews_subset_df.to_csv('/content/drive/MyDrive/Springboard DS/data/english_only_game_reviews.csv')
reviews_subset_df.describe()

This row throws an error: rating                        7.3
comment    2016 5 8 6 7 8 7 7 7 8
ID                         176544
Name: 13285588, dtype: object
This row throws an error: rating                      5.0
comment    4 5 2 5 3 2 3 5 15 5
ID                       117914
Name: 15288017, dtype: object
This row throws an error: rating                         8.0
comment    3 5 3 5 3 5 4 3 5 18 25
ID                          258444
Name: 13660248, dtype: object
This row throws an error: rating                       7.0
comment    07 15 14 9 12 31 15 7
ID                         18905
Name: 17716328, dtype: object
This row throws an error: rating                         8.0
comment    4 3 5 4 3 5 4 5 19 5 25
ID                          136888
Name: 8078594, dtype: object
This row throws an error: rating                         7.0
comment    4 5 4 5 3 5 3 4 19 5 25
ID                          143986
Name: 10374456, dtype: object
This row throws an error: rating                     

Unnamed: 0,rating,ID
count,549455.0,549455.0
mean,6.868461,87768.55592
std,1.805443,90319.116182
min,0.001,1.0
25%,6.0,6249.0
50%,7.0,41002.0
75%,8.0,161936.0
max,10.0,350992.0


In [None]:
# Reload the English-only subset from the CSV written by the language-filter
# cell, so that the slow detection step does not have to be re-run.
english_reviews_path = '/content/drive/MyDrive/Springboard DS/data/english_only_game_reviews.csv'
reviews_subset_df = pd.read_csv(english_reviews_path, index_col=0)
reviews_subset_df.describe()

Unnamed: 0,rating,ID
count,549455.0,549455.0
mean,6.868461,87768.55592
std,1.805443,90319.116182
min,0.001,1.0
25%,6.0,6249.0
50%,7.0,41002.0
75%,8.0,161936.0
max,10.0,350992.0


## Identify corpus-specific stopwords (from EDA)

In [None]:
# Corpus-specific words to exclude in addition to the generic English stop words.
bgg_stop_words = [
    'game',
    'play',
    'like',
    'fun',
    'one',
    'good',
    'really',
    'great',
    'much',
    'also',
    'interesting',
    'many',
]

## Create stopwords set to use in CountVectorizer

In [None]:
# Combine scikit-learn's built-in English stop words with the corpus-specific ones.
# CountVectorizer / TfidfVectorizer document ``stop_words`` as 'english', a list or
# None, and newer scikit-learn releases validate that type, so pass a list rather
# than the frozenset that ``union`` returns. Sorting makes the order deterministic.
full_set_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(bgg_stop_words))

# Prepare test and training sets

Even with 20% of the original data, the CSVs output by the vectorizers are too large to import into the modeling workbook, and the datasets are too large to run pairwise distance calculations against. Trial and error showed that a 5% sample of the remaining (English-only) reviews still produces CSVs that can be imported into the modeling workbook.

In [None]:
# Take a reproducible 5% random sample (without replacement) of the English-only
# reviews, then show its summary statistics.
reviews_sub_subset_df = reviews_subset_df.sample(
    frac=0.05,
    replace=False,
    random_state=42,
)
reviews_sub_subset_df.describe()

Unnamed: 0,rating,ID
count,27473.0,27473.0
mean,6.87124,88392.077312
std,1.803125,90614.840782
min,1.0,1.0
25%,6.0,6117.0
50%,7.0,41474.0
75%,8.0,163068.0
max,10.0,349805.0


In [None]:
# Target: the rating column. Features: every other column.
y = reviews_sub_subset_df['rating']
X = reviews_sub_subset_df.drop(columns=['rating'])

# Both splits below use the same arguments (and therefore the same seed), so
# they partition the rows identically.
split_options = dict(test_size=0.33, random_state=42)

# For use with the count vectorizer
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X, y, **split_options)

# For use with the TF-IDF vectorizer
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X, y, **split_options)

### Output the training and test ratings (targets) for use in the modeling step

In [None]:
# Write the rating targets of each split to its own CSV file.
rating_exports = [
    (y_train_cv, '/content/drive/MyDrive/Springboard DS/data/y_train_cv.csv'),
    (y_test_cv, '/content/drive/MyDrive/Springboard DS/data/y_test_cv.csv'),
    (y_train_tf, '/content/drive/MyDrive/Springboard DS/data/y_train_tf.csv'),
    (y_test_tf, '/content/drive/MyDrive/Springboard DS/data/y_test_tf.csv'),
]
for ratings, csv_path in rating_exports:
    ratings.to_csv(csv_path)

## Create features

### Count vectorization, with stop words passed in


In [None]:
# Bigram count features. The token pattern only matches runs of word characters
# that are not digits, which is how digits are removed from the text.
bigram_cv_options = dict(
    ngram_range=(2, 2),
    decode_error='ignore',
    min_df=20,
    max_df=0.95,
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=full_set_stop_words,
)
bigram_count_vectorizer = CountVectorizer(**bigram_cv_options)

# Learn the vocabulary on the training comments, then reuse it for the test comments.
bigram_cv = bigram_count_vectorizer.fit_transform(X_train_cv['comment'])
bigram_cv_test = bigram_count_vectorizer.transform(X_test_cv['comment'])

In [None]:
# Trigram count features, using the same digit-free token pattern and stop words.
trigram_cv_options = dict(
    ngram_range=(3, 3),
    decode_error='ignore',
    min_df=20,
    max_df=0.95,
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=full_set_stop_words,
)
trigram_count_vectorizer = CountVectorizer(**trigram_cv_options)

# Learn the vocabulary on the training comments, then reuse it for the test comments.
trigram_cv = trigram_count_vectorizer.fit_transform(X_train_cv['comment'])
trigram_cv_test = trigram_count_vectorizer.transform(X_test_cv['comment'])

In [None]:
# Column labels are the bigrams learned by the count vectorizer.
bigram_columns = bigram_count_vectorizer.get_feature_names_out()

# Convert the sparse count matrices to dense DataFrames, one column per bigram.
bigram_word_counts_cv = pd.DataFrame(data=bigram_cv.toarray(), columns=bigram_columns)
bigram_word_counts_cv_test = pd.DataFrame(data=bigram_cv_test.toarray(), columns=bigram_columns)

In [None]:
# Column labels are the trigrams learned by the count vectorizer.
trigram_columns = trigram_count_vectorizer.get_feature_names_out()

# Convert the sparse count matrices to dense DataFrames, one column per trigram.
trigram_word_counts_cv = pd.DataFrame(data=trigram_cv.toarray(), columns=trigram_columns)
trigram_word_counts_cv_test = pd.DataFrame(data=trigram_cv_test.toarray(), columns=trigram_columns)

### Export the CountVectorizer output for use in the modeling step

In [None]:
# Write the training-set count features to CSV, bigrams first and then trigrams.
count_train_exports = [
    (bigram_word_counts_cv, '/content/drive/MyDrive/Springboard DS/data/bigram_cv.csv'),
    (trigram_word_counts_cv, '/content/drive/MyDrive/Springboard DS/data/trigram_cv.csv'),
]
for count_frame, csv_path in count_train_exports:
    count_frame.to_csv(csv_path)

In [None]:
# Write the test-set count features to CSV, bigrams first and then trigrams.
count_test_exports = [
    (bigram_word_counts_cv_test, '/content/drive/MyDrive/Springboard DS/data/bigram_cv_test.csv'),
    (trigram_word_counts_cv_test, '/content/drive/MyDrive/Springboard DS/data/trigram_cv_test.csv'),
]
for count_frame, csv_path in count_test_exports:
    count_frame.to_csv(csv_path)

### TF-IDF vectorization, with stop words passed in

In [None]:
# Bigram TF-IDF features. The token pattern only matches runs of word characters
# that are not digits, so digits never become tokens.
bigram_tf_options = dict(
    ngram_range=(2, 2),
    decode_error='ignore',
    min_df=20,
    max_df=0.95,
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=full_set_stop_words,
)
bigram_tfidf_vectorizer = TfidfVectorizer(**bigram_tf_options)

# Fit on the training comments, then apply the fitted vectorizer to the test comments.
bigram_tf = bigram_tfidf_vectorizer.fit_transform(X_train_tf['comment'])
bigram_tf_test = bigram_tfidf_vectorizer.transform(X_test_tf['comment'])

In [None]:
# Trigram TF-IDF features, using the same digit-free token pattern and stop words.
trigram_tf_options = dict(
    ngram_range=(3, 3),
    decode_error='ignore',
    min_df=20,
    max_df=0.95,
    token_pattern=r'\b[^\d\W]+\b',
    stop_words=full_set_stop_words,
)
trigram_tfidf_vectorizer = TfidfVectorizer(**trigram_tf_options)

# Fit on the training comments, then apply the fitted vectorizer to the test comments.
trigram_tf = trigram_tfidf_vectorizer.fit_transform(X_train_tf['comment'])
trigram_tf_test = trigram_tfidf_vectorizer.transform(X_test_tf['comment'])

In [None]:
# Column labels are the bigrams learned by the TF-IDF vectorizer.
bigram_tf_columns = bigram_tfidf_vectorizer.get_feature_names_out()

# Convert the sparse TF-IDF matrices to dense DataFrames, one column per bigram.
bigram_word_counts_tf = pd.DataFrame(data=bigram_tf.toarray(), columns=bigram_tf_columns)
bigram_word_counts_tf_test = pd.DataFrame(data=bigram_tf_test.toarray(), columns=bigram_tf_columns)

In [None]:
# Column labels are the trigrams learned by the TF-IDF vectorizer.
trigram_tf_columns = trigram_tfidf_vectorizer.get_feature_names_out()

# Convert the sparse TF-IDF matrices to dense DataFrames, one column per trigram.
trigram_word_counts_tf = pd.DataFrame(data=trigram_tf.toarray(), columns=trigram_tf_columns)
trigram_word_counts_tf_test = pd.DataFrame(data=trigram_tf_test.toarray(), columns=trigram_tf_columns)

### Export the TF-IDF output for use in the modeling step

In [None]:
# Write the training-set TF-IDF features to CSV, bigrams first and then trigrams.
tfidf_train_exports = [
    (bigram_word_counts_tf, '/content/drive/MyDrive/Springboard DS/data/bigram_tf.csv'),
    (trigram_word_counts_tf, '/content/drive/MyDrive/Springboard DS/data/trigram_tf.csv'),
]
for tfidf_frame, csv_path in tfidf_train_exports:
    tfidf_frame.to_csv(csv_path)

In [None]:
# Write the test-set TF-IDF features to CSV, bigrams first and then trigrams.
tfidf_test_exports = [
    (bigram_word_counts_tf_test, '/content/drive/MyDrive/Springboard DS/data/bigram_tf_test.csv'),
    (trigram_word_counts_tf_test, '/content/drive/MyDrive/Springboard DS/data/trigram_tf_test.csv'),
]
for tfidf_frame, csv_path in tfidf_test_exports:
    tfidf_frame.to_csv(csv_path)