# Downloading datasets 

In [69]:
#Uncomment the below if using google drive to store files
from google.colab import drive
drive.mount('/content/drive')
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games.json.gz
!gunzip reviews_Video_Games.json.gz
!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Video_Games.json.gz
!gunzip meta_Video_Games.json.gz


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--2022-03-30 18:35:02--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Video_Games.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 386419180 (369M) [application/x-gzip]
Saving to: ‘reviews_Video_Games.json.gz.4’


2022-03-30 18:36:05 (5.87 MB/s) - ‘reviews_Video_Games.json.gz.4’ saved [386419180/386419180]

gzip: reviews_Video_Games.json already exists; do you wish to overwrite (y or n)? n
	not overwritten
--2022-03-30 18:39:51--  http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Video_Games.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, await

# Installing dependencies

In [70]:
pip install pyspark



In [71]:
pip install nltk



In [72]:
pip install rake_nltk



# Importing libraries and initializing spark

In [73]:
import os
import sys
import re #regex
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.functions import desc
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Content Based Recommendation") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

    

# Parsing data and removing unwanted columns


In [74]:
video_games = 'meta_Video_Games.json'

spark = init_spark()
df_raw = spark.read.json(video_games)
print("Before:\n")
print(df_raw.columns)

df1=df_raw.select("asin","title", "brand","category","description","feature")

df1.createOrReplaceTempView("items")
df2 = spark.sql("select asin,title, brand,category,description,feature from items LIMIT 15000")
print('\nAfter:\n')
print(df2.columns)

Before:

['also_buy', 'also_view', 'asin', 'brand', 'category', 'date', 'description', 'details', 'feature', 'fit', 'imageURL', 'imageURLHighRes', 'main_cat', 'price', 'rank', 'similar_item', 'tech1', 'tech2', 'title']

After:

['asin', 'title', 'brand', 'category', 'description', 'feature']


In [75]:
df=df2.toPandas()


# Text cleanup for display - replacing empty entries by None

- fixing HTML content and removing < tags >

In [76]:
def remove_tags(foo):
  pattern = '<\/?[a-zA-Z0-9.]*>'
  if type(foo) == list:
    temp = []
    for txt in foo:   
      txt = re.sub('&lt;', '<', txt)
      txt = re.sub('&gt;', '>', txt)
      txt = re.sub('&amp;', '&', txt)
      txt = re.sub('\b', '', txt)
      txt = re.sub('\n', '', txt)
      txt = re.sub(pattern, '', txt)
      if len(txt) > 1:
          temp.append(txt)
    return temp
  elif type(foo) == str:
      foo = re.sub('&lt;', '<', foo)
      foo = re.sub('&gt;', '>', foo)
      foo = re.sub('&amp;', '&', foo)
      foo = re.sub('\b', '', foo)
      foo = re.sub('\n', '', foo)
      foo = re.sub(pattern, '', foo)
      if len(foo) > 1:
        return(foo) 
  


df['category'] = df['category'].apply(remove_tags)
df['brand'] = df['brand'].apply(remove_tags)

df.head()

import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

'''
pd.set_option('display.max_columns', 100)
#df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
data = '/content/drive/MyDrive/BIG_DATA_WINTER/meta_Video_Games.json'
df1 = pd.read_json(data ,lines=True,nrows=5000)
df=df1
'''
df = df[['asin','title','category','brand','feature']]
df['feature'] = df['feature'].str[0]
df = df.astype(object).replace(np.nan, 'None')
df.head()


Unnamed: 0,asin,title,category,brand,feature
0,42000742,Reversi Sensory Challenger,"[Video Games, PC, Games]",Fidelity Electronics,
1,78764343,Medal of Honor: Warfighter - Includes Battlefi...,"[Video Games, Xbox 360, Games]",by EA Games,
2,276425316,street fighter 2 II turbo super nintendo snes ...,"[Video Games, Retro Gaming & Microconsoles, Su...",Nintendo,
3,324411812,Xbox 360 MAS STICK,"[Video Games, Xbox 360, Accessories, Controlle...",by MAS SYSTEMS,Original PCB used from Xbox 360 Control Pad (t...
4,439335310,Phonics Alive! 3: The Speller,"[Video Games, PC, Games, Grades 2-12, Spelling...",by Advanced Software Pty. Ltd.,Grades 2-12


# - Importing NLTK 
# - Removing punctuation and stopwords from the feature tab
# - Creating a bag of words from item description

In [77]:
# initializing the new column
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

df['feature'].dropna(inplace=True)
tokens = df['feature'].apply(word_tokenize)


df['Key_words'] = ""

for index, row in df.iterrows():
    plot = row['feature']
    
    # instantiating Rake, by default is uses english stopwords from NLTK
    # and discard all puntuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
df.drop(columns = ['feature'], inplace = True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


- Creating a matrix with 'title' set as index

In [78]:
df.set_index('title', inplace = True)
df.head()

Unnamed: 0_level_0,asin,category,brand,Key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Reversi Sensory Challenger,42000742,"[Video Games, PC, Games]",Fidelity Electronics,[none]
Medal of Honor: Warfighter - Includes Battlefield 4 Beta - Limited Edition,78764343,"[Video Games, Xbox 360, Games]",by EA Games,[none]
street fighter 2 II turbo super nintendo snes super nes video game,276425316,"[Video Games, Retro Gaming & Microconsoles, Su...",Nintendo,[none]
Xbox 360 MAS STICK,324411812,"[Video Games, Xbox 360, Accessories, Controlle...",by MAS SYSTEMS,"[original, pcb, used, xbox, 360, control, pad,..."
Phonics Alive! 3: The Speller,439335310,"[Video Games, PC, Games, Grades 2-12, Spelling...",by Advanced Software Pty. Ltd.,"[grades, 2, 12]"


- Creating bags of words from item descriptors

In [79]:
df['bag_of_words'] = ''
columns = df.columns
for index, row in df.iterrows():
    words = ''
    for col in columns:
        if col != 'Director':
            words = words + ' '.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    row['bag_of_words'] = words


- setting up an index lookup table

In [80]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# creating a Series for the video game  titles so they are associated to an ordered numerical
# list to be used later to match the indexes
indices = pd.Series(df.index)
indices[:5]

0                           Reversi Sensory Challenger
1    Medal of Honor: Warfighter - Includes Battlefi...
2    street fighter 2 II turbo super nintendo snes ...
3                                   Xbox 360 MAS STICK
4                        Phonics Alive! 3: The Speller
Name: title, dtype: object

- Generating the cosine similarity matrix

In [81]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.80178373, 0.68376346, ..., 0.41239305, 0.71713717,
        0.45834925],
       [0.80178373, 1.        , 0.63960215, ..., 0.38575837, 0.67082039,
        0.42874646],
       [0.68376346, 0.63960215, 1.        , ..., 0.52636136, 0.85811633,
        0.58501794],
       ...,
       [0.41239305, 0.38575837, 0.52636136, ..., 1.        , 0.55205245,
        0.58218174],
       [0.71713717, 0.67082039, 0.85811633, ..., 0.55205245, 1.        ,
        0.61357199],
       [0.45834925, 0.42874646, 0.58501794, ..., 0.58218174, 0.61357199,
        1.        ]])

- defining the recommendations function to find items with similar features

In [82]:

def recommendations(title, cosine_sim = cosine_sim):
    recommended_items = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indexes = list(score_series.iloc[1:11].index)
    for i in top_10_indexes:
        recommended_items.append(list(df.index)[i])
    return recommended_items

# Testing

- testing item recommendation system:

In [83]:
#Just Test with one hard coded value
print("Generating more items like *Mickey's Ultimate Challenge - Nintendo Super NES*: \n")
recommendations('Mickey\'s Ultimate Challenge - Nintendo Super NES')

Generating more items like *Mickey's Ultimate Challenge - Nintendo Super NES*: 



['King of the Monsters - Nintendo Super NES',
 'Mohawk / Headphone Jack',
 'Super Mario Kart',
 'Lock On - Nintendo Super NES',
 "Mickey's Ultimate Challenge - Nintendo Super NES",
 'Tecmo Super Bowl III: Final Edition',
 'Radical Rex - Nintendo Super NES',
 'Wing Commander - Nintendo Super NES',
 'SimEarth: The Living Planet - Nintendo Super NES',
 'P.T.O. - Nintendo Super NES']

- count tracer

In [84]:
'''user_data_for_testing = 'meta_Video_Games.json'
spark =init_spark()
df_user = spark.read.json(user_data_for_testing)
df_user.createOrReplaceTempView("users_for_testing")
oin_tab=spark.sql("SELECT a.asin,a.title,b.reviewerID,b.reviewerName from items_for_testing a, users_for_testing b where a.asin=b.asin ")
oin_tab.count()
df_for_testing = spark.sql("select asin,title, brand,category,description,feature from items")
df_for_testing.createOrReplaceTempView("items_for_testing")
oin_tab.count()'''

'user_data_for_testing = \'meta_Video_Games.json\'\nspark =init_spark()\ndf_user = spark.read.json(user_data_for_testing)\ndf_user.createOrReplaceTempView("users_for_testing")\noin_tab=spark.sql("SELECT a.asin,a.title,b.reviewerID,b.reviewerName from items_for_testing a, users_for_testing b where a.asin=b.asin ")\noin_tab.count()\ndf_for_testing = spark.sql("select asin,title, brand,category,description,feature from items")\ndf_for_testing.createOrReplaceTempView("items_for_testing")\noin_tab.count()'

- def getrandom_user(): fetching a random user

In [85]:
from pyspark.sql.functions import rand 

def getrandom_user():

  spark=init_spark()
  df_for_testing = spark.sql("select asin,title, brand,category,description,feature from items LIMIT 15000")
  df_for_testing.createOrReplaceTempView("items_for_testing")
  user_data_for_testing = 'reviews_Video_Games.json'
  spark =init_spark()
  df_user = spark.read.json(user_data_for_testing)
  df_user.createOrReplaceTempView("users_for_testing")
  oin_tab=spark.sql("SELECT a.asin,a.title,b.reviewerID,b.reviewerName from items_for_testing a, users_for_testing b where a.asin=b.asin ")

  test_row=oin_tab.orderBy(rand()).limit(1)

  user = test_row.select(('reviewerName')).distinct().collect()[0][0]
  item_title = test_row.select(('title')).distinct().collect()[0][0]

  return user,item_title

# Test case with a random user 

- picks a random user then adds a suggestion based on a item they bought and rated highly

In [86]:
#item_title
random_user,random_title=getrandom_user()

- test case display

In [87]:
print("======================= output ===================")
print("User : " ,random_user)
print("Likes This Item Already  : " ,random_title)
print("May Also Like the below : ")
print("---")
recommendations(random_title)

User :  Michael Baker "randomthoughtvoid"
Likes This Item Already  :  Summoner
May Also Like the below : 
---


['Project Overkill - PlayStation',
 'Arctic Thunder',
 'Killing Zone',
 'SSX Tricky',
 'Project Overkill - PlayStation',
 'WCW vs. The World - Playstation',
 'Jeopardy',
 'Disney Golf',
 'SILPHEED:The Lost Planet',
 'The King of Route 66']

In [88]:
def test_execute():
  user,item=getrandom_user()
  print("======================= output ===================")
  print("User : " ,user)
  print("Likes This Item Already  : " ,item)
  print("May Also Like the below : ")
  print("---")
  print(recommendations(item))
  

In [89]:
test_execute()

User :  Jonathan
Likes This Item Already  :  Super Mario Land
May Also Like the below : 
---
['Super Mario Land', 'Kid Dracula', "Disney's Aladdin", "Pooh &amp; Tigger's Hunny Safari", 'Harvest Moon', 'VIP starring Pamela Anderson as Vallery Irons', 'WCW Mayhem', 'R-Type DX', "Dragon's Lair", 'Nascar Challenge']
