In [10]:
!pip install -q xlrd
!git clone https://github.com/skotak2/Review_Based_Recomm_System.git

Cloning into 'Review_Based_Recomm_System'...


In [None]:
# Colab-specific step: mount Google Drive at /content/drive so that the review
# data stored on the drive can be read from that path later in the notebook
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Installing scikit-learn and upgrading it
pip install --upgrade scikit-learn

In [1]:
#Importing the required set of packages
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import gzip
import os
import json
import gzip


In [5]:
# Read the gzip-compressed reviews file line by line, decode each line as one JSON
# record, and collect the records in a list ready to be loaded into a dataframe
with gzip.open('/content/drive/My Drive/reviews_amazon.gz') as reviews_file:
    data = [json.loads(raw_line.strip()) for raw_line in reviews_file]


In [6]:
# Build the dataframe and keep only the columns needed for modeling and analysis:
# 'asin' identifies the product and 'reviewText' holds the review for that product.
# The columns are selected by name rather than by position so the selection does
# not depend on the key order of the JSON records.
df_review = (
    pd.DataFrame.from_dict(data)
    .loc[:, ['asin', 'reviewText']]
    # Drop rows where the product id or the review text is missing
    .dropna()
)

In [None]:
# Collapse the reviews to one row per ASIN by joining all reviews of a product
# with commas; the K-Means model is built on these per-product texts
reviews_by_asin = df_review.groupby(['asin'])['reviewText']
df_review = reviews_by_asin.apply(','.join).reset_index()
df_review

Unnamed: 0,asin,reviewText
0,616719923X,Just another flavor of Kit Kat but the taste i...
1,9742356831,This curry paste makes a delicious curry. I j...
2,B00004S1C5,These dyes create awesome colors for kids craf...
3,B0000531B7,I really enjoy these bars as a quick breakfast...
4,B00005344V,"Traditional Medicinals' ""Breathe Easy"" is an a..."
...,...,...
8708,B00JGPG60I,We switched to this formula 5 days ago and for...
8709,B00JL6LTMW,We have enjoyed Larabar's variety of bars for ...
8710,B00K00H9I6,This 100% pure Canadian maple syrup is a Grade...
8711,B00KC0LGI8,I followed the directions on the box exactly b...


In [None]:
# Building a pipeline that extracts TF-IDF scores of words from the given text
# (lower-cased, English stop words removed, at most 1000 features) and clusters
# the resulting vectors into 100 groups with K-Means
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True,
                              max_features=1000,
                              # 'english' selects scikit-learn's built-in English stop word list.
                              # Newer scikit-learn releases validate this parameter and accept
                              # only 'english', a list or None, so a frozenset is not passed here.
                              stop_words='english')),
    # A fixed seed keeps the cluster IDs reproducible between runs
    ('model', KMeans(n_clusters=100, random_state=42)),
])

In [None]:
# Fit the pipeline on the per-product review texts, then store the cluster
# predicted for each product in a new 'Prediction' column of df_review
review_texts = df_review['reviewText']
model = pipeline.fit(review_texts)
a = model.predict(review_texts)
df_review['Prediction'] = a

In [None]:
# Build the lookup table (lkup_tbl) that assigns each product's ASIN to its cluster ID.
# The columns are selected by name so the result does not depend on column order.
lkup_tbl = df_review[['asin', 'Prediction']]
lkup_dict = dict(zip(lkup_tbl.asin, lkup_tbl.Prediction))
# Display only a few entries instead of dumping the whole dictionary (one entry per product)
dict(list(lkup_dict.items())[:5])


In [None]:
# Sanity-check the fitted pipeline by looking up the cluster IDs for some sample texts
text = ["I want to buy vegetables","I got sugar rush","Clean my washroom"]

# Predict the cluster labels using the pipeline. The result gets its own name so
# that `model` (the fitted pipeline) is not overwritten by an array of labels.
predicted_clusters = pipeline.predict(text)
predicted_clusters

array([93, 14, 24], dtype=int32)

In [None]:
#Loading the joblib helpers used for dumping (saving) and loading the fitted pipeline
#joblib is used for the pipeline; the lookup table is saved separately with pickle
from joblib import load
from joblib import dump

In [None]:
# Persist the fitted pipeline to disk with joblib
dump(pipeline, "text_classification1.joblib")

['text_classification1.joblib']

In [7]:
#Load the required libraries for extracting model and lookup file details
import pickle
from joblib import load

In [None]:
# Saving the lookup table (product ASIN -> cluster ID) to a pickle file
example_dict = lkup_dict

# The context manager closes the file even if pickling raises an error
with open("dict.pickle1", "wb") as pickle_out:
    pickle.dump(example_dict, pickle_out)


In [None]:
#Defining the function for performing the prediction
def recomm(text):
  """Recommend product IDs whose cluster matches the cluster predicted for the text.

  The saved pipeline ("text_classification1.joblib") and the saved lookup table
  ("dict.pickle1") are loaded from the working directory on every call.

  Args:
    text: A list (or other iterable) of strings, e.g. ["Harpic is bad"].
      A single bare string is also accepted and treated as one document.

  Returns:
    A list of at most 20 product IDs from the lookup table whose cluster ID
    equals a cluster predicted for one of the given texts. An empty input
    returns an empty list.
  """
  # Accept a bare string as a single document; materialise other iterables once
  texts = [text] if isinstance(text, str) else list(text)
  if not texts:
    return []

  pipeline = load("text_classification1.joblib")
  # NOTE(review): pickle.load can execute arbitrary code; only load a file this notebook wrote
  with open("dict.pickle1", "rb") as pickle_in:
    example_dict = pickle.load(pickle_in)

  # Comparing one cluster ID against the whole prediction array is ambiguous
  # (and raises) as soon as more than one text is given, so membership in the
  # set of predicted clusters is tested instead.
  predicted_clusters = set(pipeline.predict(texts).tolist())
  list_recomm = [asin for asin, pred in example_dict.items() if pred in predicted_clusters][:20]
  return list_recomm



In [None]:
# Recommendation for the given text, returned as a list of product IDs
query = ["Harpic is bad"]
recomm(query)

['B00005C2M2',
 'B0000DGF9V',
 'B0000SXEN2',
 'B00012182G',
 'B00013MY78',
 'B00015MLVA',
 'B0002VXZ40',
 'B0002Y121K',
 'B0005ZHOUO',
 'B0009IR4CK',
 'B000BBY7ZC',
 'B000CMHMUC',
 'B000EHP522',
 'B000EVLU9A',
 'B000EVMNLY',
 'B000EVMNOG',
 'B000EVOSE4',
 'B000EVOSH6',
 'B000EVOSHG',
 'B000EVQWJ8']