In [1]:
import pandas as pd
import utils as util
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.cluster import KMeans
from matplotlib import pyplot

In [2]:
def readDataSet():
    filename = const.DATA_SET_NAME
    if filename == "data.txt":
        list_of_products_sentiment = {}
        file_instance = open(filename, 'r')
        dataset = file_instance.read()
        dataset = dataset.split(",")
        for i in range(0, len(dataset)-1, 2):
            list_of_products_sentiment[dataset[i]] = int(dataset[i+1])

        return list_of_products_sentiment

    else:
        data_frame = pd.read_csv(filename)

        return data_frame

def vectorizing_data_frame(train, test):
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    train_matrix = vectorizer.fit_transform(train['Summary'])
    test_matrix = vectorizer.transform(test['Summary'])
    X_train = train_matrix
    X_test = test_matrix
    y_train = train['sentiment']
    y_test = test['sentiment']
    return (X_train, y_train), (X_test, y_test), vectorizer.get_feature_names_out()

def partitionDataFrame(data_frame):
    part_data_frame = data_frame.sample(frac = const.DATA_SET_PARTITION_SIZE)
    rest_data_frame = data_frame.drop(part_data_frame.index)
    return rest_data_frame

def dataFrameProcess(data_frame):
    data_frame = data_frame[data_frame['Score'] != 3]
    data_frame['sentiment'] = data_frame['Score'].apply(lambda rating : +1 if rating > 3 else -1)
    data_frame = data_frame.dropna(subset=['Summary'])
    return data_frame

def removePunctuation(text):
    cleanText = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"'))
    return cleanText

def splitDataSet(data_frame):
    index = data_frame.index
    data_frame['random_number'] = np.random.randn(len(index))
    train = data_frame[data_frame['random_number'] <= 0.8]
    test = data_frame[data_frame['random_number'] > 0.8]
    return train, test

In [3]:
data_frame = util.readDataSet()
data_frame = util.partitionDataFrame(data_frame)
data_frame = dataFrameProcess(data_frame)
data_frame['sentence'] = data_frame['Summary'].apply(util.removePunctuation)
train, test = splitDataSet(data_frame)

In [4]:
(X_train, y_train), (X_test, y_test), feature_names = vectorizing_data_frame(train, test)

In [5]:
X_train

<82913x14378 sparse matrix of type '<class 'numpy.int64'>'
	with 337496 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [7]:
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
predictions = lr.predict(X_test)
print(predictions)

[1 1 1 ... 1 1 1]


In [9]:
test['prediction'] = predictions

In [10]:
test['prediction'].value_counts()

 1    19677
-1     2664
Name: prediction, dtype: int64

In [11]:
test = test[test['prediction'] == 1]

In [12]:
data_frame

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment,sentence,random_number
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,Good Quality Dog Food,0.352506
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,1,Yay Barley,-0.726876
10,11,B0001PB9FE,A3HDKO7OW0QNK4,Canadian Fan,1,1,5,1107820800,The Best Hot Sauce in the World,I don't know if it's the cactus or the tequila...,1,The Best Hot Sauce in the World,-0.134863
13,14,B001GVISJM,A18ECVX2RJ7HUE,"willie ""roadie""",2,2,4,1288915200,fresh and greasy!,good flavor! these came securely packed... the...,1,fresh and greasy,1.030763
19,20,B001GVISJM,A3IV7CL2C13K2U,Greg,0,0,5,1318032000,Home delivered twizlers,Candy was delivered very fast and was purchase...,1,Home delivered twizlers,-1.454132
...,...,...,...,...,...,...,...,...,...,...,...,...,...
568434,568435,B003XUL27E,ABGQPE97ZVYJ3,Katherine Kelly,0,0,2,1306368000,Not so good,This soup is mostly broth. Although it has a k...,-1,Not so good,0.789485
568440,568441,B005ZC0RRO,A2TO5R8QLIITEF,SAK,1,1,5,1323734400,"Delicious, all natural and allergy free treats!",Indie Candy's gummies are absolutely delicious...,1,"Delicious, all natural and allergy free treats",-0.739993
568444,568445,B001EO7N10,A2SD7TY3IOX69B,"BayBay ""BayBay Knows Best""",3,3,5,1245369600,Best Value for Chinese 5 Spice,"As a foodie, I use a lot of Chinese 5 Spice po...",1,Best Value for Chinese 5 Spice,-1.528808
568445,568446,B001EO7N10,A2E5C8TTAED4CQ,S. Linkletter,2,2,5,1268006400,Five Spice Powder,"You can make this mix yourself, but the Star A...",1,Five Spice Powder,2.265428


In [13]:
new_data_frame = pd.DataFrame()
new_data_frame['product_id'] = test['ProductId']
new_data_frame['client_id'] = test['UserId']

In [14]:
new_data_frame.dropna()

Unnamed: 0,product_id,client_id
13,B001GVISJM,A18ECVX2RJ7HUE
69,B000E7VI7S,AWCBF2ZWIN57F
75,B001EPPI84,A27TZ4WBU7N0YF
93,B0019CW0HE,A3AF72GP4GVRY1
123,B003SE19UK,A9GRWS6KP8SMA
...,...,...
568392,B001EQ5O6Y,A1XDASQ60YMQLN
568400,B001EQ5O6Y,A3AIZS4BZXUKIP
568411,B0018CLWM4,AUX1HSY8FX55S
568417,B0000D16IP,A1SOL9F0I9D3A4


In [15]:
X = new_data_frame[['product_id','client_id']]

In [16]:
X.client_id = new_data_frame.client_id.astype('category').cat.codes
X.product_id = new_data_frame.product_id.astype('category').cat.codes

In [17]:
X['product_id'].value_counts()

7207     32
10597    31
7206     29
7208     27
6483     27
         ..
1159      1
10387     1
6731      1
6953      1
5215      1
Name: product_id, Length: 10977, dtype: int64

In [18]:
grouped_df = X.groupby('client_id')
list_of_client_liked_product_dict = {}
for key, item in grouped_df:
    dict = {}
    list_of_products = []
    client = grouped_df.get_group(key).iloc[0]['client_id']
    for data in grouped_df.get_group(key).values.tolist():
        list_of_products.append(data[0])
    dict[client] = list_of_products
    list_of_client_liked_product_dict.update(dict)

In [42]:
list_of_products = list(list_of_client_liked_product_dict.values())

In [44]:
for pr in list_of_products:
    print(pr)

[10439]
[10027]
[10026]
[10439]
[10026]
[9663]
[10027]
[10026]
[10244]
[10705]
[10026]
[10025]
[10026]
[9964]
[10439]
[10439]
[10439]
[10243]
[10864]
[10864]
[10439]
[10439]
[10025]
[10705]
[10439]
[10244]
[10671]
[8545]
[7956]
[886]
[4643]
[3341]
[3871]
[4034]
[2036]
[2492]
[1129, 1130]
[9699]
[10078]
[10712]
[13]
[9406]
[10968, 10313, 10295]
[9875]
[10965]
[6368]
[3876]
[2145]
[1604]
[10349]
[10771, 4480]
[8912]
[2762, 1522, 2097]
[3792]
[4163]
[5020, 2440]
[6867]
[10861]
[1116]
[6761]
[1218]
[4815]
[526]
[7092]
[6725]
[10214, 3261, 10215]
[6765]
[2728]
[4430]
[3604]
[10832]
[10233]
[9392]
[1991]
[465]
[7651]
[1273]
[7208]
[7622]
[2400]
[9131]
[8020]
[6493]
[10075]
[190]
[5042]
[2012]
[6379]
[1187]
[5135]
[9198]
[10727]
[1179]
[4964]
[5038]
[4412]
[10710]
[4894]
[10895]
[5776]
[1974]
[8289]
[7620]
[8298]
[4050]
[10613]
[7]
[4521]
[7613]
[7894]
[2947]
[3401]
[10236]
[8216]
[1009]
[5828]
[223]
[5025]
[2420]
[9812]
[8878, 10243]
[9202]
[6142]
[8837]
[9232]
[2260]
[10481]
[2833, 10496]
[

In [49]:
list_of_products = [[1,2,4,5], [1,4,9,10], [2,4,6,8], [1,2,4,8]]

In [36]:
def common(list_of_client_1, list_of_client_2):
    set_of_client_1 = set(list_of_client_1)
    set_of_client_2 = set(list_of_client_2)
    return set_of_client_1 & set_of_client_2 if set_of_client_1 & set_of_client_2 else []

def unCommon(list1, list2):
    result = [i for i in list1 if i not in list2]
    return set(result)

def calculateWeight(length_of_client_like_products, len_of_common_products):
    return float( 1 - ((length_of_client_like_products - len_of_common_products)/length_of_client_like_products))

def findMaxWeight(input_list):
    max_value = max(input_list)
    index_value = [index for index in range(len(input_list)) if input_list[index] == max_value]
    return index_value

In [37]:
def partition(lst, size):
    for i in range(0, len(lst), size):
        yield lst[i : i+size]

In [38]:
from threading import Thread
class ThreadWithReturnValue(Thread):
    def __init__(self, group=None, target=None, name=None,
                 args=(), kwargs={}, Verbose=None):
        Thread.__init__(self, group, target, name, args, kwargs)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args,
                                                **self._kwargs)
    def join(self, *args):
        Thread.join(self, *args)
        return self._return

In [39]:
def clustering(part_list_of_products, list_of_products):
    new_list = []
    for i in range(0, len(part_list_of_products)):
        for j in range(0, len(list_of_products)):
            if i == j:
                continue
            client_list = []
            weight = []
            dict = {}
            dict["i"] = common(part_list_of_products[i], list_of_products[j])
            dict["j"] = unCommon(part_list_of_products[i], dict["i"])
            weight.append(calculateWeight(len(part_list_of_products[i]), len(dict["i"])))
            client_list.append(dict)
            j += 1
        i += 1
        client_list.append(weight)
        new_list.append(client_list)
    return new_list

In [40]:
thread_list_of_products = list(partition(list_of_products, 100))

list_of_thread_object = []
for product_list in thread_list_of_products:
    twrv = ThreadWithReturnValue(target=clustering, args = (product_list, list_of_products))
    twrv.start()
    list_of_thread_object.append(twrv)

new_list = []
for thread in list_of_thread_object:
    thread_new_list = thread.join()
    new_list.extend(thread_new_list)

In [41]:
for data in new_list:
    print(data)

[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10027}}, [0.0]]
[{'i': [], 'j': {10026}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10026}}, [0.0]]
[{'i': [], 'j': {9663}}, [0.0]]
[{'i': [], 'j': {10027}}, [0.0]]
[{'i': [], 'j': {10026}}, [0.0]]
[{'i': [], 'j': {10244}}, [0.0]]
[{'i': [], 'j': {10705}}, [0.0]]
[{'i': [], 'j': {10026}}, [0.0]]
[{'i': [], 'j': {10025}}, [0.0]]
[{'i': [], 'j': {10026}}, [0.0]]
[{'i': [], 'j': {9964}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10243}}, [0.0]]
[{'i': [], 'j': {10864}}, [0.0]]
[{'i': [], 'j': {10864}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10025}}, [0.0]]
[{'i': [], 'j': {10705}}, [0.0]]
[{'i': [], 'j': {10439}}, [0.0]]
[{'i': [], 'j': {10244}}, [0.0]]
[{'i': [], 'j': {10671}}, [0.0]]
[{'i': [], 'j': {8545}}, [0.0]]
[{'i': [], 'j': {7956}}, [0.0]]
[{'i': [], 'j': {886}}, [0.0]]
[{'i': [], 'j': 

In [34]:
list_of_potential_likes = []

for i in range(0, len(new_list)):
    list_of_likes = []
    max_weight_list = findMaxWeight(new_list[i][-1])
    for weight_index in max_weight_list:
        for product in new_list[i][weight_index].get('i'):
            list_of_likes.append(product)
        
    list_of_potential_likes.append(set(list_of_likes))

print(list_of_potential_likes)

[set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set(), set()