In [1]:
import numpy
import pandas as pd
 
def matrix_factorization(R, P, Q, K, steps=50, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = numpy.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break

    return P, Q.T


In [2]:
R = [
     [5,3,0,1],
     [5,3,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]
R = numpy.array(R)
print(R)
N = len(R)
M = len(R[0])
K = 3
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
print()
nP, nQ = matrix_factorization(R, P, Q, K)
#print(nP)
#print(nQ.T)
nR = numpy.dot(nP, nQ.T)
print(nR)



[[5 3 0 1]
 [5 3 0 1]
 [1 1 0 5]
 [1 0 0 4]
 [0 1 5 4]]

[[1.02406624 1.21804589 0.72905581 0.9259288 ]
 [1.1041484  1.48534766 1.13493692 1.48951161]
 [0.73863923 0.75757629 0.58861905 0.77445869]
 [1.20801835 1.61730331 1.04926495 1.34823999]
 [0.88776297 1.2578356  1.21769255 1.63769617]]


In [5]:
R = pd.read_csv('history.csv')
cust_id=list(R['CUST_ID'][:10])
R=R.drop(['CUST_ID','CLV'],axis=1)
print(R)
R=R.head(10)
names = R.columns.values
R = numpy.array(R)
N = len(R)
M = len(R[0])
K = 100
P = numpy.random.rand(N,K)
Q = numpy.random.rand(M,K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)
df=pd.DataFrame(nR,index=cust_id,columns=names)
dff=df.apply(lambda row: row.sort_values(ascending=False).index, axis=1)
print(list(dff[1001]))
print(df)


    MONTH_1  MONTH_2  MONTH_3  MONTH_4  MONTH_5  MONTH_6
0       150       75      200      100      175       75
1        25       50      150      200      175      200
2        75      150        0       25       75       25
3       200      200       25      100       75      150
4       200      200      125       75      175      200
5        25      200      125      150       50       25
6       100      175      150      100       75       50
7       150       50       50       50       75      200
8       100      125      150       25      125       75
9       100       50      175       25      125      200
10       75       50        0      150       75      175
11      200      125       50      175      125      100
12       50      200       50      200       75        0
13      150      200       25      100       75      200
14      200      200      150      150      200      200
15      175      175      100       50      175       50
16      100      100      125  

In [2]:
userItemData = pd.read_csv('ratings.csv')
userItemData.head()
print(userItemData)
#Get list of unique items
itemList=list(set(userItemData["ItemId"].tolist()))

#Get count of users
userCount=len(set(userItemData["userId"].tolist()))
#Create an empty data frame to store item affinity scores for items.
itemAffinity= pd.DataFrame(columns=('item1', 'item2', 'score'))
rowCount=0

#For each item in the list, compare with other items.
for ind1 in range(len(itemList)):
    
    #Get list of users who bought this item 1.
    item1Users = userItemData[userItemData.ItemId==itemList[ind1]]["userId"].tolist()
    print("Item 1 ", item1Users)
    
    #Get item 2 - items that are not item 1 or those that are not analyzed already.
    for ind2 in range(ind1, len(itemList)):
        
        if ( ind1 == ind2):
            continue
       
        #Get list of users who bought item 2
        item2Users=userItemData[userItemData.ItemId==itemList[ind2]]["userId"].tolist()
        print("Item 2",item2Users)
        
        #Find score. Find the common list of users and divide it by the total users.
        commonUsers= len(set(item1Users).intersection(set(item2Users)))
        score=commonUsers / userCount

        #Add a score for item 1, item 2
        itemAffinity.loc[rowCount] = [itemList[ind1],itemList[ind2],score]
        rowCount +=1
        #Add a score for item2, item 1. The same score would apply irrespective of the sequence.
        itemAffinity.loc[rowCount] = [itemList[ind2],itemList[ind1],score]
        rowCount +=1
        
#Check final result
itemAffinity

    userId  ItemId
0     1001    5001
1     1001    5002
2     1001    5005
3     1002    5003
4     1002    5004
5     1002    5005
6     1003    5001
7     1003    5002
8     1004    5001
9     1004    5004
10    1004    5005
11    1005    5002
12    1005    5004
Item 1  [1001, 1003, 1004]
Item 2 [1001, 1003, 1005]
Item 2 [1002]
Item 2 [1002, 1004, 1005]
Item 2 [1001, 1002, 1004]
Item 1  [1001, 1003, 1005]
Item 2 [1002]
Item 2 [1002, 1004, 1005]
Item 2 [1001, 1002, 1004]
Item 1  [1002]
Item 2 [1002, 1004, 1005]
Item 2 [1001, 1002, 1004]
Item 1  [1002, 1004, 1005]
Item 2 [1001, 1002, 1004]
Item 1  [1001, 1002, 1004]


Unnamed: 0,item1,item2,score
0,5001.0,5002.0,0.4
1,5002.0,5001.0,0.4
2,5001.0,5003.0,0.0
3,5003.0,5001.0,0.0
4,5001.0,5004.0,0.2
5,5004.0,5001.0,0.2
6,5001.0,5005.0,0.4
7,5005.0,5001.0,0.4
8,5002.0,5003.0,0.0
9,5003.0,5002.0,0.0


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

ds = pd.read_csv("sample-data.csv")

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]

    # First item is the item itself, so remove it.
    # Each dictionary entry is like: [(1,2), (3,4)], with each tuple being (score, item_id)
    results[row['id']] = similar_items[1:]
    
print('done!')

done!


In [4]:
# hacky little function to get a friendly item name from the description field, given an item ID
def item(id):
    return ds.loc[ds['id'] == id]['description'].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary. No real logic here.
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    print(recs)
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

# Just plug in any item id here (1-500), and the number of recommendations you want (1-99)
# You can get a list of valid item IDs by evaluating the variable 'ds', or a few are listed below

recommend(item_id=88, num=15)

Recommending 15 products similar to 88  88  Merino 1 t-shirt...
-------
[(0.08447575320919214, 90), (0.06406059113132549, 89), (0.0522122089993198, 157), (0.04428725928007625, 60), (0.041372511234393936, 178), (0.035236979581731255, 20), (0.027044793859861947, 76), (0.027044793859861943, 125), (0.024886259891796998, 81), (0.02359474334918761, 196), (0.02296181367304754, 19), (0.022466803556223797, 87), (0.02224114988514357, 172), (0.022071452401068704, 166), (0.021931117552471735, 85)]
Recommended: 90  90  Merino 2 t-shirt (score:0.08447575320919214)
Recommended: 89  89  Merino 2 polo (score:0.06406059113132549)
Recommended: 157 157 All-out capris (score:0.0522122089993198)
Recommended: 60  60  Cap 1 graphic tee (score:0.04428725928007625)
Recommended: 178 178 Ditch the car-ride t-shirt (score:0.041372511234393936)
Recommended: 20  20  Cap 1 graphic t-shirt (score:0.035236979581731255)
Recommended: 76  76  L/s island hopper shirt (score:0.027044793859861947)
Recommended: 125 125 S/s is

In [1]:
 def frequently_bought(products,limit,filter_list):
        filter_list= ast.literal_eval(filter_list)
	current_ids=request.form.get("product_ids")
	current_website_id=request.form.get('current_website_id')
        set_list=index_set(filter_list,'fre')
        if current_ids:
              current_ids=ast.literal_eval(current_ids.encode("utf8"))
	      final_list=[]
              for sentence in current_ids:
		product_id=sentence
		df=pd.read_csv("AR_order_item.csv")
		df1=df.loc[df['product_id']==int(sentence)]
		uid=list(set(df1['order_id'].tolist()))
                iddf=df.loc[df['order_id'].isin(uid)]
		ordered_products=iddf['product_id'].tolist()
                print(Counter(ordered_products).most_common(2))
                pp=pd.DataFrame()
                pp['id']=[i[0] for i in Counter(ordered_products).most_common()]
		pp['fre']=[i[1] for i in Counter(ordered_products).most_common()]              
		productdf=pd.read_csv("AR_all.csv")
                pp = pp.merge(productdf, on='id', how="left").fillna(0)
                df =  pp.loc[(pp['is_salable'].str.contains('True')) & (pp['website_ids'].str.contains(str(current_website_id))) & (pp['status']==1) & (pp['visibility']!=1)]
                df=total(df)
                worddf=df.sort_values(set_list,ascending=False) 
                word=worddf['id'].tolist()
                word.remove(int(sentence))        
       		word=word[:int(limit)]	
		final_list.extend(word)
       	      return (final_list)

TabError: inconsistent use of tabs and spaces in indentation (<ipython-input-1-a26f7d933b7f>, line 3)

In [39]:
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.preprocessing import LabelBinarizer
 corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
 ]
 vectorizer = CountVectorizer()
 X = vectorizer.fit_transform(corpus)
 from sklearn.metrics.pairwise import cosine_similarity

 cosine_sim2 = cosine_similarity(X, X)
 print(X.toarray())
 print(cosine_sim2)
 print(vectorizer.get_feature_names())
 #print(X.toarray())  
 print('hdkahadkjsah')
 vectorizer1 = TfidfVectorizer()
 X1 = vectorizer1.fit_transform(corpus)
 cosine_sim2 = cosine_similarity(X1, X1)
 print(cosine_sim2)
 print(vectorizer1.get_feature_names())
 print(X1.toarray())
 print(X1)
 vectorizer2 = LabelBinarizer()
 X2 = vectorizer2.fit_transform(corpus)
# print(vectorizer2.get_feature_names())

 print(X2)

[[0 0 1 1 1 0 0 1 0 1]
 [0 0 2 0 1 0 1 1 0 1]
 [0 1 0 0 1 1 0 1 1 1]
 [1 0 1 1 1 0 0 1 0 1]]
[[1.         0.79056942 0.54772256 0.91287093]
 [0.79056942 1.         0.4330127  0.72168784]
 [0.54772256 0.4330127  1.         0.5       ]
 [0.91287093 0.72168784 0.5        1.        ]]
['3gb', 'and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
hdkahadkjsah
[[1.         0.64692568 0.30777187 0.8053723 ]
 [0.64692568 1.         0.22523955 0.52101602]
 [0.30777187 0.22523955 1.         0.24787094]
 [0.8053723  0.52101602 0.24787094 1.        ]]
['3gb', 'and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0.         0.         0.46979139 0.58028582 0.38408524 0.
  0.         0.38408524 0.         0.38408524]
 [0.         0.         0.6876236  0.         0.28108867 0.
  0.53864762 0.28108867 0.         0.28108867]
 [0.         0.51184851 0.         0.         0.26710379 0.51184851
  0.         0.26710379 0.51184851 0.26710379]
 [0.59276931 0.      