### Creating data for our example

### Logistic Regression

In [2]:
import numpy as np
import pandas as pd
n_products = 5000

# Featured products
numpy_serie = np.array(np.random.randint(0, 2, size=(n_products, 26)))
df = pd.DataFrame(numpy_serie, columns=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

# New product
df.to_csv('product_features.csv', index_label=False)
print(df.head())

# New product
numpy_serie = np.array(np.random.randint(0, 2, size=(1, 25)))
df_target = pd.DataFrame(numpy_serie, columns=list('ABCDEFGHIJKLMNOPQRSTUVWXY'))
df_target.to_csv('product.csv', index_label=False)
print(df_target.head())

   A  B  C  D  E  F  G  H  I  J ...  Q  R  S  T  U  V  W  X  Y  Z
0  1  1  0  1  1  0  0  0  0  1 ...  0  0  1  0  1  1  1  0  0  0
1  0  1  0  1  1  0  0  0  1  1 ...  1  0  1  1  1  0  1  0  0  0
2  1  0  0  1  1  0  0  0  1  0 ...  1  1  0  1  1  1  0  1  1  0
3  1  1  0  0  1  0  1  1  1  1 ...  1  1  0  1  0  1  0  0  0  0
4  0  0  1  0  0  0  1  0  0  0 ...  0  1  1  0  1  1  1  0  1  0

[5 rows x 26 columns]
   A  B  C  D  E  F  G  H  I  J ...  P  Q  R  S  T  U  V  W  X  Y
0  0  1  1  1  1  1  1  1  0  0 ...  1  1  0  0  1  0  1  1  1  1

[1 rows x 25 columns]


In [18]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

new_client = pd.read_csv('product.csv')  # new client to be classified
product_featured = pd.read_csv('product_features.csv')  # featured data available

# DataFrame with columns from A to Z
X = product_featured.iloc[:, :-1].values  # features are columns from A-Y
y = product_featured.iloc[:, -1].values  # label is column Z

x = new_client.values  # what we want to classified

model = LogisticRegression()
model_trained = model.fit(X, y)
y_pred = model_trained.predict(x)
print(f"Result: {y_pred}")

Y_pred = model_trained.predict(X)
print(classification_report(y, Y_pred))
print("Is this person valid for this Product? {}".format("Yes" if y_pred[0] else "No"))

Result: [1]
             precision    recall  f1-score   support

          0       0.53      0.42      0.47      2440
          1       0.54      0.64      0.58      2560

avg / total       0.53      0.53      0.53      5000

Is this person valid for this Product? Yes


### K-Nearest Neighbors

In [21]:
import numpy as np
import pandas as pd
n_products = 50

# From A to Z are our features
# 100 products. each product has some features,
# of course most of the ti
# mes will be zero because a product just have few features
numpy_serie = np.array(np.random.randint(0, 4, size=(1, 26)))
df_target = pd.DataFrame(numpy_serie, columns=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'), index=['myProduct'])
df_target.index.name = 'productName'

# Featured products
numpy_serie = np.array(np.random.randint(0, 4, size=(99, 26)))
df = pd.DataFrame(numpy_serie, columns=list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
df['productName'] = pd.util.testing.rands_array(5, 99)
df.set_index('productName', inplace=True)

df = df.append(df_target)
df.to_csv('product_features.csv', index_label=False)

# New product
df_target.to_csv('product.csv', index_label=False)

In [22]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# This is the number of closer products I want (neighbors). 
n_neighbors = 5

# This is the product that user is currently viewing
product = pd.read_csv('product.csv')

# This is the list of rated products. featured data
product_featured = pd.read_csv('product_features.csv')
X = product_featured.values

# so we need 5 more product_featured similar to the product
model = NearestNeighbors(n_neighbors=n_neighbors)
trained_model = model.fit(X)

result = trained_model.kneighbors(product)

print('Product to label')
print(product)
print('Product closest by features are {}'.format(list(result[1][0])))
print(product_featured.iloc[list(result[1][0])])

Product to label
           A  B  C  D  E  F  G  H  I  J ...  Q  R  S  T  U  V  W  X  Y  Z
myProduct  0  0  1  0  1  1  0  3  0  1 ...  2  0  3  1  1  3  3  3  2  1

[1 rows x 26 columns]
Product closest by features are [99, 86, 80, 88, 16]
           A  B  C  D  E  F  G  H  I  J ...  Q  R  S  T  U  V  W  X  Y  Z
myProduct  0  0  1  0  1  1  0  3  0  1 ...  2  0  3  1  1  3  3  3  2  1
XtthT      0  0  0  1  0  1  1  2  1  0 ...  2  0  2  2  3  1  3  1  0  1
DeBov      1  0  3  1  1  2  1  2  1  0 ...  3  3  3  1  1  2  1  3  3  3
uAMJs      0  2  3  2  1  1  1  1  0  2 ...  0  1  3  1  2  2  1  2  1  2
69yxP      0  3  2  2  2  2  1  3  0  1 ...  3  0  1  3  3  3  0  2  2  2

[5 rows x 26 columns]


### Collaborative filtering

In [30]:
import numpy as np
import pandas as pd

n_products = 50
n_people = 1000

df = pd.DataFrame()
df['userID'] = np.random.randint(0, n_people, size=10000)  # 1000 people
df['productID'] = np.random.randint(0, n_products, size=df.userID.size)  # 50 products
df['rating'] = np.random.randint(0, 4, size=df.userID.size)  # rating from 1-4

s = df.drop_duplicates(subset=['userID', 'productID'])
s.to_csv('product_ratings.csv', index_label=False)

df2 = pd.DataFrame()
df2['productName'] = pd.util.testing.rands_array(10, n_products)
df2.to_csv('product_list.csv', index_label=False)

In [45]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

"""
1000 people with 50 products. We got 10000 recommendations
so our matrix will have many empty places. No worries, it is not a problem
"""
name = 'CNIwdVaBLH'
corr_value = 0.7

# model source
ratings_df = pd.read_csv('product_ratings.csv')
products_df = pd.read_csv('product_list.csv')

# user is interested in a ransom product from products_df
name = products_df['productName'][7]

# Utility  1000x50  1000 people x 50 product
ratings_pivot = pd.pivot_table(
    data=ratings_df,
    values='rating',
    index='userID',
    columns='productID',
    fill_value=0
)

# Transposing
X = ratings_pivot.T  # 50x1000  50 product x 1000 people

# Truncate to build a matrix with 'features' (12 but you can change) and get M = L x U
SVD = TruncatedSVD(n_components=12, random_state=17)
decomposed_matrix = SVD.fit_transform(X)  # L = 50x12  50 products x 12 features

# correlation matrix on L Matrix, we get 50x50, diagonal matrix
corr_mat = np.corrcoef(decomposed_matrix) 

# get the product id input by user
index_column = products_df[products_df['productName'] == name].index[0]
index_column == 7

# get the correlation values (vector) of the product respect to the rest,
# If user search for corr. of 5th product, the 5th position of the vector is 1 (diagonal)
product_corr_values = corr_mat[index_column]

# mask values of dataframe with values of corr higher than....
products_df[product_corr_values >= corr_value]

Unnamed: 0,productName
6,JwaxbMdmPd
7,w2LNC2XNS9
17,SP6k8dCCvY
21,uqQZNuVAyz
23,8euPUfKdEE
31,W1x89yPJ5s
45,DNmuxgYTMo
48,ZzPRC3eHg8
