In [18]:
import pandas as pd
import numpy as np
#!pip install tqdm
from tqdm import tqdm

This notebook accomplishes the following goals:

1. Creates a recommender class from scratch using collaborative filtering and jaccard similarity
2. Creates a data class to load and manipulate the E-Corp data sets
3. Loads and transforms E-Corp data; instantiates and trains item-item rec engine; generates, prints and saves recs

### Define Recommender Class

In [None]:
class Recommender:
    def __init__(self, data, user_col, item_cols, cf_method='item', similarity='pearson'):
        '''init Recommender class'''
        self.data = data
        self.user_col = user_col
        self.item_cols = item_cols
        self.cf_method = cf_method
        self.similarity = similarity
        self.similarity_matrix = []
        self.user_scores = []
        self.recs = []

    def create_similarity_matrix(self):
        '''creates correlation/similarity matrix for all items and stores result and self.similarity_matrix'''
        self.similarity_matrix = self._create_empty_df(self.cf_method)
        self._fill_similarity_matrix(self.similarity_matrix, self.similarity)

    def score_users(self, users=None):
        '''generates item ratings for each item for each user and stores result as self.user_scores'''
        if not users:
            # grab all users in data by default
            users = self.data.loc[:,self.user_col]
        cols = [self.user_col] + list(self.item_cols)
        user_data = self.data.loc[:,cols].set_index(self.user_col)
        self.user_scores = pd.DataFrame(index=users, columns=self.item_cols)
        self.user_scores = self.data[self.item_cols].dot(self.similarity_matrix)
                
    def score_new_users(self, users, user_data):
        '''generates item ratings for users passed in from external data set and stores result as self.user_scores'''
        cols = [self.user_col] + list(self.item_cols)
        self.user_scores = pd.DataFrame(index=user_data.index, columns=self.item_cols)
        self.user_scores = user_data.loc[self.item_cols].dot(self.similarity_matrix)        
                
    def generate_recs(self, users=None, num_recs=5):
        '''generates top num_rec recommendations for users and stores result as self.recs'''
        if not users:
            # grab all users in data by default
            users = self.data.loc[:,self.user_col]
        cols = ['Rec ' + str(x) for x in range(1,num_recs+1)] + ['Score ' + str(x) for x in range(1,num_recs+1)]
        self.recs = pd.DataFrame(index=users, columns=cols)
        progress_bar = tqdm(total = len(users), mininterval=5)
        for user in users:
            progress_bar.update()
            sorted_items = self.user_scores.sort_values(by=user, ascending=False, axis=1).loc[user,:].index
            for i in range(num_recs):
                item = sorted_items[i]
                item_col = cols[i]
                score_col = cols[i+num_recs]
                self.recs.loc[user, item_col] = item
                self.recs.loc[user, score_col] = self.user_scores.loc[user, item]
        self.recs.reset_index(inplace=True, drop=False)

    def print_recs(self):
        '''prints self.recs to stdout'''
        print(self.recs)
        
    def save_recs(self, filename='recommendations', format='excel'):
        '''saves self.recs to filename in specified format'''
        if format == 'excel':
            extension ='.xlsx'
            self.recs.to_excel(filename + extension, index=False)
        elif format == 'csv':
            extension += '.csv'
            self.recs.to_csv(filename + extension, index=False)
        else:
            raise ValueError('Invalid file format.  Please specify "excel" or "csv".')
  
    def _create_empty_df(self, cf_type):
        '''creates and returns empty df with users or items as rows and columns'''
        if cf_type == 'item':
            labels = self.item_cols
        elif cf_type == 'user':
            labels = self.data[user_col]
        else:
            raise ValueError('Invalid collaborative filtering technique.  Please specify "item" or "user".')
        return pd.DataFrame(index=labels, columns=labels)

    def _fill_similarity_matrix(self, similarity_matrix, similarity):
        '''calculates correlation between items using specified similarity and saves results in similarity_matrix
           valid similarity types: jaccard, pearson, cosine'''
        k=0
        item_df = self.data[self.item_cols] 
        #print(item_df)
        progress_bar = tqdm(total = similarity_matrix.shape[0], mininterval=5)
        for i in range(similarity_matrix.shape[0]):
            progress_bar.update()
            similarity_matrix.ix[i,i] = 1.0
            x = item_df.ix[:,i]
            for j in range(i,similarity_matrix.shape[1]):
                y = item_df.ix[:,j]
                similarity_matrix.ix[i,j] = self._get_similarity(x, y, similarity)
                similarity_matrix.ix[j,i] = similarity_matrix.ix[i, j]
                
    def _get_similarity(self, x, y, similarity):
        '''calculated specified correlation between two vectors and returns result'''
        if similarity == 'pearson':
            return self._pearson_similarity(x, y)
        elif similarity == 'jaccard':
            return self._jaccard_similarity(x, y)
        elif similarity == 'cosine':
            return self._cosine_similarity(x, y)
        else:
            raise ValueError('Invalid similarity type.  Please specify "cosine", "pearson", or "jaccard".')
        
    def _pearson_similarity(self, x, y):
        '''returns pearson correlation between x and y: covariance(x,y)/(std_dev(x)*std_dev(y))'''
        #effective if data can be transformed to normal distribution 
        pass

    def _jaccard_similarity(self, x, y):
        '''returns jaccard correlation between x and y: |intsection(x,y)|/|union(x,y)|'''
        #ideal for binary data, e.g. buy vs non-buy
        nonzero_x = set(np.nonzero(x)[0])
        nonzero_y = set(np.nonzero(y)[0])
        intersection_size = len(nonzero_x.intersection(nonzero_y))
        union_size = len(nonzero_x.union(nonzero_y))
        if union_size == 0:
            return 0
        else:
            return intersection_size/union_size

    def _cosine_similarity(self, x, y):
        '''returns cosine of angles between x and y'''
        pass
    

### Define Data Class

In [19]:
class Data:
    def __init__(self):
        '''init Data class'''
        self.data = None
        
    def load_data(self, filename, format='txt'):
        '''loads data from excel, csv, tsv or txt file'''
        if format == 'excel':
            self.data = pd.read_excel(filename)
        elif format == 'csv':
            self.data = pd.read_csv(filename)
        elif format == 'tsv':
            self.data = pd.read_csv(filename, sep='\t')
        elif format == 'txt':
            self.data = pd.read_table(filename)
        else:
            raise ValueError('Invalid file format. Please specify "excel","csv","tsv" or "txt".')
    
    def drop_small_orders(self, order_col = 'order_number', min_order_size=2):
        '''drop orders from self.data that have min_order_size or less unique items in basket'''
        self.data = self.data[self.data.groupby('order_number').order_number.transform(len) >= min_order_size]
    
    def expand_columns(self, columns=[]):
        '''performs one-hot encoding on specified columns and appends them to self.data'''
        dfs = []
        dfs.append(self.data)
        for col in columns:
            dfs.append(pd.get_dummies(self.data[col], prefix=None, sparse=False))
            self.data = pd.concat(dfs, axis=1)
    
    def drop_columns(self, columns=[]):
        '''drops columns from self.data'''
        self.data.drop(columns, axis=1, inplace=True)
    
    def consolidate_orders(self, order_col='order_number'):
        '''consolidates each order in self.data into single record. Order number is maintained and all other columns summed.'''
        data_cols = self.data.columns
        data_cols.drop(order_col)
        self.data = self.data.groupby(order_col).sum()[data_cols].reset_index()

### Set Data Flow

In [20]:
load_and_process_data = True
get_columns = True
run_rec_engine = True

### Load text file into DataFrame and process data

In [22]:
# drop orders with few items, one-hot encode l3 category information, drop unnecessary columns and consolidate unique orders into single records
if load_and_process_data:
    data = Data()
    data.load_data('/Users/sailalithasadhu/Desktop/DSDJ/Projects/ProductRecommendation/All Transations - 2 Weeks.txt', format = 'txt')
    data.drop_small_orders(order_col = 'order_number', min_order_size = 20)
    data.expand_columns(['l3'])
    data.drop_columns(['l1','l2','l3','sku','brand'])
    data.consolidate_orders(order_col='order_number')

KeyError: "['order_number'] not in index"

### Grad column names

In [21]:
if get_columns:
    user_col = 'order_number'
    item_cols = list(data.data.columns)
    item_cols.remove(user_col)

### Run recommendation engine and generate results

In [None]:
if run_rec_engine:
    rec_engine = Recommender(data.data, user_col=user_col, item_cols=item_cols, cf_method='item', similarity = 'jaccard')
    rec_engine.create_similarity_matrix()
    rec_engine.score_users()
    rec_engine.generate_recs()
    rec_engine.save_recs()
    rec_engine.print_recs()