In [1]:
# READ DATA
import pandas as pd
dfs = pd.read_excel('OnlineRetail.xlsx', sheet_name='OnlineRetail')

In [2]:
# statistics of Data
dfs.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [3]:
#UNIQUE Values 
#exploring the unique values of each attribute
print("Number of transactions: ", dfs['InvoiceNo'].nunique())
print("Number of products bought: ",dfs['StockCode'].nunique())
print("Number of customers:", dfs['CustomerID'].nunique() )
print("Percentage of customers NA: ", round(dfs['CustomerID'].isnull().sum() * 100 / len(dfs),2),"%" )
print('Number of countries: ',dfs['Country'].nunique())



('Number of transactions: ', 25900)
('Number of products bought: ', 4070)
('Number of customers:', 4372)
('Percentage of customers NA: ', 24.0, '%')
('Number of countries: ', 38)


In [4]:


# Note: The number of NA customers is quite large and that would impact the results.

# This dataframe contains 8 variables that correspond to:

# InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.
# StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.
# Description: Product (item) name. Nominal.
# Quantity: The quantities of each product (item) per transaction. Numeric.
# InvoiceDate: Invoice Date and time. Numeric, the day and time when each transaction was generated.
# UnitPrice: Unit price. Numeric, Product price per unit in sterling.
# CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.
# Country: Country name. Nominal, the name of the country where each customer resides.

#Now, let's have an idea about the quantitative data (Quantity & UnitPrice).


In [12]:
# Data Pre-Processing Steps
######################------------------------------#####################
#-----------------------------------------------------------------------#


In [5]:
#######--------------------------------------------------------------------------------------------------------------#####
# Cancelled invoices

# As mentioned in the description of the dataset, some InvoiceNo start with the letter "c" = cancelled. Let's see if our hypothesis is correct about the negative quantity: -80995.
# We will look for the list of cancelled invoices and check if there is an invoice with that quantity.


In [6]:
#get canceled transactions
cancelled_orders = dfs[dfs['InvoiceNo'].astype(str).str.contains('C')]
cancelled_orders.head()



Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom


In [7]:
# How many canceled orders do we have?

In [8]:
#check how many rows our dataframe of cancelled orders contain
print("We have ",len(cancelled_orders), " cancelled orders.")
#percentage of cancelled orders in total orders
total_orders = dfs['InvoiceNo'].nunique()
cancelled_number = len(cancelled_orders)
print('Percentage of orders canceled: {}/{} ({:.2f}%) '.format(cancelled_number, total_orders, cancelled_number/total_orders*100))

('We have ', 9288, ' cancelled orders.')
Percentage of orders canceled: 9288/25900 (0.00%) 


In [9]:
#remove canceled orders
dfs = dfs[dfs['Quantity']>0]
dfs.shape

(531285, 8)

In [10]:
#remove rows where customerID are NA
dfs.dropna(subset=['CustomerID'],how='all',inplace=True)
dfs.shape

(397924, 8)

In [12]:
##### Build the Matrix [User,Items]
List_Unique_items = dfs['Description'].unique()
List_Unique_items = dfs['StockCode'].unique()

In [13]:
List_Unique_items.shape

(3665,)

In [14]:
List_Unique_users = dfs['CustomerID'].unique()
List_Unique_users.shape

(4339,)

In [15]:
###### Multiplying Quantity and UnitPrice columns to get a new column : AmountSpend########
dfs['AmountSpend'] = dfs['Quantity']*dfs['UnitPrice']

In [16]:
dfs.tail(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,AmountSpend
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France,10.2
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France,12.6
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France,16.6
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France,16.6
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France,14.85


In [17]:
#### Check if there is a negative AmountSpend in the filtered database
#  len(DataFrame.index) is equal to number of rows
negative_amountSpend = dfs[dfs['AmountSpend'] < 0]
len(negative_amountSpend.index)

0

In [18]:
###  Here we can see that there is no negative amount spend
## now to think about how to create User, Item matrix 
## My approach is to consider how much money did each User spent on each Item
## 


In [19]:
## Keeping relevant coluns from the Database
matrix_setup = dfs[['StockCode','CustomerID','AmountSpend']]
matrix_setup.shape

(397924, 3)

In [20]:
matrix_setup.describe()

Unnamed: 0,CustomerID,AmountSpend
count,397924.0,397924.0
mean,15294.315171,22.394749
std,1713.169877,309.055588
min,12346.0,0.0
25%,13969.0,4.68
50%,15159.0,11.8
75%,16795.0,19.8
max,18287.0,168469.6


In [21]:
List_Customers = dfs['CustomerID'].unique()
List_Items = dfs['StockCode'].unique()

In [22]:
import numpy as np
max_amt_spend = max(matrix_setup['AmountSpend'])
max_amt_spend

168469.6

In [23]:
a = []
a = matrix_setup.groupby('AmountSpend')
#a.count()

In [24]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.figure(figsize=(80, 20), dpi=180)

<matplotlib.figure.Figure at 0xd926eb0>

<matplotlib.figure.Figure at 0xd926eb0>

In [25]:
#plt.plot(a['CustomerID'])
plt.show()

In [27]:
matrix_setup.head()

Unnamed: 0,StockCode,CustomerID,AmountSpend
0,85123A,17850.0,15.3
1,71053,17850.0,20.34
2,84406B,17850.0,22.0
3,84029G,17850.0,20.34
4,84029E,17850.0,20.34


In [35]:
matrix_setup['Duplicate']=matrix_setup.duplicated(subset=['StockCode','CustomerID'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [46]:
matrix_setup.groupby('Duplicate').count()

Unnamed: 0_level_0,StockCode,CustomerID,AmountSpend
Duplicate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,266802,266802,266802
True,131122,131122,131122


In [48]:
matrix_setup['Total_amount'] = matrix_setup.groupby(['StockCode','CustomerID'])['AmountSpend'].transform('sum')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
#matrix_setup.groupby(['StockCode' =' 85123A','CustomerID' =' 17850.0'])
matrix_setup.loc[(matrix_setup['StockCode'] == '85123A') & (matrix_setup['CustomerID'] == 17850.0)]

Unnamed: 0,StockCode,CustomerID,AmountSpend,Duplicate,Total_amount
0,85123A,17850.0,15.3,False,315.9
49,85123A,17850.0,15.3,True,315.9
66,85123A,17850.0,15.3,True,315.9
278,85123A,17850.0,15.3,True,315.9
416,85123A,17850.0,20.4,True,315.9
3118,85123A,17850.0,15.3,True,315.9
3142,85123A,17850.0,15.3,True,315.9
3192,85123A,17850.0,15.3,True,315.9
3224,85123A,17850.0,15.3,True,315.9
3425,85123A,17850.0,15.3,True,315.9


In [71]:
# DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)[source]
mtarix_toGO = matrix_setup.drop_duplicates(subset = ['StockCode','CustomerID'], keep = 'first')

In [72]:
mtarix_toGO.loc[(matrix_setup['StockCode'] == '85123A') & (matrix_setup['CustomerID'] == 17850.0)]

Unnamed: 0,StockCode,CustomerID,AmountSpend,Duplicate,Total_amount
0,85123A,17850.0,15.3,False,315.9


In [74]:
mtarix_toGO.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266802 entries, 0 to 541908
Data columns (total 5 columns):
StockCode       266802 non-null object
CustomerID      266802 non-null float64
AmountSpend     266802 non-null float64
Duplicate       266802 non-null bool
Total_amount    266802 non-null float64
dtypes: bool(1), float64(3), object(1)
memory usage: 9.4+ MB


In [41]:
# implementing SVD and SVD++ with scikit-surprise
from surprise import SVD,SVDpp,evaluate

In [82]:
#The name SurPRISE (roughly :) ) stands for Simple Python RecommendatIon System Engine.
from surprise import SVD
from surprise.dataset import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.dataset import DatasetAutoFolds

In [42]:
# DatasetAutoFolds.split : 	Split the dataset into folds for future cross-validation.
#DatasetAutoFolds(ratings_file=None, reader=None, df=None)
# A derived class from Dataset for which folds (for cross-validation) are not predefined.
# (Or for when there are no folds at all).
#data = DatasetAutoFolds(ratings_file=None, reader=None, df=df)

In [43]:
#classmethod load_from_df(df, reader)
#reader = Reader(rating_scale=(0, 1))
#datac = Dataset.load_from_df(df=df,reader = None)

In [44]:
#def load_from_df(cls, df, reader):
#        """Load a dataset from a pandas dataframe.
 #       Use this if you want to use a custom dataset that is stored in a pandas
#        dataframe. See the :ref:`User Guide<load_from_df_example>` for an
 #       example.
 #       Args:
 #           df(`Dataframe`): The dataframe containing the ratings. It must have
 #               three columns, corresponding to the user (raw) ids, the item
  #              (raw) ids, and the ratings, in this order.
  #          reader(:obj:`Reader <surprise.reader.Reader>`): A reader to read
  #              the file. Only the ``rating_scale`` field needs to be
  #              specified.

In [75]:
#for i in matrix_setup['normalized']
#matrix_setup['normalized'] = (matrix_setup['AmountSpend'] - 22.39)/309
#max(matrix_setup['normalized'])

In [79]:
upper_bound = max(mtarix_toGO['Total_amount'])
upper_bound

168469.6

In [80]:
lower_bound = min(mtarix_toGO['Total_amount'])
lower_bound

0.0

In [83]:
# now can we use matrix_setup datafrmae for Surprise Libraryand use SVD,SVD++, NNMF

#define the reader
reader = Reader(rating_scale = (lower_bound,upper_bound))
#rating_scale(:obj:`tuple`, optional): The rating scale used for every rating.  Default is ``(1, 5)``.

# Assigning data as matrix_setup
data = Dataset.load_from_df(df=mtarix_toGO[['CustomerID','StockCode','Total_amount']],reader=reader)
#data = Dataset.load_from_df(df=matrix_setup[['CustomerID','StockCode','AmountSpend']],reader=None)


In [76]:
# try surprise SVD matrix factorization algorithm 
#algo = SVD()
#evaluate(algo, data, measures=['RMSE','MAE'])

In [46]:
# try surprise SVD matrix factorization algorithm 
#algo = SVDpp()
#evaluate(algo, data, measures=['RMSE','MAE'])

In [77]:
# try surprise SVD matrix factorization algorithm 
#from surprise import NMF
#algo = NMF()
#evaluate(algo, data, measures=['RMSE','MAE'])

In [None]:
# Use the famous SVD algorithm.
algo = SVD(n_factors = 100,n_epochs = 20,biased= True,init_mean = 0,init_std_dev = 0.1, lr_all = 0.005, reg_all = 0.02, verbose = True )


#MAE measures the average magnitude of the errors in a set of predictions, 
#without considering their direction. It’s the average over the test sample of the absolute differences between
#prediction and actual observation where all individual differences have equal weight.
#------------------------------------------------------------------------------------#
#RMSE is a quadratic scoring rule that also measures the average magnitude of the error. It’s the square root
#of the average of squared differences between prediction and actual observation.

# Run 5-fold cross-validation and print results.
value_list = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv = 5, verbose=True)

In [59]:
value_list['test_rmse']

array([545.13054242, 545.13249244, 545.13042167, 545.12879022,
         1.939414  ])

In [None]:
# Use the famous SVDpp algorithm.
algo = SVDpp(n_factors = 100,n_epochs = 20,init_mean = 0,init_std_dev = 0.1, lr_all = 0.005, reg_all = 0.02, verbose = True )


#MAE measures the average magnitude of the errors in a set of predictions, 
#without considering their direction. It’s the average over the test sample of the absolute differences between
#prediction and actual observation where all individual differences have equal weight.
#------------------------------------------------------------------------------------#
#RMSE is a quadratic scoring rule that also measures the average magnitude of the error. It’s the square root
#of the average of squared differences between prediction and actual observation.

# Run 5-fold cross-validation and print results.
value_list = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv = 5, verbose=True)

In [None]:
# Use the famous SVDpp algorithm.
algo = NMF(n_factors = 100,n_epochs = 20,init_mean = 0,init_std_dev = 0.1, lr_all = 0.005, reg_all = 0.02, verbose = True )


#MAE measures the average magnitude of the errors in a set of predictions, 
#without considering their direction. It’s the average over the test sample of the absolute differences between
#prediction and actual observation where all individual differences have equal weight.
#------------------------------------------------------------------------------------#
#RMSE is a quadratic scoring rule that also measures the average magnitude of the error. It’s the square root
#of the average of squared differences between prediction and actual observation.

# Run 5-fold cross-validation and print results.
value_list = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv = 5, verbose=True)