<a href="https://colab.research.google.com/github/skssushil/Web-mining/blob/master/1_6_Apriori_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
from nltk.util import ngrams
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import pandas as pd
import re
nltk.download('punkt');
nltk.download('stopwords');

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read the whole text file into a single string.
# (No list pre-initialisation: `corpus` is a str, produced by file.read().)
dataSetFilename = 'text1.txt'
with open(dataSetFilename,'r') as file:
  corpus = file.read()

In [5]:
# len() of the string returned by file.read(): a character count, not a word count.
N=len(corpus)

In [6]:
N

27715

## Preprocessing

In [7]:
def convert_lower_case(data):
  """Lower-case ``data`` using NumPy's element-wise string routine.

  The value is returned exactly as ``np.char.lower`` produces it, i.e. a NumPy
  array (0-dimensional when a single string is passed in).
  """
  lowered = np.char.lower(data)
  return lowered

In [8]:
def remove_stop_words(data):
  """Drop English stop words and one-character tokens from ``data``.

  ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
  The surviving tokens are returned as one string in which every token is
  preceded by a single space (so a non-empty result starts with a space, and
  an input with no surviving tokens yields "").
  """
  # A set gives constant-time membership tests; the corpus reader hands back a list.
  stop_words = set(stopwords.words('english'))
  kept = [w for w in word_tokenize(str(data))
          if w not in stop_words and len(w) > 1]
  # One join instead of repeated string concatenation; output format unchanged.
  return "".join(" " + w for w in kept)

In [9]:
def remove_punctuation(data):
  """Replace punctuation characters and newlines in ``data`` with spaces.

  For each symbol in turn: the symbol is replaced by a space, double spaces
  are collapsed to one (a single pass), and commas are deleted outright.
  The result is the NumPy array produced by ``np.char.replace``; callers that
  need a plain string wrap it in ``str()``.
  """
  # The backslash is spelled "\\" on purpose: a bare backslash in front of "]"
  # is an invalid escape sequence and triggers a warning on newer Pythons.
  # The resulting character set is the same.
  symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
  for symbol in symbols:
    data = np.char.replace(data, symbol, ' ')
    # These two stay inside the loop: running them after every symbol is what
    # gradually collapses the runs of spaces created by the replacements.
    data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
  return data

In [10]:
def remove_apostrophe(data):
  """Delete every apostrophe from ``data``.

  Returns the NumPy array produced by ``np.char.replace``.
  """
  apostrophe = "'"
  return np.char.replace(data, apostrophe, "")

In [11]:
def stemming(data):
  """Stem every token of ``data`` with a Porter stemmer.

  ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
  The stems are returned as one string in which each stem is preceded by a
  single space.
  """
  porter = PorterStemmer()
  stems = (porter.stem(token) for token in word_tokenize(str(data)))
  return "".join(" " + stem for stem in stems)

In [12]:
import inflect 
p = inflect.engine()   
# convert number into words 
def convert_number(text):
  """Spell out the all-digit words of ``text``.

  ``text`` is split on whitespace; every word for which ``str.isdigit()`` is
  true is replaced by ``p.number_to_words(word)`` (``p`` is the module-level
  inflect engine), all other words are kept as they are.

  Returns the words joined by single spaces. The result is always a string,
  including for empty input and for input that consists of numbers only.
  """
  converted = []
  for word in text.split():
    if word.isdigit():
      converted.append(p.number_to_words(word))
    else:
      converted.append(word)
  # Join once, after the loop, so that trailing numbers are part of the result.
  return ' '.join(converted)

In [13]:
def preprocess(data):
  """Run the text-cleaning helpers over ``data`` in a fixed order.

  Each helper receives the output of the previous one; the value returned by
  the last helper is the result. Several helpers appear more than once
  because later stages can re-introduce what an earlier stage removed.
  """
  steps = (
    convert_number,
    convert_lower_case,
    remove_punctuation,  # this helper also deletes commas
    remove_apostrophe,
    remove_stop_words,
    stemming,
    remove_punctuation,
    stemming,            # second stemming pass
    remove_punctuation,  # again: spelled-out numbers can carry hyphens/commas (forty-one)
    remove_stop_words,   # again: spelled-out numbers can contain stop words (one hundred and one)
    convert_number,
  )
  for step in steps:
    data = step(data)
  return data

In [14]:
# Clean the whole corpus with preprocess(), then split the cleaned text into tokens.
tokens=word_tokenize(preprocess(corpus))

In [15]:
# NOTE(review): len(tokens) counts every token, repeats included; it is not the
# number of distinct words, despite the "Vocabs" label.
print(" Vocabs Length ""\n",len(tokens))

 Vocabs Length 
 2381


## Apriori

In [16]:
def createCandidateSet(data):
	"""Build the size-1 candidate itemsets.

	Parameters:
		data: iterable of transactions, each an iterable of items. Items must be
			hashable and mutually orderable.

	Returns:
		A list with one single-item frozenset per distinct item found in
		``data``, ordered by ascending item.
	"""
	# A set removes duplicates in one pass instead of scanning a growing list
	# for every item.
	uniqueItems = {itm for row in data for itm in row}
	return [frozenset([itm]) for itm in sorted(uniqueItems)]

In [17]:
def scanData(data, candidateSet, minSupport):
	"""Count candidate itemsets over the transactions and pick the frequent ones.

	Parameters:
		data: transactions to scan; each must be usable as the argument of
			``frozenset.issubset`` (for example a set of items).
		candidateSet: collection of frozensets to look for. It is iterated once
			per transaction.
		minSupport: absolute threshold, i.e. a number of transactions.

	Returns:
		``(valid, subsetCount)``. ``subsetCount`` maps every candidate contained
		in at least one transaction to the number of transactions containing it,
		including candidates below ``minSupport``. ``valid`` lists the candidates
		whose count is at least ``minSupport``, in reverse order of first
		occurrence.
	"""
	subsetCount = {}
	for curSet in data:
		for cand in candidateSet:
			if cand.issubset(curSet):
				subsetCount[cand] = subsetCount.get(cand, 0) + 1
	# Collect in first-seen order and reverse once: same ordering as prepending
	# every hit, without the quadratic cost of repeated insert(0, ...).
	valid = [cand for cand, sup in subsetCount.items() if sup >= minSupport]
	valid.reverse()
	return valid, subsetCount

In [18]:
def genApriori(freqSets, k):
	"""Join (k-1)-item sets into candidate k-item sets.

	Two sets are joined (unioned) when their first ``k-2`` items, taken in
	sorted order, are equal. For ``k == 2`` the prefix is empty, so every pair
	is joined.

	Parameters:
		freqSets: indexable sequence of frozensets whose items are mutually
			orderable.
		k: size of the itemsets to generate.

	Returns:
		A list of frozensets, one per joined pair, in pair order (i before j).
	"""
	# Sort BEFORE slicing: a frozenset has no defined iteration order, so the
	# prefix must be taken from the sorted items to be meaningful and stable.
	prefixes = [sorted(itemset)[:k-2] for itemset in freqSets]
	valid = []
	nFreqSets = len(freqSets)
	for i in range(nFreqSets):
		for j in range(i+1, nFreqSets):
			if prefixes[i] == prefixes[j]:
				valid.append(freqSets[i] | freqSets[j]) # union
	return valid

In [19]:
def apriori(data, minSupport):
	"""Level-wise frequent-itemset search.

	Parameters:
		data: sequence of transactions, each an iterable of hashable items.
		minSupport: absolute threshold, i.e. the minimum number of transactions
			an itemset must appear in to count as frequent.

	Returns:
		``(lstCands, subsetCounts)``. ``lstCands[i]`` is the list of frequent
		itemsets found at level ``i`` (level 0 holds the single items); the last
		entry is always an empty list, which is what ends the search.
		``subsetCounts`` merges the count dictionaries returned by ``scanData``
		for every level.
	"""
	candSet = createCandidateSet(data)
	setData = list(map(set, data))
	frequent, subsetCounts = scanData(setData, candSet, minSupport)
	lstCands = [frequent]
	k = 2
	while lstCands[k-2]:
		candSetX = genApriori(lstCands[k-2], k)
		lstCandsX, subsetCountsX = scanData(setData, candSetX, minSupport)
		subsetCounts.update(subsetCountsX)
		# Carry forward only the itemsets that met minSupport; feeding the raw
		# candidates into the next level would skip the support pruning.
		lstCands.append(lstCandsX)
		k += 1
	return lstCands, subsetCounts

In [20]:
# read in data: one transaction per line, items separated by commas
data = []
dataSetFilename = 'text.txt'
with open(dataSetFilename,'r') as file:
	# strip() removes the surrounding whitespace (incl. the newline) before splitting
	data.extend(row.strip().split(',') for row in file)

In [21]:
# Ask for the minimum support (an absolute transaction count) and parse it as an int.
print("What min. support do you want to use? ")
minSupp = int(input())

What min. support do you want to use? 
2


In [22]:
print("\n**** Apriori with minSupport = {} ****".format(minSupp))
# call apriori: `sets` is a list of itemset lists (one per level), `counts`
# maps each counted itemset to the number of transactions containing it
sets, counts = apriori(data,minSupp)
print("\nSets:")
# NOTE(review): the heading below mentions "size upto 5", but nothing here limits
# the itemset size; the loop prints the 20 highest counts of any size.
print("\n Most Frequent items of size upto 5: \n")
# Highest count first; sorted() is stable, so ties keep their dictionary order.
for k,v in sorted(counts.items(), reverse=True, key=lambda tup: tup[1])[:20]:
	print(k, v)


**** Apriori with minSupport = 2 ****

Sets:

 Most Frequent items of size upto 5: 

frozenset({'BabyRuth'}) 49
frozenset({'Snickers'}) 47
frozenset({'Hershey'}) 43
frozenset({'AlmondJoy'}) 41
frozenset({'KitKat'}) 40
frozenset({'Snickers', 'Hershey'}) 40
frozenset({'BabyRuth', 'Snickers'}) 40
frozenset({'BabyRuth', 'Hershey'}) 39
frozenset({'BabyRuth', 'Snickers', 'Hershey'}) 36
frozenset({'HeathBar'}) 35
frozenset({'Twix'}) 35
frozenset({'MilkyWay'}) 34
frozenset({'Snickers', 'KitKat'}) 34
frozenset({'Cotton Candy'}) 33
frozenset({'BabyRuth', 'KitKat'}) 33
frozenset({'Cadbury'}) 32
frozenset({'Snickers', 'AlmondJoy'}) 32
frozenset({'BabyRuth', 'AlmondJoy'}) 31
frozenset({'Hershey', 'KitKat'}) 30
frozenset({'AlmondJoy', 'Hershey'}) 29
