In [None]:
%pip install pandas
%pip install numpy
%pip install plotly
%pip install networkx
%pip install matplotlib

First, let’s import the dataset and get familiar with it. We will use the Pandas DataFrame to store and manipulate our dataset:

In [None]:
# importing module
import pandas as pd

# dataset
data = pd.read_csv("stack_network_links.csv")

# printing the shape of the dataset
data.shape

In [None]:
#printing the heading
data.head()

Let’s print out the top 10 most frequent items from the dataset.

In [None]:
# importing module
import numpy as np

# Gather All Items of Each Transactions into Numpy Array
transaction = []
for i in range(0, data.shape[0]):
    for j in range(0, data.shape[1]):
        transaction.append(data.values[i,j])

# converting to numpy array
transaction = np.array(transaction)

#  Transform Them a Pandas DataFrame
df = pd.DataFrame(transaction, columns=["items"]) 

# Put 1 to Each Item For Making Countable Table, to be able to perform Group By
df["incident_count"] = 1 

#  Delete NaN Items from Dataset
indexNames = df[df['items'] == "nan" ].index
df.drop(indexNames , inplace=True)

# Making a New Appropriate Pandas DataFrame for Visualizations  
df_table = df.groupby("items").sum().sort_values("incident_count", ascending=False).reset_index()

#  Initial Visualizations
df_table.head(10).style.background_gradient(cmap='Greens')

A treemapping is a method for displaying hierarchical data using nested figures, usually rectangles. We can use a treemap to visualize all the items from our dataset more interactive.

In [5]:
# importing required module
import plotly.express as px

# to have a same origin
df_table["all"] = "all" 

# creating tree map using plotly
fig = px.treemap(df_table.head(30), path=['all', "items"], values='incident_count',
                  color=df_table["incident_count"].head(30), hover_data=['items'],
                  color_continuous_scale='Greens',
                )
# ploting the treemap
fig.show()

Before getting the most frequent itemsets, we need to transform our dataset into a True – False matrix where rows are transactions and columns are most used

In [6]:
# importing the required module
from mlxtend.preprocessing import TransactionEncoder

# initializing the transactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transaction).transform(transaction)
dataset = pd.DataFrame(te_ary, columns=te.columns_)

# dataset after encoded
dataset

Unnamed: 0,#,+,-,.,0,1,2,3,4,5,...,q,r,s,t,u,v,w,x,y,z
0,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,True
1,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2,False,False,False,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,True,True,True,False,False,True,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1466,False,False,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1467,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1468,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [9]:
!#support! and A-Priori Algorithm

# importing the required module
from mlxtend.frequent_patterns import apriori, association_rules


# Extracting the most frequest itemsets via Mlxtend.
# The length column has been added to increase ease of filtering.
frequent_itemsets = apriori(dataset, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))

# printing the frequent itemset
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.019048,(#),1
1,0.097959,(-),1
2,0.386395,(.),1
3,0.278912,(0),1
4,0.273469,(1),1
...,...,...,...
14152,0.010884,"(m, y, -, w, e, o, i, a, k, n, t, r)",12
14153,0.010884,"(m, y, -, w, o, i, a, f, k, n, t, r)",12
14154,0.010884,"(m, y, e, -, w, o, i, f, k, n, t, r)",12
14155,0.010884,"(m, e, y, w, o, i, a, f, k, n, t, r)",12


In [10]:
# printing the frequntly items 
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.05) ]

Unnamed: 0,support,itemsets,length
40,0.061224,"(-, a)",2
43,0.078912,"(e, -)",2
46,0.074830,"(i, -)",2
51,0.051701,"(n, -)",2
55,0.072109,"(r, -)",2
...,...,...,...
335,0.102041,"(t, r)",2
336,0.063946,"(u, r)",2
337,0.057143,"(r, v)",2
342,0.103401,"(s, t)",2


We know that the association rules are simply the if-else statements. The IF component of an association rule is known as the antecedent. The THEN component is known as the consequent. The antecedent and the consequent are disjoint; they have no items in common.

So, let’s create antecedents and consequents:

In [11]:
#CONFIDENCE || LIFT

#  We set our metric as "Lift" to define whether antecedents & consequents are dependent our not
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules["antecedents_length"] = rules["antecedents"].apply(lambda x: len(x))
rules["consequents_length"] = rules["consequents"].apply(lambda x: len(x))
rules.sort_values("lift",ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_length,consequents_length
1774170,"(m, -, y, o, i, f)","(e, a, k, t, r)",0.010884,0.010884,0.010884,1.000000,91.875000,0.010766,inf,6,5
1770289,"(-, y, w, i, a, k)","(m, e, f, n, t)",0.010884,0.010884,0.010884,1.000000,91.875000,0.010766,inf,6,5
1770276,"(e, -, y, w, i, n)","(m, a, f, k, t)",0.010884,0.010884,0.010884,1.000000,91.875000,0.010766,inf,6,5
1770278,"(e, -, y, w, a, f)","(m, i, k, n, t)",0.010884,0.010884,0.010884,1.000000,91.875000,0.010766,inf,6,5
1770279,"(e, -, y, w, a, k)","(m, i, f, n, t)",0.010884,0.010884,0.010884,1.000000,91.875000,0.010766,inf,6,5
...,...,...,...,...,...,...,...,...,...,...,...
19092,"(p, r)","(e, a)",0.088435,0.127891,0.013605,0.153846,1.202946,0.002295,1.030674,2,2
389,(j),(o),0.130612,0.164626,0.025850,0.197917,1.202221,0.004348,1.041505,1,1
388,(o),(j),0.164626,0.130612,0.025850,0.157025,1.202221,0.004348,1.031333,1,1
4253,(e),"(n, j)",0.295238,0.042177,0.014966,0.050691,1.201873,0.002514,1.008969,1,2


In [12]:
# Sort values based on confidence
rules.sort_values("confidence",ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedents_length,consequents_length
0,(#),(c),0.019048,0.168707,0.019048,1.000000,5.927419,0.015834,inf,1,1
1122167,"(-, y, w, i, t, r)","(o, e, a)",0.010884,0.031293,0.010884,1.000000,31.956522,0.010544,inf,6,3
1122174,"(e, -, y, o, t, r)","(a, i, w)",0.010884,0.038095,0.010884,1.000000,26.250000,0.010470,inf,6,3
1122173,"(e, -, y, o, a, r)","(t, i, w)",0.010884,0.036735,0.010884,1.000000,27.222222,0.010485,inf,6,3
1122172,"(e, -, y, o, a, t)","(i, r, w)",0.010884,0.032653,0.010884,1.000000,30.625000,0.010529,inf,6,3
...,...,...,...,...,...,...,...,...,...,...,...
233717,(s),"(e, a, t, p, r)",0.318367,0.010884,0.010884,0.034188,3.141026,0.007419,1.024129,1,5
6254,(s),"(q, p)",0.318367,0.010884,0.010884,0.034188,3.141026,0.007419,1.024129,1,2
233598,(s),"(e, w, o, a, t)",0.318367,0.021769,0.010884,0.034188,1.570513,0.003954,1.012859,1,5
233287,(s),"(e, o, a, t, r)",0.318367,0.021769,0.010884,0.034188,1.570513,0.003954,1.012859,1,5
