In [1]:
import pandas as pd
import numpy as np

# Read the web-server log. na_filter=False switches off pandas' missing-value
# detection, so no field is converted to NaN while parsing.
df = pd.read_csv(
    'Stephen/Case Study 2/datasets/web_log_data.csv',
    na_filter=False,
)

# Summarise column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5866 entries, 0 to 5865
Data columns (total 6 columns):
ip           5866 non-null object
date_time    5866 non-null object
request      5866 non-null object
step         5866 non-null int64
session      5866 non-null int64
user_id      5866 non-null int64
dtypes: int64(3), object(3)
memory usage: 275.0+ KB


In [2]:
# How often each URL was requested (value_counts lists the most frequent first).
url_counts = df['request'].value_counts()
print(url_counts)

/                                                                821
/favicon.ico                                                     554
/robots.txt                                                      395
/eaglefarm/javascript/menu.js                                    370
/eaglefarm/pdf/Web_Price_List.pdf                                296
/eaglefarm/                                                      286
/services.html                                                   244
/eaglefarm/pricelist/                                            189
/eaglefarm/pricelist                                             187
/more.html                                                       145
/direct.html                                                     107
/eaglefarm/specials/                                             103
/eaglefarm/contact                                                95
/eaglefarm/contact/                                               93
/eaglefarm                        

In [4]:
# Parse the raw timestamp strings (day/abbreviated-month/year:hour:minute:second)
# into pandas datetime values and store them back on the frame.
timestamps = pd.to_datetime(df['date_time'], format="%d/%b/%Y:%H:%M:%S")
df['date_time'] = timestamps

# Derive calendar features from the parsed timestamps.
df['Day'] = timestamps.dt.day
df['Month'] = timestamps.dt.month
df['Hour'] = timestamps.dt.hour
df['Day_of_week'] = timestamps.dt.dayofweek  # Monday = 0 ... Sunday = 6

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5866 entries, 0 to 5865
Data columns (total 10 columns):
ip             5866 non-null object
date_time      5866 non-null datetime64[ns]
request        5866 non-null object
step           5866 non-null int64
session        5866 non-null int64
user_id        5866 non-null int64
Day            5866 non-null int64
Month          5866 non-null int64
Hour           5866 non-null int64
Day_of_week    5866 non-null int64
dtypes: datetime64[ns](1), int64(7), object(2)
memory usage: 458.4+ KB


In [6]:
# Inspect every log row for a single user.
# NOTE: the variable is called user1, but the filter selects user_id 3.
is_user_3 = df['user_id'] == 3
user1 = df[is_user_3]
print(user1)

                        ip           date_time          request  step  \
0     c210-49-32-6.rochd2. 2005-04-18 21:25:07                /     1   
3408  c210-49-32-6.rochd2. 2005-04-18 21:25:08     /favicon.ico     2   
3976  c210-49-32-6.rochd2. 2005-04-18 21:25:55  /guarantee.html     5   
4119  c210-49-32-6.rochd2. 2005-04-18 21:25:39       /more.html     4   
5409  c210-49-32-6.rochd2. 2005-04-18 21:25:16   /services.html     3   

      session  user_id  Day  Month  Hour  Day_of_week  
0           3        3   18      4    21            0  
3408        3        3   18      4    21            0  
3976        3        3   18      4    21            0  
4119        3        3   18      4    21            0  
5409        3        3   18      4    21            0  


In [7]:
# Gather all requested URLs of each user into one Python list per user_id,
# giving a Series indexed by user_id whose values are lists of requests.
requests_grouped = df.groupby(['user_id'])['request']
requestsByUser = requests_grouped.apply(list)

print(requestsByUser.head(5))

user_id
1                                        [/robots.txt]
2                        [/code/Global/code/menu.html]
3    [/, /favicon.ico, /guarantee.html, /more.html,...
4                                        [/robots.txt]
5                           [/code/Ultra/services.htm]
Name: request, dtype: object


In [10]:
from apyori import apriori

# Turn the pandas Series of per-user request lists into a plain list of
# transactions, then run apriori on it with a minimum support of 0.1.
requestsByUser_list = requestsByUser.tolist()
results = [record for record in apriori(requestsByUser_list, min_support=0.1)]


def convert_apriori_results_to_pandas_df(results):
    """Flatten apriori result records into a DataFrame with one row per rule.

    Parameters
    ----------
    results : iterable
        Records such as those returned by ``apyori.apriori``. Each record
        must expose ``support`` and ``ordered_statistics``; every entry of
        ``ordered_statistics`` must expose ``items_base``, ``items_add``,
        ``confidence`` and ``lift``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Left_side``, ``Right_side``, ``Support``, ``Confidence``
        and ``Lift``. ``Left_side``/``Right_side`` hold the rule's items as
        a comma-separated string in sorted order. An empty ``results``
        yields an empty frame that still has these five columns.
    """
    rules = [
        [
            # items_base = left side of the rule, items_add = right side.
            # The items are sorted before joining: apyori hands them over as
            # set-like collections, whose iteration order is arbitrary, so an
            # unsorted join could label the same rule differently between runs.
            ','.join(sorted(rule.items_base)),
            ','.join(sorted(rule.items_add)),
            # Support belongs to the whole item set; confidence and lift
            # belong to the individual rule.
            rule_set.support,
            rule.confidence,
            rule.lift,
        ]
        for rule_set in results
        for rule in rule_set.ordered_statistics
    ]

    return pd.DataFrame(
        rules,
        columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift'],
    )


# Tabulate the mined rules and preview the first 20 rows.
result_df = convert_apriori_results_to_pandas_df(results)
print(result_df.iloc[:20])

                       Left_side                     Right_side   Support  \
0                                                             /  0.373904   
1                                                   /eaglefarm/  0.130480   
2                                 /eaglefarm/javascript/menu.js  0.143373   
3                                                  /favicon.ico  0.109335   
4                                                   /robots.txt  0.202166   
5                                                /services.html  0.120165   
6                              /                 /services.html  0.100567   
7                 /services.html                              /  0.100567   
8                    /eaglefarm/  /eaglefarm/javascript/menu.js  0.112429   
9  /eaglefarm/javascript/menu.js                    /eaglefarm/  0.112429   

   Confidence      Lift  
0    0.373904  1.000000  
1    0.130480  1.000000  
2    0.143373  1.000000  
3    0.109335  1.000000  
4    0.202166  1.00000

In [11]:
# Sort all acquired rules by confidence, highest first.
# (The sort key is the 'Confidence' column, not 'Lift'.)
result_df = result_df.sort_values(by='Confidence', ascending=False)
print(result_df.head(10))

                       Left_side                     Right_side   Support  \
8                    /eaglefarm/  /eaglefarm/javascript/menu.js  0.112429   
7                 /services.html                              /  0.100567   
9  /eaglefarm/javascript/menu.js                    /eaglefarm/  0.112429   
0                                                             /  0.373904   
6                              /                 /services.html  0.100567   
4                                                   /robots.txt  0.202166   
2                                 /eaglefarm/javascript/menu.js  0.143373   
1                                                   /eaglefarm/  0.130480   
5                                                /services.html  0.120165   
3                                                  /favicon.ico  0.109335   

   Confidence      Lift  
8    0.861660  6.009924  
7    0.836910  2.238301  
9    0.784173  6.009924  
0    0.373904  1.000000  
6    0.268966  2.23830