In [1]:
import numpy as np
import pandas as pd
import random
import scipy

from random import shuffle
from scipy import stats

In [2]:
def generator():
    session = stats.geom.rvs(size = 4831, p = 0.3) # dataset has 4831 rows
    return session

user_session = generator()

In [3]:
count = {x: list(user_session).count(x) for x in user_session}
count

{8: 100,
 1: 1436,
 2: 1070,
 6: 241,
 3: 683,
 4: 491,
 5: 355,
 7: 157,
 11: 50,
 14: 15,
 18: 2,
 10: 65,
 9: 92,
 15: 7,
 13: 19,
 12: 21,
 21: 2,
 17: 4,
 19: 5,
 16: 11,
 20: 2,
 26: 1,
 22: 2}

# Generating User Id and Session Id

In [4]:
user_id = range(1, 4831)
user_id_session = list(zip(user_id, [10 * i for i in user_session]))

# number of articles served
articles = 0
for i in range(len(user_id_session)):
    articles -= -user_id_session[i][1]

print("Number of articles served:", articles)

Number of articles served: 160770


In [5]:
user_ids = []
for i in range(len(user_id_session)):
    for j in range(user_id_session[i][1]):
        user_ids.append(user_id_session[i][0])


session_list = list(user_session)
session_id = []
for i in session_list:
    for j in range(1, i+1):
        session_id.append([j for i in range(10)])

session_id = np.array(session_id).flatten()
user_session = list(zip(user_ids, session_id))

In [6]:
data = pd.DataFrame(user_session, columns = ['UserId', 'SessionId'])
print(data.shape)
data.head(11)

(160770, 2)


Unnamed: 0,UserId,SessionId
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [7]:
data.tail(11)

Unnamed: 0,UserId,SessionId
160759,4829,2
160760,4830,1
160761,4830,1
160762,4830,1
160763,4830,1
160764,4830,1
160765,4830,1
160766,4830,1
160767,4830,1
160768,4830,1


**UserId, SessionId generated sucessfully**

# ArticleId Served

In [8]:
article_id = []

for a in range(articles):
    article_id.append(np.random.randint(1, 4831))

len(article_id)

160770

**Difference between np.random.randint and random.randint**

* np.random.randint randomly initialises the whole array at once

* random.randint randomly initialises one element at a time

Reference - To serve articles randomly [shuffle in Python](https://pynative.com/python-random-shuffle/)

In [9]:
shuffle(article_id)
data['ArticleID Served'] = article_id
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served
0,1,1,996
1,1,1,3043
2,1,1,4777
3,1,1,2180
4,1,1,4772


# Article Rank

In [10]:
article_rank = []
i = 1

for ar in range(articles):
    article_rank.append(i)
    i += 1
    if i > 10:
        i = 1

len(article_rank)

160770

In [11]:
data['Article Rank'] = article_rank
data.head(12)

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank
0,1,1,996,1
1,1,1,3043,2
2,1,1,4777,3
3,1,1,2180,4
4,1,1,4772,5
5,1,1,525,6
6,1,1,1979,7
7,1,1,868,8
8,1,1,3565,9
9,1,1,2091,10


# Click

To generate [Random booleans](https://xspdf.com/help/51744035.html) 

In [12]:
click_session = []

for c in range(articles):
    click_session.append(random.choice([True, False]))
    
len(click_session)

160770

In [13]:
data['Click'] = click_session
data.head(12)

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click
0,1,1,996,1,True
1,1,1,3043,2,False
2,1,1,4777,3,False
3,1,1,2180,4,True
4,1,1,4772,5,True
5,1,1,525,6,False
6,1,1,1979,7,True
7,1,1,868,8,False
8,1,1,3565,9,True
9,1,1,2091,10,False


# Time Spent

In [14]:
time_session = []

for t in range(articles):
    time_session.append(np.random.randint(10, 100)) # Least interesting story for 10 sec and most for 100 sec
    
len(time_session)

160770

In [15]:
for i in range(len(click_session)):
    if click_session[i] is True:
        time_session[i] = time_session[i]
    else:
        time_session[i] = 0

data['Time Spent (seconds)'] = time_session
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
0,1,1,996,1,True,25
1,1,1,3043,2,False,0
2,1,1,4777,3,False,0
3,1,1,2180,4,True,51
4,1,1,4772,5,True,91


In [16]:
data.tail()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
160765,4830,1,3783,6,True,66
160766,4830,1,4531,7,True,83
160767,4830,1,2137,8,False,0
160768,4830,1,3865,9,False,0
160769,4830,1,3666,10,True,51


In [17]:
data.to_csv('user_profile.csv', index = False)

User profile completed

# Ratings

For collaborative filtering only.

In [18]:
ratings = []

for r in range(articles):
    ratings.append(np.random.randint(0, 6))
    
len(ratings)

160770

In [19]:
for i in range(len(click_session)):
    if click_session[i] is True:
        ratings[i] = ratings[i]
    else:
        ratings[i] = 0

data['Ratings (0 - 5)'] = ratings
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds),Ratings (0 - 5)
0,1,1,996,1,True,25,5
1,1,1,3043,2,False,0,0
2,1,1,4777,3,False,0,0
3,1,1,2180,4,True,51,0
4,1,1,4772,5,True,91,2


In [20]:
data.tail()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds),Ratings (0 - 5)
160765,4830,1,3783,6,True,66,3
160766,4830,1,4531,7,True,83,0
160767,4830,1,2137,8,False,0,0
160768,4830,1,3865,9,False,0,0
160769,4830,1,3666,10,True,51,5


In [21]:
data.to_csv('Ratings.csv', index = False)