In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import scipy

from random import shuffle
from scipy import stats

In [2]:
def generator():
    session = stats.geom.rvs(size = 4831, p = 0.3) # dataset has 4831 rows
    return session

user_session = generator()

In [3]:
count = {x: list(user_session).count(x) for x in user_session}
count

{13: 23,
 2: 1014,
 1: 1423,
 6: 241,
 4: 503,
 3: 721,
 9: 75,
 5: 341,
 11: 40,
 10: 71,
 8: 145,
 15: 7,
 22: 1,
 7: 156,
 12: 32,
 14: 16,
 17: 7,
 16: 7,
 20: 3,
 19: 2,
 18: 3}

# Generating User Id and Session Id

In [4]:
user_id = range(1, 4831)
user_id_session = list(zip(user_id, [10 * i for i in user_session]))

# number of articles served
articles = 0
for i in range(len(user_id_session)):
    articles -= -user_id_session[i][1]

print("Number of articles served:", articles)

Number of articles served: 162700


In [5]:
user_ids = []
for i in range(len(user_id_session)):
    for j in range(user_id_session[i][1]):
        user_ids.append(user_id_session[i][0])


session_list = list(user_session)
session_id = []
for i in session_list:
    for j in range(1, i+1):
        session_id.append([j for i in range(10)])

session_id = np.array(session_id).flatten()
user_session = list(zip(user_ids, session_id))

In [6]:
data = pd.DataFrame(user_session, columns = ['UserId', 'SessionId'])
print(data.shape)
data.head(11)

(162700, 2)


Unnamed: 0,UserId,SessionId
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [7]:
data.tail(11)

Unnamed: 0,UserId,SessionId
162689,4830,4
162690,4830,5
162691,4830,5
162692,4830,5
162693,4830,5
162694,4830,5
162695,4830,5
162696,4830,5
162697,4830,5
162698,4830,5


**UserId, SessionId generated sucessfully**

# ArticleId Served

In [8]:
article_id = []

for a in range(articles):
    article_id.append(np.random.randint(1, 4831))

len(article_id)

162700

**Difference between np.random.randint and random.randint**

* np.random.randint randomly initialises the whole array at once

* random.randint randomly initialises one element at a time

Reference - To serve articles randomly [shuffle in Python](https://pynative.com/python-random-shuffle/)

In [9]:
shuffle(article_id)
data['ArticleID Served'] = article_id
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served
0,1,1,4164
1,1,1,4404
2,1,1,3712
3,1,1,3644
4,1,1,4299


# Article Rank

In [10]:
article_rank = []
i = 1

for ar in range(articles):
    article_rank.append(i)
    i += 1
    if i > 10:
        i = 1

len(article_rank)

162700

In [11]:
data['Article Rank'] = article_rank
data.head(12)

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank
0,1,1,4164,1
1,1,1,4404,2
2,1,1,3712,3
3,1,1,3644,4
4,1,1,4299,5
5,1,1,3226,6
6,1,1,1765,7
7,1,1,2212,8
8,1,1,1254,9
9,1,1,4250,10


# Click

To generate [Random booleans](https://xspdf.com/help/51744035.html) 

In [12]:
click_session = []

for c in range(articles):
    click_session.append(random.choice([True, False]))
    
len(click_session)

162700

In [13]:
data['Click'] = click_session
data.head(12)

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click
0,1,1,4164,1,True
1,1,1,4404,2,False
2,1,1,3712,3,False
3,1,1,3644,4,False
4,1,1,4299,5,True
5,1,1,3226,6,True
6,1,1,1765,7,False
7,1,1,2212,8,False
8,1,1,1254,9,False
9,1,1,4250,10,False


# Time Spent

In [14]:
time_session = []

for t in range(articles):
    time_session.append(np.random.randint(10, 100)) # Least interesting story for 10 min and most for 100 min
    
len(time_session)

162700

In [15]:
for i in range(len(click_session)):
    if click_session[i] is True:
        time_session[i] = time_session[i]
    else:
        time_session[i] = 0

data['Time Spent (seconds)'] = time_session
data.head()

Unnamed: 0,UserId,SessionId,ArticleID Served,Article Rank,Click,Time Spent (seconds)
0,1,1,4164,1,True,29
1,1,1,4404,2,False,0
2,1,1,3712,3,False,0
3,1,1,3644,4,False,0
4,1,1,4299,5,True,46


In [16]:
data.to_csv('user_profile.csv', index = False)