## Imports

In [1]:
import pandas as pd

## Load Data

In [2]:
# Read the shared-articles CSV (relative path) into a DataFrame and
# report its (rows, columns) shape.
articles_csv = '../data/shared_articles.csv'
articles_df = pd.read_csv(articles_csv)
articles_df.shape

(3122, 13)

In [3]:
# Preview the first five rows of the loaded frame.
articles_df.head(n=5)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [4]:
# Replace every missing value with the 'UNKOWN' placeholder string.
# The placeholder's spelling is used as a lookup key by the country and
# language dictionaries further down, so it is kept verbatim.
articles_df = articles_df.fillna(value='UNKOWN')

In [5]:
# Number of rows per content type.
content_type_counts = articles_df['contentType'].value_counts()
content_type_counts

HTML     3101
VIDEO      11
RICH       10
Name: contentType, dtype: int64

In [6]:
# Inspect one article in full: select the rows with the given contentId and
# show them as a nested {row_label: {column: value}} dictionary.
sample_content_id = -7423191370472335463
is_sample_article = articles_df['contentId'] == sample_content_id
articles_df[is_sample_article].T.to_dict()

{2722: {'timestamp': 1478623412,
  'eventType': 'CONTENT SHARED',
  'contentId': -7423191370472335463,
  'authorPersonId': -4465926797008424436,
  'authorSessionId': -4234938118093547320,
  'authorUserAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
  'authorRegion': 'SP',
  'authorCountry': 'BR',
  'contentType': 'HTML',
  'url': 'https://medium.com/android-dev-br/espresso-intents-n%C3%A3o-%C3%A9-m%C3%A1gica-%C3%A9-tecnologia-1fcfc8f21d3b',
  'title': 'Espresso Intents: não é magia, é tecnologia! - Android Dev BR',
  'text': 'Se você leu meu último artigo sobre Testes unitários vs aceitação , já ficou bem claro como é difícil controlar o ambiente de teste. Existem diversas API\'s do próprio Espresso como ActivityMonitor e IdlingResources que nos ajudam nessa empreitada. Utilizando esses dois recursos, já conseguimos criar excelentes testes de aceitação que nos salvam a vida. Mas, nem tudo é mar de rosas. Imag

In [7]:
# Re-check the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = articles_df.shape
(n_rows, n_cols)

(3122, 13)

In [8]:
# Number of items: count of distinct contentId values.
# dropna=False keeps the count identical to len(Series.unique()).
no_of_items = articles_df['contentId'].nunique(dropna=False)
no_of_items

3057

In [9]:
# Number of users: count of distinct authorPersonId values.
# dropna=False keeps the count identical to len(Series.unique()).
no_of_users = articles_df['authorPersonId'].nunique(dropna=False)
no_of_users

252

## Data Distribution across Time Periods

In [10]:
# Parse the `timestamp` column as seconds since the Unix epoch, store the
# result in a new `datetime` column, then summarise that column.
parsed_timestamps = pd.to_datetime(articles_df['timestamp'], unit='s')
articles_df['datetime'] = parsed_timestamps
articles_df['datetime'].describe()

  


count                    3122
unique                   3121
top       2016-04-27 15:56:07
freq                        2
first     2016-03-28 19:19:39
last      2017-02-28 18:51:11
Name: datetime, dtype: object

In [11]:
# Derive calendar buckets from `datetime`: the year as a number, plus
# quarterly ("Q") and monthly ("M") periods.
event_dt = articles_df['datetime'].dt
articles_df = articles_df.assign(
    year=event_dt.year,
    quarter=event_dt.to_period("Q"),
    month=event_dt.to_period("M"),
)

In [12]:
# Number of rows per calendar year.
events_per_year = articles_df['year'].value_counts()
events_per_year

2016    2921
2017     201
Name: year, dtype: int64

In [13]:
# Number of rows per quarter, listed in chronological order.
events_per_quarter = articles_df['quarter'].value_counts()
events_per_quarter.sort_index()

2016Q1     134
2016Q2    1475
2016Q3     879
2016Q4     433
2017Q1     201
Freq: Q-DEC, Name: quarter, dtype: int64

In [14]:
# Number of rows per month, listed in chronological order.
events_per_month = articles_df['month'].value_counts()
events_per_month.sort_index()

2016-03    134
2016-04    566
2016-05    467
2016-06    442
2016-07    335
2016-08    296
2016-09    248
2016-10    201
2016-11    113
2016-12    119
2017-01    120
2017-02     81
Freq: M, Name: month, dtype: int64

In [15]:
# Preview the first five rows, now including the derived time columns.
articles_df.head(n=5)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang,datetime,year,quarter,month
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,2016-03-28 19:19:39,2016,2016Q1,2016-03
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,2016-03-28 19:39:48,2016,2016Q1,2016-03
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,2016-03-28 19:42:26,2016,2016Q1,2016-03
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,UNKOWN,UNKOWN,UNKOWN,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,2016-03-28 19:47:54,2016,2016Q1,2016-03
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,UNKOWN,UNKOWN,UNKOWN,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en,2016-03-28 19:48:17,2016,2016Q1,2016-03


## Type of Interactions

In [16]:
# Number of rows per event type.
event_type_counts = articles_df['eventType'].value_counts()
event_type_counts

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64

## Demographics

In [17]:
# Number of rows per author country code ('UNKOWN' marks values that were missing).
author_country_counts = articles_df['authorCountry'].value_counts()
author_country_counts

UNKOWN    2442
BR         613
US          59
CA           5
AU           2
PT           1
Name: authorCountry, dtype: int64

In [18]:
# Lookup table from two-letter country code to a human-readable country name.
# Fixes over the previous version: the 'DE' entry was listed twice (the second
# silently overwrote the first) and 'SG' was misspelled as 'Singapure'.
country_code_dict = {
    'BR': 'Brazil',
    'US': 'United States',
    'KR': 'South Korea',
    'CA': 'Canada',
    'JP': 'Japan',
    'AU': 'Australia',
    'GB': 'United Kingdom',
    'DE': 'Germany',
    'IE': 'Ireland',
    'IS': 'Iceland',
    'SG': 'Singapore',
    'AR': 'Argentina',
    'PT': 'Portugal',
    'IN': 'India',
    'ES': 'Spain',
    'IT': 'Italy',
    'MY': 'Malaysia',
    'CO': 'Colombia',
    'CN': 'China',
    'CL': 'Chile',
    'NL': 'Netherlands',
    # Placeholder written by the earlier fillna call for missing values; the
    # spelling must match that placeholder exactly for the lookup to succeed.
    'UNKOWN': 'UNKOWN',
}

# Translate each author country code into its display name; codes that are
# absent from country_code_dict come out as NaN.
author_country_names = articles_df['authorCountry'].map(country_code_dict)
articles_df['authorCountryName'] = author_country_names

In [19]:
# Number of rows per author country name.
author_country_name_counts = articles_df['authorCountryName'].value_counts()
author_country_name_counts

UNKOWN           2442
Brazil            613
United States      59
Canada              5
Australia           2
Portugal            1
Name: authorCountryName, dtype: int64

In [20]:
# Number of rows per author region code ('UNKOWN' marks values that were missing).
author_region_counts = articles_df['authorRegion'].value_counts()
author_region_counts

UNKOWN    2442
SP         533
MG          75
NY          20
NJ          16
GA           7
IL           6
RJ           4
ON           4
NSW          2
TX           2
?            2
SC           2
RI           1
AB           1
FL           1
13           1
WA           1
CA           1
MS           1
Name: authorRegion, dtype: int64

In [21]:
# Lookup table from the two-letter codes found in the `lang` column to
# language names.
lang_dict = {
    'en': 'English',
    'pt': 'Portuguese',
    'la': 'Latin',
    'es': 'Spanish',
    'ja': 'Japanese',
    # Placeholder used for values that were missing before the fillna step.
    'UNKOWN': 'UNKOWN',
}
# Translate each language code into its language name; codes that are absent
# from lang_dict come out as NaN.
language_names = articles_df['lang'].map(lang_dict)
articles_df['language'] = language_names

In [22]:
# Number of rows per raw language code.
lang_code_counts = articles_df['lang'].value_counts()
lang_code_counts

en    2264
pt     850
la       4
es       2
ja       2
Name: lang, dtype: int64

In [23]:
# Number of rows per language name.
language_counts = articles_df['language'].value_counts()
language_counts

English       2264
Portuguese     850
Latin            4
Spanish          2
Japanese         2
Name: language, dtype: int64