In [4]:
import pandas as pd
import numpy as np
import math 
import datetime
import matplotlib.pyplot as plt

In [7]:
%matplotlib inline

# 1. DATA EXPLORATION

In [8]:
# The BBC would like to know audience statistics that reflect how likely their audiences are to continue consuming their content

In [9]:
# Problem Definition
# The BBC would like to know if there's a correlation between audience viewing behaviour (what and when they have watched on iPlayer)
# and the amount of content they have consumed in a 14-day period, i.e. what content they watch and where they watch it from

# 1.1 Data Sampling

To ensure that any inferences you make from your dataset can be generalised to the real world, it is important that the data you choose for your dataset is a good representation of the wider population it is sampled from and, in particular, free from systemic biases.

In [10]:
# Load the iPlayer viewing sample; parse_dates makes pandas read start_date_time
# as a datetime column rather than leaving it as plain strings
data = pd.read_csv(
    'iplayer_data_sample_janapr2017.csv',
    parse_dates=['start_date_time'],
)

In [11]:
data.shape

(490852, 8)

In [12]:
data.head()

Unnamed: 0,user_id,program_id,series_id,genre,programme_duration,streaming_id,start_date_time,time_viewed
0,GUNMFC,BUNBFKC,RFMHXZ,Factual,00:00:21,1486911129420_1,2017-02-12 15:21:24.544,20920.0
1,GUNMFC,SBTKDEW,EWBFGO,Comedy,00:01:51,1484864257965_1,2017-01-19 22:47:04.648,111285.0
2,GUNMFC,AXXRZDT,HFZOYT,Factual,00:00:30,1487099603980_1,2017-02-14 19:42:36.667,29945.0
3,GUNMFC,WEXFFOO,HZCOOC,Entertainment,00:01:22,1484773546557_1,2017-01-18 21:35:11.466,82620.0
4,GUNMFC,AUQGTSV,YMAXJD,Sport,00:01:37,1486911176609_1,2017-02-12 15:22:08.965,97444.0


In [14]:
# Show the identifying columns and viewing time of every row that belongs to series 'RFMHXZ'
data.loc[data['series_id'] == 'RFMHXZ', ['program_id', 'user_id', 'series_id', 'time_viewed']]

Unnamed: 0,program_id,user_id,series_id,time_viewed
0,BUNBFKC,GUNMFC,RFMHXZ,20920.0


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490852 entries, 0 to 490851
Data columns (total 8 columns):
user_id               490852 non-null object
program_id            457154 non-null object
series_id             457154 non-null object
genre                 423133 non-null object
programme_duration    457154 non-null object
streaming_id          490852 non-null object
start_date_time       490852 non-null datetime64[ns]
time_viewed           488952 non-null float64
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 30.0+ MB


In [17]:
# we shall apply the below functions to our dataset columns to enhance our dataset

In [None]:
# Get the weekday: weekday() is a datetime method that returns the day of the week as an int,
# with Monday being 0 and Sunday being 6

In [18]:
def get_weekday(formatted_date):
    """Return the day of the week of a date as a string label.

    The label has the form 'weekday_<n>', where <n> is the value returned by
    formatted_date.weekday() (0 for Monday through 6 for Sunday).
    """
    day_index = formatted_date.weekday()
    return 'weekday_{}'.format(day_index)

In [None]:
# Get the two-week number: isocalendar() is an instance method that, given a date instance,
# returns a tuple containing the ISO year, week number and weekday respectively

In [19]:
def get_twoweeknumber(formatted_date):
    """Return the two-week bucket that a date falls into.

    The bucket is the ISO week number (the second element of
    formatted_date.isocalendar()) divided by two and rounded down.

    NOTE(review): the ISO week number of the first days of January can be 52 or 53
    (they belong to the last ISO week of the previous year), so such dates land in
    bucket 26 rather than 0 - confirm this is acceptable for the analysis.
    """
    _, iso_week, _ = formatted_date.isocalendar()
    half_weeks = iso_week / 2.0
    return math.floor(half_weeks)

In [20]:
def get_timeofday(formatted_date):
    """Classify a datetime into a part of the day based on its hour attribute.

    Hours 5-12 are 'Morning', 13-17 'Afternoon', 18-22 'Evening'; every other
    hour (23 and 0-4) is 'Night'.
    """
    hour_of_day = formatted_date.hour

    # Each band pairs the hours it covers with its label; range() excludes the upper bound
    day_bands = (
        (range(5, 13), 'Morning'),
        (range(13, 18), 'Afternoon'),
        (range(18, 23), 'Evening'),
    )
    for band_hours, band_label in day_bands:
        if hour_of_day in band_hours:
            return band_label

    # Anything not matched above falls outside 05:00-22:59
    return 'Night'

In [27]:
def parse_programme_duration(unformated_time):
    """Convert a colon-separated 'HH:MM:SS' duration string into minutes.

    Parameters
    ----------
    unformated_time : str
        Duration text whose first three colon-separated fields are hours,
        minutes and seconds. Any further fields are ignored.

    Returns
    -------
    float or int or None
        hours * 60 + minutes + seconds / 60, or None when the value cannot be
        parsed (e.g. a missing value that is not a string, fewer than three
        fields, or non-numeric fields).
    """
    try:
        hours, minutes, seconds = unformated_time.split(':')[:3]
        return int(hours) * 60 + int(minutes) + int(seconds) / 60.0
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (NaN / None for a missing duration)
        # TypeError: value has a .split that does not accept a str separator (e.g. bytes)
        # ValueError: fewer than three fields, or a field that is not an integer
        # Only these are swallowed, so KeyboardInterrupt / SystemExit still propagate.
        return None
    

In [28]:
# Build every engineered column from its raw source column first, then attach them
# to the dataframe in a fixed order so the resulting column layout is unchanged
start_times = data['start_date_time']
derived_columns = [
    ('weekday', start_times.apply(get_weekday)),
    ('time_of_day', start_times.apply(get_timeofday)),
    ('programme_duration_mins', data['programme_duration'].apply(parse_programme_duration)),
    ('twoweek', start_times.apply(get_twoweeknumber)),
    # presumably time_viewed is in milliseconds, so dividing by 60000 gives minutes - verify against the data source
    ('min_watched', data['time_viewed'] / (60000.0)),
]
for column_name, column_values in derived_columns:
    data[column_name] = column_values

data.head(10)

Unnamed: 0,user_id,program_id,series_id,genre,programme_duration,streaming_id,start_date_time,time_viewed,weekday,time_of_day,programme_duration_mins,twoweek,min_watched
0,GUNMFC,BUNBFKC,RFMHXZ,Factual,00:00:21,1486911129420_1,2017-02-12 15:21:24.544,20920.0,weekday_6,Afternoon,0.35,3,0.348667
1,GUNMFC,SBTKDEW,EWBFGO,Comedy,00:01:51,1484864257965_1,2017-01-19 22:47:04.648,111285.0,weekday_3,Evening,1.85,1,1.85475
2,GUNMFC,AXXRZDT,HFZOYT,Factual,00:00:30,1487099603980_1,2017-02-14 19:42:36.667,29945.0,weekday_1,Evening,0.5,3,0.499083
3,GUNMFC,WEXFFOO,HZCOOC,Entertainment,00:01:22,1484773546557_1,2017-01-18 21:35:11.466,82620.0,weekday_2,Evening,1.366667,1,1.377
4,GUNMFC,AUQGTSV,YMAXJD,Sport,00:01:37,1486911176609_1,2017-02-12 15:22:08.965,97444.0,weekday_6,Afternoon,1.616667,3,1.624067
5,GUNMFC,UDRXGNE,ZLRAEZ,Comedy,00:08:30,1489351718862_1,2017-03-12 21:17:33.148,345700.0,weekday_6,Evening,8.5,5,5.761667
6,GUNMFC,NEMLWLO,ACUBNQ,Factual,00:17:23,1487099638088_2,2017-02-14 19:43:07.581,30038.0,weekday_1,Evening,17.383333,3,0.500633
7,VXAGMM,GPHKEFW,FGCABG,Drama,00:00:10,1488232316128_1,2017-02-27 22:21:59.069,2329.0,weekday_0,Evening,0.166667,4,0.038817
8,VXAGMM,TQEFFFO,JTALPD,Drama,00:00:29,1484436674473_2,2017-01-15 00:02:12.819,6807.0,weekday_6,Night,0.483333,1,0.11345
9,VXAGMM,PEJGSNX,ZBWYCA,Drama,00:00:30,1491166675449_1,2017-04-02 21:28:05.684,5703.0,weekday_6,Evening,0.5,6,0.09505


# 1.2 Exploratory Data Analysis

Exploratory data analysis refers to checking the quality of your data. Data sources can be noisy, and oftentimes data is missing for some observations or is simply wrong. Cleaning your data involves both inferring missing values and identifying outliers. Exploring it also involves generating descriptive statistics and visualising distributions.

# 1.2.2 Data quality check

In [29]:
# check how many missing and unique values there are per column

In [45]:
# For every column, print how many entries are missing and how many distinct values occur
features = data.columns.values
for feature in features:
    column = data[feature]
    missing_count = column.isnull().sum()
    # unique() returns the distinct values of the column; if NaN is among them,
    # len() counts it as one of the distinct values too
    unique_count = len(column.unique())
    print(feature, '- Missing value count', missing_count, '-Unique ', unique_count)
    

user_id - Missing value count 0 -Unique  9937
program_id - Missing value count 33698 -Unique  23857
series_id - Missing value count 33698 -Unique  5921
genre - Missing value count 67719 -Unique  12
programme_duration - Missing value count 33698 -Unique  891
streaming_id - Missing value count 0 -Unique  434430
start_date_time - Missing value count 0 -Unique  490626
time_viewed - Missing value count 1900 -Unique  259532
weekday - Missing value count 0 -Unique  7
time_of_day - Missing value count 0 -Unique  4
programme_duration_mins - Missing value count 33698 -Unique  891
twoweek - Missing value count 0 -Unique  10
min_watched - Missing value count 1900 -Unique  259532


In [44]:
len(data['genre'].unique())

12

In [50]:
data.head()

Unnamed: 0,user_id,program_id,series_id,genre,programme_duration,streaming_id,start_date_time,time_viewed,weekday,time_of_day,programme_duration_mins,twoweek,min_watched
0,GUNMFC,BUNBFKC,RFMHXZ,Factual,00:00:21,1486911129420_1,2017-02-12 15:21:24.544,20920.0,weekday_6,Afternoon,0.35,3,0.348667
1,GUNMFC,SBTKDEW,EWBFGO,Comedy,00:01:51,1484864257965_1,2017-01-19 22:47:04.648,111285.0,weekday_3,Evening,1.85,1,1.85475
2,GUNMFC,AXXRZDT,HFZOYT,Factual,00:00:30,1487099603980_1,2017-02-14 19:42:36.667,29945.0,weekday_1,Evening,0.5,3,0.499083
3,GUNMFC,WEXFFOO,HZCOOC,Entertainment,00:01:22,1484773546557_1,2017-01-18 21:35:11.466,82620.0,weekday_2,Evening,1.366667,1,1.377
4,GUNMFC,AUQGTSV,YMAXJD,Sport,00:01:37,1486911176609_1,2017-02-12 15:22:08.965,97444.0,weekday_6,Afternoon,1.616667,3,1.624067
