# Create a Matrix Representation of Genres Watched in 2 Programme Streams in January 2019

Load in Necessary Packages

In [50]:
import pandas as pd
import numpy as np

Connect to SVV and Query the table

In [51]:
import psycopg2

def get_svv_connection():
    con=psycopg2.connect(dbname = 'svv', host ='svv-rs-prod-bi.cjddijbnvfpr.eu-west-1.redshift.amazonaws.com', \
                         port = 5439, user = 'ryanw', password = 'hT6Y3TeZZUFdvLj')
    
    return con

In [52]:
query_data = "SELECT * FROM sandbox.rw_jan_19_2_programme_streams"

df_streams = pd.DataFrame()

try: 
    con = get_svv_connection()

    cur = con.cursor()
    cur.execute(query_data)

    colnames = [desc[0] for desc in cur.description]
    df_streams = pd.DataFrame(cur.fetchall(), columns = colnames)
    
except Exception as inst:
    print(inst)
finally:
    cur.close()
    con.close()

In [53]:
df_streams.head()

Unnamed: 0,visit_id,genre_id,min_rank,new_rank
0,00005ea6-012e-437a-a85f-0a088aa0e49c-154880193...,DRAMA,3,2
1,00007725-ea35-40a2-b10e-2db1d1f9eea2-154766872...,DRAMA,2,2
2,0000c367-a825-4794-b7a8-4299d21687e7-154774964...,DRAMA,1,1
3,0000df74-460c-4be0-bdcf-36ca773aaed7-154721454...,FACTUAL,3,2
4,0000df74-460c-4be0-bdcf-36ca773aaed7-154776043...,ENT,2,2


Get rid of the unnecessary column

In [54]:
df_streams = df_streams.drop("min_rank", axis = 1)

In [55]:
df_streams.head()

Unnamed: 0,visit_id,genre_id,new_rank
0,00005ea6-012e-437a-a85f-0a088aa0e49c-154880193...,DRAMA,2
1,00007725-ea35-40a2-b10e-2db1d1f9eea2-154766872...,DRAMA,2
2,0000c367-a825-4794-b7a8-4299d21687e7-154774964...,DRAMA,1
3,0000df74-460c-4be0-bdcf-36ca773aaed7-154721454...,FACTUAL,2
4,0000df74-460c-4be0-bdcf-36ca773aaed7-154776043...,ENT,2


Filter out the rows where new_rank = 1 to get a table containing the first programme for each visit_id

In [56]:
df_genre_1 = df_streams[df_streams.new_rank == 1]
df_genre_1.head()

Unnamed: 0,visit_id,genre_id,new_rank
2,0000c367-a825-4794-b7a8-4299d21687e7-154774964...,DRAMA,1
6,00012372-f830-5b52-8bc1-18d0a90f90d6-154774472...,DRAMA,1
9,0001cbc4-18c4-4cbd-97ce-f54921766e79-154688650...,ENT,1
10,0001cbc4-18c4-4cbd-97ce-f54921766e79-154739426...,ENT,1
16,000386b3-455c-40b8-8f51-179b803f8eab-154833696...,DRAMA,1


In [57]:
df_genre_1['visit_id'].nunique()

4129837

Do the same for new_rank = 2 and check that they are the same size.

In [58]:
df_genre_2 = df_streams[df_streams.new_rank == 2]
df_genre_2['visit_id'].nunique()

4129837

Join the two tables together on visit_id

In [59]:
df_merged = df_genre_1.merge(df_genre_2, on = 'visit_id')
df_merged.head()

Unnamed: 0,visit_id,genre_id_x,new_rank_x,genre_id_y,new_rank_y
0,0000c367-a825-4794-b7a8-4299d21687e7-154774964...,DRAMA,1,DRAMA,2
1,00012372-f830-5b52-8bc1-18d0a90f90d6-154774472...,DRAMA,1,DRAMA,2
2,0001cbc4-18c4-4cbd-97ce-f54921766e79-154688650...,ENT,1,ENT,2
3,0001cbc4-18c4-4cbd-97ce-f54921766e79-154739426...,ENT,1,DRAMA,2
4,000386b3-455c-40b8-8f51-179b803f8eab-154833696...,DRAMA,1,DRAMA,2


Get rid of the unnecessary columns, group by genre_id_x and genre_id_y and then count the number of visit_ids

In [60]:
df_pivot = df_merged[['genre_id_x','genre_id_y', 'visit_id']].groupby(['genre_id_x','genre_id_y']).count()

In [61]:
df_pivot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,visit_id
genre_id_x,genre_id_y,Unnamed: 2_level_1
CHILDREN,CHILDREN,53673
CHILDREN,COMEDY,4657
CHILDREN,DRAMA,6725
CHILDREN,ENT,9131
CHILDREN,FACTUAL,1130


In [62]:
df_pivot = df_pivot.reset_index()

In [63]:
df_pivot.head()

Unnamed: 0,genre_id_x,genre_id_y,visit_id
0,CHILDREN,CHILDREN,53673
1,CHILDREN,COMEDY,4657
2,CHILDREN,DRAMA,6725
3,CHILDREN,ENT,9131
4,CHILDREN,FACTUAL,1130


Pivot the df_pivot dataframe to convert into the required Matrix form

In [64]:
df_pivot.pivot(index = 'genre_id_x', columns = 'genre_id_y')

Unnamed: 0_level_0,visit_id,visit_id,visit_id,visit_id,visit_id,visit_id,visit_id,visit_id,visit_id,visit_id
genre_id_y,CHILDREN,COMEDY,DRAMA,ENT,FACTUAL,FILM,LIFESTYLE,NEWS,RELIGION,SPORT
genre_id_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
CHILDREN,53673.0,4657.0,6725.0,9131.0,1130.0,3053.0,290.0,170.0,1.0,142.0
COMEDY,5461.0,179604.0,26077.0,37782.0,5622.0,1431.0,561.0,519.0,10.0,493.0
DRAMA,6709.0,63346.0,1985077.0,451188.0,77455.0,11278.0,5673.0,7979.0,103.0,1818.0
ENT,10566.0,63885.0,291921.0,527069.0,61269.0,6424.0,9490.0,7549.0,109.0,2302.0
FACTUAL,976.0,7499.0,39196.0,49838.0,30545.0,1309.0,2014.0,1193.0,69.0,743.0
FILM,2389.0,1138.0,6358.0,4630.0,1227.0,5098.0,108.0,110.0,13.0,85.0
LIFESTYLE,178.0,606.0,3568.0,7161.0,1885.0,94.0,1526.0,178.0,5.0,85.0
NEWS,227.0,883.0,6420.0,10860.0,2341.0,160.0,306.0,3712.0,9.0,107.0
RELIGION,,5.0,60.0,75.0,49.0,17.0,6.0,4.0,,2.0
SPORT,149.0,568.0,1362.0,1771.0,609.0,75.0,82.0,86.0,,2326.0


genre_id_x is a label denoting the genre of first programme in the session with genre_id_y denoting the genre of the second programme.

check how long a session lasts - 4 hours
run for 6 months (1 year if this runs fine) - crashed computer
programme level (top 10 for that time period)