In [5]:
# Input locations, all relative to the notebook's working directory
data_dir = '../data/'
google_dir = ''.join((data_dir, 'ga/'))

# CSV file that is loaded into `metrics_df` further down
metrics_f = ''.join((google_dir, 'page_path_title_metrics.csv'))

In [6]:
import pandas as pd
pd.options.display.max_colwidth = 100

from matplotlib import pyplot as plt
%matplotlib inline

In [7]:
# Short column labels, assigned positionally to the loaded frame
adj_cols = ['year', 'country', 'page', 'title', 'new_users', 'uniq_pg_views', 'pg_views']

# Read the export, skipping its first 6 lines
# (presumably a report preamble -- confirm against the CSV file)
metrics_df = pd.read_csv(metrics_f, skiprows=6)
metrics_df.columns = adj_cols

metrics_df.head()

Unnamed: 0,year,country,page,title,new_users,uniq_pg_views,pg_views
0,2017,BD,/poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,6950,8208,9898
1,2017,IN,/poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,3501,4921,6416
2,2017,NP,/poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,1845,2299,2799
3,2017,PK,/poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,1318,1716,2102
4,2017,IN,/fiction-and-nonfiction-2017,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | The University of Iow...,1256,2019,2923


### Page Cleaning

In [10]:
def clean_page(pg):
    """Reduce a raw page path to its first path segment.

    Steps:
      1. Paths starting with '/https://' are cut down to whatever follows
         the first 'courses/'; if there is no 'courses/', the scheme and
         host are dropped instead.
      2. Otherwise a single leading '/' is removed.
      3. Everything from the first '/', '#' or '?' onwards is discarded.
      4. 'flash-write-2016-credit-1' is manually mapped to 'flash-write-2016'.

    Parameters
    ----------
    pg : str
        Raw page path. Non-string values (e.g. None or NaN) are returned
        unchanged instead of raising.

    Returns
    -------
    str
        The cleaned page identifier (may be an empty string).
    """
    # missing values cannot be parsed; hand them back untouched
    if not isinstance(pg, str):
        return pg

    url_prefix = '/https://'
    if pg.startswith(url_prefix):
        # partition splits on the FIRST 'courses/' only, so a slug such as
        # 'great-courses' is not cut in half by a second match
        _, marker, remainder = pg.partition('courses/')
        if marker:
            pg = remainder
        else:
            # no 'courses/' in the URL: drop scheme + host, keep the path
            pg = pg[len(url_prefix):].partition('/')[2]
    # otherwise most start with '/'
    elif pg.startswith('/'):
        pg = pg[1:]

    # keep only the text before the first '/', '#' or '?'
    for delimiter in ('/', '#', '?'):
        pg = pg.split(delimiter)[0]

    # manual reclass
    if pg == 'flash-write-2016-credit-1':
        pg = 'flash-write-2016'

    return pg
        
# Overwrite the raw page paths with their cleaned form, element by element
metrics_df['page'] = metrics_df['page'].map(clean_page)

### Title

In [8]:
# One row per distinct title with the number of rows carrying it,
# most frequent first
title_df = (
    metrics_df['title']
    .value_counts()
    .rename_axis('title')
    .reset_index(name='cnt')
)

title_df.head()

Unnamed: 0,title,cnt
0,How Writers Write Fiction 2015 | The University of Iowa | NovoEd,1342
1,How Writers Write Fiction 2016: Storied Women | The University of Iowa | NovoEd,497
2,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | NovoEd,394
3,"Whitman’s Civil War: Writing and Imaging Loss, Death, and Disaster | The University of Iowa | No...",254
4,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,233


In [11]:
# Show the 100 most frequent titles
title_df.head(100)

Unnamed: 0,title,cnt
0,How Writers Write Fiction 2015 | The University of Iowa | NovoEd,1342
1,How Writers Write Fiction 2016: Storied Women | The University of Iowa | NovoEd,497
2,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | NovoEd,394
3,"Whitman’s Civil War: Writing and Imaging Loss, Death, and Disaster | The University of Iowa | No...",254
4,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,233
5,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | The University of Iow...,182
6,Power of the Pen: Identities and Social Issues in Poetry and Plays | NovoEd,152
7,"Whitman's Civil War: Writing and Imaging Loss, Death, and Disaster | The University of Iowa | No...",90
8,#Flashwrite Teen Poetry MOOC | The University of Iowa | NovoEd,81
9,How Writers Write Fiction 2015 - Sign In | NovoEd,63


In [12]:
# Number of distinct titles
title_df.shape[0]

1229

In [13]:
# Compare cleaned pages with their titles for the first 100 rows
metrics_df.loc[:, ['page', 'title']].head(100)

Unnamed: 0,page,title
0,poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...
1,poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...
2,poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...
3,poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...
4,fiction-and-nonfiction-2017,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | The University of Iow...
5,how-writers-write-fiction-2016,How Writers Write Fiction 2016: Storied Women | The University of Iowa | NovoEd
6,how-writers-write-fiction-2016,How Writers Write Fiction 2016: Storied Women | The University of Iowa | NovoEd
7,poetry-and-plays-2017,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...
8,fiction-and-nonfiction-2017,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | The University of Iow...
9,fiction-and-nonfiction-2017,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | The University of Iow...


In [14]:
# First 100 rows of the title counts
title_df.iloc[:100]

Unnamed: 0,title,cnt
0,How Writers Write Fiction 2015 | The University of Iowa | NovoEd,1342
1,How Writers Write Fiction 2016: Storied Women | The University of Iowa | NovoEd,497
2,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | NovoEd,394
3,"Whitman’s Civil War: Writing and Imaging Loss, Death, and Disaster | The University of Iowa | No...",254
4,Power of the Pen: Identities and Social Issues in Poetry and Plays | The University of Iowa | No...,233
5,Power of the Pen: Identities and Social Issues in Fiction and Nonfiction | The University of Iow...,182
6,Power of the Pen: Identities and Social Issues in Poetry and Plays | NovoEd,152
7,"Whitman's Civil War: Writing and Imaging Loss, Death, and Disaster | The University of Iowa | No...",90
8,#Flashwrite Teen Poetry MOOC | The University of Iowa | NovoEd,81
9,How Writers Write Fiction 2015 - Sign In | NovoEd,63


In [26]:
def clean_title(title):
    """Strip site, sign-in, user-home and assignment decorations from a title.

    Applied in this order:
      1. Trailing ' | NovoEd', then trailing '| The University of Iowa'.
      2. Trailing '- Sign In', then trailing '- Sign Up'.
      3. "<user>'s Home | <rest>" keeps the part after the marker (both the
         ASCII and the typographic apostrophe are recognised).
      4. If ' - ' is still present, only the text after the last ' - ' is kept.

    Parameters
    ----------
    title : str
        Raw page title. Non-string values (e.g. None or NaN) are returned
        unchanged instead of raising.

    Returns
    -------
    str
        The cleaned title.
    """
    # missing values cannot be parsed; hand them back untouched
    if not isinstance(title, str):
        return title

    # Remove trailing website titles and sign in / sign up markers.
    # Slicing removes only the trailing occurrence; splitting on the suffix
    # would cut the title at an earlier occurrence of the same text.
    trailing_suffixes = (
        ' | NovoEd',
        '| The University of Iowa',
        '- Sign In',
        '- Sign Up',
    )
    for suffix in trailing_suffixes:
        if title.endswith(suffix):
            title = title[:-len(suffix)].strip()

    # many classes have XXX's Home, remove that to get the class instead of the user
    # ('\u2019' is the typographic apostrophe, which also occurs in the titles)
    for home_marker in ("'s Home | ", "\u2019s Home | "):
        if home_marker in title:
            title = title.split(home_marker)[1]

    # remove the assignment preceding many classes' pages
    if ' - ' in title:
        title = title.split(' - ')[-1]

    return title

# Eyeball the cleaned form of every distinct title
[clean_title(raw_title) for raw_title in title_df['title']]

['How Writers Write Fiction 2015',
 'How Writers Write Fiction 2016: Storied Women',
 'Power of the Pen: Identities and Social Issues in Fiction and Nonfiction',
 'Whitman’s Civil War: Writing and Imaging Loss, Death, and Disaster',
 'Power of the Pen: Identities and Social Issues in Poetry and Plays',
 'Power of the Pen: Identities and Social Issues in Fiction and Nonfiction',
 'Power of the Pen: Identities and Social Issues in Poetry and Plays',
 "Whitman's Civil War: Writing and Imaging Loss, Death, and Disaster",
 '#Flashwrite Teen Poetry MOOC',
 'How Writers Write Fiction 2015',
 'Home page',
 'How Writers Write Fiction 2015',
 'How Writers Write Fiction 2015',
 'NovoEd',
 'How Writers Write Fiction 2015',
 '#Flashwrite Teen Poetry MOOC',
 'How Writers Write Fiction 2016: Storied Women',
 'How Writers Write Fiction 2015',
 '(not set)',
 'How Writers Write Fiction 2015',
 'Home Page',
 'sharebutton.to',
 'How Writers Write Fiction 2015',
 'How Writers Write Fiction 2016: Storied Wo

In [27]:
# Keep the cleaned title next to the raw one, then count rows per cleaned title
title_df['title_clean'] = title_df['title'].map(clean_title)
title_df['title_clean'].value_counts()

How Writers Write Fiction 2015                                                            836
How Writers Write Fiction 2016: Storied Women                                             273
Whitman's Civil War: Writing and Imaging Loss, Death, and Disaster                         34
Whitman’s Civil War: Writing and Imaging Loss, Death, and Disaster                         33
#Flashwrite Teen Poetry MOOC                                                               21
Power of the Pen: Identities and Social Issues in Fiction and Nonfiction                    3
Power of the Pen: Identities and Social Issues in Poetry and Plays                          3
#Flashwrite Teen Poetry Workshop 1                                                          2
#Flashwrite Teen Poetry                                                                     1
Secret.ɢoogle.com                                                                           1
home                                                        