# AO3 DataFan: How Does a Work's Rating Affect Viewer Stats?

Submitted by a reader.

The old adage goes that sex sells on the internet, but does it hold true for fanworks on AO3?

## Imports and Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as py_offline
import plotly.figure_factory as ff
import scipy.stats as stats
py_offline.init_notebook_mode()
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the raw works export. The file is decoded as Latin-1.
data = pd.read_csv(
    'aothree.csv',
    encoding='latin1',
)

In [None]:
# Keep one row per work ID (the last occurrence wins).
# An explicit copy is taken because a later cell adds CHAPTERS/POSTED columns
# to `df`; without it pandas may emit SettingWithCopyWarning on that assignment.
df = data.drop_duplicates(subset=['ID'], keep='last').copy()

In [None]:
# (abbreviation, month number) pairs, checked in calendar order.
# Values like "1-Jan" in CHAPTERS are presumably "posted/total" counts that a
# spreadsheet tool auto-formatted as dates -- TODO confirm against the raw export.
_MONTH_NUMBERS = (
    ('Jan', '1'), ('Feb', '2'), ('Mar', '3'), ('Apr', '4'),
    ('May', '5'), ('Jun', '6'), ('Jul', '7'), ('Aug', '8'),
    ('Sep', '9'), ('Oct', '10'), ('Nov', '11'), ('Dec', '12'),
)


def _split_entries(cell, workid):
    """Expand one ' ||'-delimited cell of 'key :: name' entries into rows.

    Parameters
    ----------
    cell : value of a FANDOMS / AUTHORS / FREEFORMS cell (stringified first).
    workid : the work's ID, used as prefix of the generated entry id.

    Returns
    -------
    list of [entry_id, entry_name, workid], where entry_id is
    '<workid>_<key>' with spaces removed from the key.  Empty pieces and
    missing cells (which stringify to 'nan') produce no rows.
    """
    rows = []
    for entry in str(cell).split(' ||'):
        if entry == '' or entry == 'nan':
            continue
        parts = entry.split(' :: ')
        if len(parts) < 2:
            # No ' :: ' separator means there is no name to record; skip the
            # piece instead of failing the whole run with an IndexError.
            continue
        entry_id = '{}_{}'.format(workid, parts[0].replace(' ', ''))
        rows.append([entry_id, parts[1], workid])
    return rows


def _normalise_chapters(chapters):
    """Replace a month abbreviation in a CHAPTERS string by its number.

    Only the first month (in Jan..Dec order) found in the string is handled:
    it is replaced by its number and every '-' becomes '/'.  Strings without
    a month abbreviation are returned unchanged.
    """
    for month, number in _MONTH_NUMBERS:
        if month in chapters:
            return chapters.replace(month, number).replace('-', '/')
    return chapters


fandomlist = []
authorlist = []
chapterlist = []
postedlist = []
taglist = []
for workid, fandom_cell, author_cell, chapters, tag_cell in zip(
        df['ID'], df['FANDOMS'], df['AUTHORS'], df['CHAPTERS'], df['FREEFORMS']):
    fandomlist.extend(_split_entries(fandom_cell, workid))
    authorlist.extend(_split_entries(author_cell, workid))
    taglist.extend(_split_entries(tag_cell, workid))

    if isinstance(chapters, str):
        c = _normalise_chapters(chapters)
        # The part before the first '/' is stored as the POSTED count.
        posted = float(c.split('/')[0])
    else:
        # Non-string CHAPTERS (assumed to be a missing value / NaN): keep the
        # value untouched and record the posted count as NaN rather than
        # raising TypeError on the substring test.
        c = chapters
        posted = float('nan')
    chapterlist.append(c)
    postedlist.append(posted)


df_fandom = pd.DataFrame(fandomlist, columns = ['FANDOM_ID', 'FANDOM_NAME', 'WORK_ID'])
df_author = pd.DataFrame(authorlist, columns = ['AUTHOR_ID', 'AUTHOR_NAME', 'WORK_ID'])
df_tags = pd.DataFrame(taglist, columns = ['FREEFORM_ID', 'FREEFORM_NAME', 'WORK_ID'])
df['CHAPTERS'] = chapterlist
df['POSTED'] = postedlist

## Work Stats and Viewer Stats Distribution Across Ratings

In [None]:
# Rating labels that are removed from every per-rating summary below.
# Series.drop raises KeyError if one of them is absent from the data.
excluded_ratings = ['General Audiences, Teen And Up Audiences', 'No rating']

# number of works per rating
ratings = df['RATING'].value_counts().drop(excluded_ratings)

by_rating = df.groupby(['RATING'])

# distribution of kudos across fic ratings
kudos = by_rating['KUDOS'].sum().drop(excluded_ratings)

# distribution of comments across ratings
comments = by_rating['COMMENTS'].sum().drop(excluded_ratings)

# distribution of bookmarks across ratings
bookmarks = by_rating['BOOKMARKS'].sum().drop(excluded_ratings)

# distribution of hits across ratings
hits = by_rating['HITS'].sum().drop(excluded_ratings)

# Build the summary table in one step (one row per statistic, one column per
# rating).  DataFrame.append, used previously, is deprecated in pandas 1.4 and
# removed in pandas 2.0; `keys` pins the row labels explicitly.
df_ratingvalues = pd.concat(
    [ratings, kudos, comments, bookmarks, hits],
    axis=1,
    keys=['RATING', 'KUDOS', 'COMMENTS', 'BOOKMARKS', 'HITS'],
    sort=False,
).T
df_ratingvalues

In [None]:
# Five donut charts in one figure: a large "Works" donut on the left and a
# 2x2 grid (bookmarks / hits on top, kudos / comments below) on the right.
worktrace = go.Pie(
    labels=ratings.index,
    values=ratings.values,
    domain={'x': [0, .4]},
    hole=0.4,
    name="Works",
)
kudostrace = go.Pie(
    labels=kudos.index,
    values=kudos.values,
    domain={'x': [.42, .75], 'y': [0, .48]},
    hole=0.4,
    name="Kudos",
)
commenttrace = go.Pie(
    labels=comments.index,
    values=comments.values,
    domain={'x': [.77, 1], 'y': [0, .48]},
    hole=0.4,
    name="Comments",
)
booktrace = go.Pie(
    labels=bookmarks.index,
    values=bookmarks.values,
    domain={'x': [.42, .75], 'y': [.52, 1]},
    hole=0.4,
    name="Bookmarks",
)
hitrace = go.Pie(
    labels=hits.index,
    values=hits.values,
    domain={'x': [.77, 1], 'y': [.52, 1]},
    hole=0.4,
    name="Hits",
)

# Text labels placed in the hole of each donut.
annotations = [
    {'font': {'size': 20}, 'x': .15, 'y': .5, 'text': "Works", 'showarrow': False},
    {'font': {'size': 14}, 'x': .58, 'y': .8, 'text': "Bookmarks", 'showarrow': False},
    {'font': {'size': 14}, 'x': .58, 'y': .2, 'text': "Kudos", 'showarrow': False},
    {'font': {'size': 14}, 'x': .9, 'y': .8, 'text': "Hits", 'showarrow': False},
    {'font': {'size': 14}, 'x': .935, 'y': .2, 'text': "Comments", 'showarrow': False},
]

layout = go.Layout(title="Fanfiction Stats by Ratings", annotations=annotations)

py_offline.iplot({
    'data': [worktrace, kudostrace, commenttrace, booktrace, hitrace],
    'layout': layout,
})

## Testing for Statistical Significance -- do these values make sense?

In [None]:
# Rating plus the four engagement metrics for every work.
st_all = df.loc[:, ["RATING", "COMMENTS", "KUDOS", "BOOKMARKS", "HITS"]]

# Engagement metrics only, one frame per rating label.
st_ga = df.loc[df['RATING'] == 'General Audiences', ['COMMENTS', 'KUDOS', 'BOOKMARKS', 'HITS']]
st_te = df.loc[df['RATING'] == 'Teen And Up Audiences', ['COMMENTS', 'KUDOS', 'BOOKMARKS', 'HITS']]
st_ma = df.loc[df['RATING'] == 'Mature', ['COMMENTS', 'KUDOS', 'BOOKMARKS', 'HITS']]
st_ex = df.loc[df['RATING'] == 'Explicit', ['COMMENTS', 'KUDOS', 'BOOKMARKS', 'HITS']]
st_nr = df.loc[df['RATING'] == 'Not Rated', ['COMMENTS', 'KUDOS', 'BOOKMARKS', 'HITS']]


In [None]:
# Per-rating frames, rating labels and plot colours, all in the same order.
st = [
    st_ga,
    st_te,
    st_ma,
    st_ex,
    st_nr,
]
names = [
    'General Audiences',
    'Teen And Up Audiences',
    'Mature',
    'Explicit',
    'Not Rated',
]
colors = [
    "orange",
    "blue",
    "red",
    "green",
    "purple",
]

In [None]:
# One two-column frame (rating + a single metric) per engagement metric.
st_com = st_all.loc[:, ["RATING", "COMMENTS"]]
st_kud = st_all.loc[:, ["RATING", "KUDOS"]]
st_boo = st_all.loc[:, ["RATING", "BOOKMARKS"]]
st_hit = st_all.loc[:, ["RATING", "HITS"]]

In [None]:
# One violin trace per rating, showing the distribution of comment counts.
# The traces are collected under their own name: this cell previously rebound
# `data`, which is the raw DataFrame read from the CSV at the top of the
# notebook, so re-running the de-duplication cell afterwards would fail.
traces = []
for color, rating in zip(colors, names):
    # Select the rating's rows once and reuse them for both axes.
    subset = st_com[st_com["RATING"] == rating]
    traces.append({
        "type": "violin",
        "x": subset["RATING"],
        "y": subset["COMMENTS"],
        "legendgroup": rating,
        "scalegroup": rating,
        "name": rating,
        "box": {"visible": True},
        "meanline": {"visible": True},
        "line": {"color": color},
    })
layout = {"yaxis": {"zeroline": False}, "title": "Distribution of Comments Grouped By Fic Ratings<br>(All Fandoms)"}
fig = {"data": traces, "layout": layout}
py_offline.iplot(fig)

## Do Big Fandoms (>2000 Works) Behave Differently From Small Fandoms?

In [None]:
#join fandoms back into the works dataframe
fandoms = pd.merge(df, df_fandom, how = 'right', left_on = 'ID', right_on = 'WORK_ID')
fandoms = fandoms[["FANDOM_NAME", "HITS", "KUDOS", "COMMENTS", "BOOKMARKS", "RATING"]]
fandoms

In [None]:
# Names of fandoms that occur in more than 2000 rows of `fandoms`.
bignamefandoms = (
    fandoms["FANDOM_NAME"]
    .value_counts()
    .loc[lambda counts: counts > 2000]
    .index
)

# Rows belonging to those fandoms.
bigfandom = fandoms.loc[fandoms["FANDOM_NAME"].isin(bignamefandoms)]
bigfandom

In [None]:
names = ['General Audiences', 'Teen And Up Audiences', 'Mature', 'Explicit', 'Not Rated']
colors = ["orange", "blue", "red", "green", "purple"]

# One violin trace per rating for works in big fandoms.
# The traces are collected under their own name: this cell previously rebound
# `data`, which is the raw DataFrame read from the CSV at the top of the
# notebook, so re-running the de-duplication cell afterwards would fail.
traces = []
for color, rating in zip(colors, names):
    # Select the rating's rows once and reuse them for both axes.
    subset = bigfandom[bigfandom["RATING"] == rating]
    traces.append({
        "type": "violin",
        "x": subset["RATING"],
        "y": subset["COMMENTS"],
        "legendgroup": rating,
        "scalegroup": rating,
        "name": rating,
        "box": {"visible": True},
        "meanline": {"visible": True},
        "line": {"color": color},
    })
layout = {"yaxis": {"zeroline": False}, "title": "Distribution of Comments Grouped By Fic Ratings<br>(Fandom Works > 2000)"}
fig = {"data": traces, "layout": layout}
py_offline.iplot(fig)

In [None]:
# Names of fandoms that occur in at most 2000 rows of `fandoms`.
# `<=` (not `<`) so the split is a true partition: big fandoms are selected
# with `> 2000`, and a strict `<` here would drop a fandom with exactly 2000
# works from both groups.
# NOTE(review): the plot title for this group still reads "< 2000".
smallnamefandoms = fandoms["FANDOM_NAME"].value_counts()
smallnamefandoms = smallnamefandoms[smallnamefandoms <= 2000].index

# Rows belonging to those fandoms.
smallfandom = fandoms[fandoms["FANDOM_NAME"].isin(smallnamefandoms)]
smallfandom

In [None]:
names = ['General Audiences', 'Teen And Up Audiences', 'Mature', 'Explicit', 'Not Rated']
colors = ["orange", "blue", "red", "green", "purple"]

# One violin trace per rating for works in small fandoms.
# The traces are collected under their own name: this cell previously rebound
# `data`, which is the raw DataFrame read from the CSV at the top of the
# notebook, so re-running the de-duplication cell afterwards would fail.
traces = []
for color, rating in zip(colors, names):
    # Select the rating's rows once and reuse them for both axes.
    subset = smallfandom[smallfandom["RATING"] == rating]
    traces.append({
        "type": "violin",
        "x": subset["RATING"],
        "y": subset["COMMENTS"],
        "legendgroup": rating,
        "scalegroup": rating,
        "name": rating,
        "box": {"visible": True},
        "meanline": {"visible": True},
        "line": {"color": color},
    })
layout = {"yaxis": {"zeroline": False}, "title": "Distribution of Comments Grouped By Fic Ratings<br>(Fandom Works < 2000)"}
fig = {"data": traces, "layout": layout}
py_offline.iplot(fig)

## Significance?

In [None]:
# Comment counts per rating, in the order of `names`, compared with a
# Kruskal-Wallis H-test; NaN values are omitted by the test.
f_test = [st_all.loc[st_all["RATING"] == rating, "COMMENTS"] for rating in names]
stats.kruskal(*f_test, nan_policy='omit')

In [None]:
# Are T and E rated fics significantly different in the amount of comments
# they get? What about NR and G?
#
# f_test holds the COMMENTS series in the order of `names`:
# 0 = General Audiences, 1 = Teen And Up Audiences, 3 = Explicit, 4 = Not Rated.
# NaNs are dropped so these tests see the same observations as the
# Kruskal-Wallis test (which runs with nan_policy='omit'), and the alternative
# is stated explicitly because the question is two-sided and SciPy's default
# for this argument has changed between releases.
teen_comments = f_test[1].dropna()
explicit_comments = f_test[3].dropna()
general_comments = f_test[0].dropna()
not_rated_comments = f_test[4].dropna()

te_st, te_p = stats.mannwhitneyu(teen_comments, explicit_comments, alternative='two-sided')
print(te_st, te_p)
gn_st, gn_p = stats.mannwhitneyu(general_comments, not_rated_comments, alternative='two-sided')
print(gn_st, gn_p)
# Original author's reading of the results: there is statistically no
# difference between comments and work rating.
# NOTE(review): confirm this against the p-values printed above.