# Culture Measures Based on Company Reviews

In [7]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk, re, math, csv
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
from string import punctuation
from functools import partial
import concurrent.futures as cf
from collections import defaultdict

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

[nltk_data] Downloading package wordnet to /home/ramon/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /home/ramon/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/ramon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Load your dataset.

In [3]:
# Parse the review timestamp while reading. `parse_dates=True` only asks pandas
# to parse the index, and no index column is set here, so it converted nothing.
df = pd.read_csv('netflix.csv', parse_dates=['reviewDateTime'])
df.head()

Unnamed: 0,reviewID,employerID,userID,gender,birthYear,highestEducation,metroID,metroName,stateID,stateName,countryID,jobTitleID,JobTitle,GOC,GOCconfidence,MGOC,MGOCconfidence,reviewDateTime,isCurrentJobFlag,jobEndingYear,OverallRating,CareerOpps,CompensationBenefits,SeniorLeadership,Worklife,CultureValues,RecommendFriend,BusinessOutlook,CEO,employerName,stockTicker,employerTypeCode,numberEmployees,annualRevenue,industry,sector,pros,cons,feedback
0,4151950,11891,24353329,FEMALE,1984.0,BACHELORS,0,,0,,1,0,,,,,,2014-04-30 23:52:26.027,1,,4.0,3.0,5.0,3.0,2.0,3.0,YES,Same,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,You will be working with the most talented ppl...,Little bit politics in some teams.,
1,1863,11891,-1,,,,761,San Jose,2280,CA,1,35739,"Director, Product Management",product manager,0.913,product manager,0.913,2008-04-23 23:42:17.157,1,,5.0,4.0,4.5,5.0,4.5,,YES,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,Freedom and responsibility. You're treated lik...,"Netflix is not for everyone. You don't get ""di...",I have none. Senior management is fantastic. s...
2,4991,11891,2076,,,,761,San Jose,2280,CA,1,13321,Marketing Manager,marketing manager,1.0,marketing manager,1.0,2008-06-11 00:03:28.907,1,,5.0,5.0,5.0,5.0,4.5,,YES,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,Great colleagues -- incredible really,Domestic not global business -- wish we did eu...,"Focus on the customer, not on Apple"
3,53799,11891,68043,,,,700,Portland,3163,OR,1,64668,Support Staff,support staff,1.0,retail representative,1.0,2008-08-07 23:30:14.267,0,2008.0,2.0,1.0,4.5,4.0,5.0,,NO,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,The upper management of Netflix really does se...,"Specific to the Hillsboro location, the middle...","To the senior-most management in Los Gatos, I ..."
4,53937,11891,68207,,,,0,,0,,1,36451,Does IT Matter?,,0.0,,0.0,2008-08-08 09:12:42.493,0,2008.0,2.0,2.0,2.5,3.5,1.0,,NO,,Approve,Netflix,NFLX,COMPANY_PUBLIC,4700,8830669000,Internet,Information Technology,"The people there are fantastic, the service is...",It's frustrating to work for direct management...,"Stop being so secretive, just be upfront and h..."


In [5]:
# Make sure the review timestamp is a datetime column, then derive the calendar year from it.
df['reviewDateTime'] = pd.to_datetime(df['reviewDateTime'])
df['year'] = df['reviewDateTime'].dt.year
df['year'].head()

0    2014
1    2008
2    2008
3    2008
4    2008
Name: year, dtype: int64

The following function will remove the company names from their respective reviews.

In [8]:
# English stop-word list from NLTK; `normalize_doc` drops every token found in it.
stopwords = nltk.corpus.stopwords.words('english')

def comp_name_out(data, col_to_search, col_reviews, companies):
    """
    Strip each company's own name out of the reviews written about it.

    For every name in `companies`, the rows of `data` whose `col_to_search`
    equals that name get their `col_reviews` text lower-cased and every
    literal (non-regex) occurrence of the lower-cased name removed. Rows of
    companies that are not listed are left untouched (not even lower-cased).

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place; the same object is also returned.
    col_to_search : str
        Name of the column holding the company of each review.
    col_reviews : str
        Name of the column holding the review text to clean.
    companies : iterable
        Company names to process (list, set, Series, tuple, array, ...).
        Entries that are not strings, such as the None/NaN that
        `Series.unique()` yields for a missing employer, are skipped.

    Returns
    -------
    pandas.DataFrame
        `data` itself, after the in-place update.
    """
    for company in companies:
        # A missing employer name has no `.lower()` and nothing to remove.
        if not isinstance(company, str):
            continue
        rows = data[col_to_search] == company
        cleaned = (
            data.loc[rows, col_reviews]
            .str.lower()
            .str.replace(company.lower(), '', regex=False)
        )
        data.loc[rows, col_reviews] = cleaned

    return data

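Function to get the root of each word. Choose the lemmatizer (`'lemma'`, the default), the Porter stemmer (`'stemm'`), or the Snowball stemmer (`'snowball'`) through `root_word_method`, or use the `stemming` and `snowball` partial functions defined below.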

In [9]:
def root_of_word(docs, root_word_method='lemma'):
    """
    Reduce every token of a document to its root form.

    Parameters
    ----------
    docs : str
        A single document; it is tokenized with `nltk.word_tokenize`.
    root_word_method : {'lemma', 'stemm', 'snowball'}, default 'lemma'
        'lemma' uses the WordNet lemmatizer, 'stemm' the Porter stemmer and
        'snowball' the English Snowball stemmer.

    Returns
    -------
    str
        The transformed tokens joined by single spaces.

    Raises
    ------
    ValueError
        If `root_word_method` is not one of the supported names.
    """
    # Build only the transformer that was asked for; this function runs once
    # per document, so constructing all three every time is wasted work.
    if root_word_method == 'lemma':
        to_root = nltk.stem.WordNetLemmatizer().lemmatize
    elif root_word_method == 'stemm':
        to_root = nltk.stem.PorterStemmer().stem
    elif root_word_method == 'snowball':
        to_root = nltk.stem.SnowballStemmer('english').stem
    else:
        # Previously an unknown name fell through to an unbound local variable.
        raise ValueError(
            "root_word_method must be 'lemma', 'stemm' or 'snowball', got %r"
            % (root_word_method,)
        )

    return ' '.join(to_root(w) for w in nltk.word_tokenize(docs))

# Single-argument versions, convenient for `map`-style parallel calls.
stemming = partial(root_of_word, root_word_method='stemm')
snowball = partial(root_of_word, root_word_method='snowball')

The following function helps with the preprocessing of the data. In the pipeline below it runs before the lemmatizer, stemmer, snowball, etc. If you want to keep the stopwords and take them out at a later stage, uncomment the first `filtered_tokens` line below and comment out the second one.

In [10]:
def normalize_doc(doc):
    """
    Normalize a single document.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; the text is lower-cased and stripped, tokenized with
    `nltk.word_tokenize`, and tokens found in the module-level `stopwords`
    collection are dropped.

    Parameters
    ----------
    doc : str
        The raw document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # `flags` must be passed by keyword: the fourth positional argument of
    # `re.sub` is `count`, so `re.I|re.A` used to be read as "replace at most
    # 258 matches" and the flags themselves were never applied.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = nltk.word_tokenize(doc)
#     filtered_tokens = [token for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(filtered_tokens)

Functions to get the top topics and to run the LDA models.

In [11]:
def show_topics(vectorizer, lda_model, n_words=15):
    """
    List the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose `get_feature_names_out()` (current scikit-learn) or the
        legacy `get_feature_names()`.
    lda_model : fitted topic model
        Its `components_` attribute is read as one row of term weights per topic.
    n_words : int, default 15
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic with its `n_words` terms, heaviest first.
    """
    # `get_feature_names` was removed from scikit-learn; prefer its replacement
    # and fall back only for installations that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.asarray(vectorizer.get_feature_names())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights turns the ascending argsort into "largest first".
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Coarse grid of topic counts tried for every company in the first sweep.
our_range = (2, 10, 50, 100, 150, 200, 250, 300)

def get_models(topics, tf, tup_num):
    """
    Fit one online LDA model with `topics` components and score it.

    `tf` is a tuple whose first element is the company name and whose
    element at position `tup_num` is that company's sparse document-term
    matrix. The return value is the tuple
    (company name, topics, comph score, fitted model), where the score is
    `comph` applied to the topic-word matrix after every row has been
    divided by its own sum.
    """
    model = LatentDirichletAllocation(
        n_components=topics,
        max_iter=100,
        learning_method='online',
        learning_offset=10.,
        random_state=1234,
    )
    fitted = model.fit(tf[tup_num])
    word_weights = fitted.components_
    row_totals = word_weights.sum(axis=1)[:, np.newaxis]
    return (tf[0], topics, comph(word_weights / row_totals), fitted)

In [12]:
def jsd(p, q, base=np.e): # Jensen-Shannon divergence between two vectors, used by comph
    '''
        Pairwise Jensen-Shannon divergence, following
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence

        `p` and `q` are array-likes of equal length; `base` is the logarithm
        base handed to `scipy.stats.entropy`. The result is the average of
        the two relative entropies measured against the midpoint (p + q) / 2.
    '''
    first = np.asarray(p)
    second = np.asarray(q)
    mixture = 1 / 2 * (first + second)
    divergence_first = sp.stats.entropy(first, mixture, base)
    divergence_second = sp.stats.entropy(second, mixture, base)
    return divergence_first / 2 + divergence_second / 2

def conth(data): # content heterogeneity of a document-topic probability matrix
    """
    Number of rows divided by the sum of all squared entries, i.e. the
    inverse of the average per-row sum of squares.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        One row per document. Plain arrays and nested lists are accepted as
        well as DataFrames.

    Returns
    -------
    float
        The heterogeneity score, or NaN when `data` has no rows.
    """
    values = np.asarray(data, dtype=float)
    n_rows = values.shape[0]
    if n_rows == 0:
        # Nothing to average over; avoid the 0 / 0 division.
        return float('nan')
    return 1 / (np.square(values).sum() / n_rows)

In [13]:
def comph(data):
    """
    Mean pairwise `jsd` between the rows of `data`.

    Parameters
    ----------
    data : 2-D array-like
        Indexed as `data[i]` to obtain row `i`; each row is passed to `jsd`.

    Returns
    -------
    float
        Average of `jsd(data[x], data[y])` over every pair with x < y.
        Pairs whose divergence is NaN are ignored. NaN is returned when no
        valid pair exists (for example fewer than two rows).
    """
    row_count = len(data)
    distance_list = []
    # Each unordered pair is visited once, in the same order the previous
    # lower-triangle extraction produced, without building the full n x n table.
    for x in range(row_count):
        for y in range(x + 1, row_count):
            distance = jsd(data[x], data[y])
            if not math.isnan(distance):
                distance_list.append(distance)

    if not distance_list:
        # Used to end in a ZeroDivisionError.
        return float('nan')
    return sum(distance_list) / float(len(distance_list))

Create an array with the unique employers in the dataset.

In [18]:
# Distinct employer names and distinct review years found in the dataset.
companies = df['employerName'].unique()
years = df['year'].unique()
years

array([2014, 2008, 2009, 2010, 2011, 2012, 2013, 2015, 2016, 2017, 2018])

Remove the company names from the reviews, and extract the reviews into a numpy array.

In [19]:
%%time

# `comp_name_out` edits `df` in place and returns it; afterwards the cleaned
# 'pros' text is pulled out as a NumPy array for the preprocessing step.
df = comp_name_out(df, 'employerName', 'pros', companies)
data_pros = df['pros'].values

CPU times: user 21.4 ms, sys: 642 µs, total: 22.1 ms
Wall time: 22.1 ms


The text preprocessing of the corpus takes place in parallel. You first normalize the reviews and then take the root of the words.

In [20]:
%%time

with cf.ProcessPoolExecutor() as e:
    data_pros_cleaned = e.map(normalize_doc, data_pros)
    data_pros_cleaned = list(e.map(stemming, data_pros_cleaned))

df['pros_clean'] = data_pros_cleaned

CPU times: user 1.15 s, sys: 248 ms, total: 1.4 s
Wall time: 2.48 s


Here you create an array with all of the companies and the amount of reviews they have. So far, only companies with at least 2 reviews make it to the modeling stage.

In [40]:
# Build an employer-year key such as 'Netflix_2014'; the later cells group the reviews by it.
df['emp_year'] = df['employerName'] + '_' + df['year'].astype(str)
df['emp_year'].head()

0    Netflix_2014
1    Netflix_2008
2    Netflix_2008
3    Netflix_2008
4    Netflix_2008
Name: emp_year, dtype: object

In [42]:
# Keep only the employer-years with at least MIN_REVIEWS reviews. The counts
# were previously computed but never filtered, so single-review groups could
# reach the modeling stage even though the notebook says they are excluded.
MIN_REVIEWS = 2
review_counts = df['emp_year'].value_counts()
comps_of_interest = review_counts[review_counts >= MIN_REVIEWS].index
len(comps_of_interest), comps_of_interest

(11,
 Index(['Netflix_2015', 'Netflix_2017', 'Netflix_2016', 'Netflix_2014',
        'Netflix_2009', 'Netflix_2013', 'Netflix_2008', 'Netflix_2012',
        'Netflix_2010', 'Netflix_2011', 'Netflix_2018'],
       dtype='object'))

Select only the employers that meet the condition above by creating a boolean with True for yes and False for no.

In [43]:
# Restrict the data to the employer-year keys listed in `comps_of_interest`.
cond2 = df['emp_year'].isin(comps_of_interest) # boolean mask: True where the row's key is of interest
df_interest = df[cond2].copy() # copy so later edits do not touch `df`
unique_ids = df_interest['emp_year'].unique() # distinct employer-year keys that remain
unique_ids

array(['Netflix_2014', 'Netflix_2008', 'Netflix_2009', 'Netflix_2010',
       'Netflix_2011', 'Netflix_2012', 'Netflix_2013', 'Netflix_2015',
       'Netflix_2016', 'Netflix_2017', 'Netflix_2018'], dtype=object)

The following loop will create sparse matrices for all companies and return a list of tuples with the name of the company, its sparse matrix, and the fitted vectorizer.

In [44]:
%%time

vectorizers_list = []
for comp_id in unique_ids:
    cond = (df_interest['emp_year'] == comp_id) # condition to get a dataset for each company
    revs_clean = df_interest.loc[cond, 'pros_clean'].values # get an array of reviews for such company
    count_vect = CountVectorizer() # instantiate a vectorizer
    vect = count_vect.fit_transform(revs_clean) # fit it to the selected company reviews
    vectorizers_list.append((comp_id, vect, count_vect))

CPU times: user 222 ms, sys: 0 ns, total: 222 ms
Wall time: 231 ms


Calculate the total words in the dictionary of review words, and get the percentage of words in the final dictionary that can be found in the full corpus.

In [45]:
# Number of (employer-year, sparse matrix, vectorizer) tuples in the list.
len(vectorizers_list)

11

In [None]:
# TotalWords_vectorizer = CountVectorizer()
# TotalWords_tf = TotalWords_vectorizer.fit_transform(data_pros)
# totWords = len(TotalWords_vectorizer.get_feature_names())
# tf_vectorizer = CountVectorizer(max_df = 0.90, min_df=0.01)
# tf = tf_vectorizer.fit_transform(data_pros_cleaned)
# tf_feature_names = tf_vectorizer.get_feature_names()

# percVoc = len(tf_feature_names) / totWords * 100
# percVoc

Run the models in parallel and return a dictionary with the output of the get_models function for each company.

In [46]:
%%time

# bad_output = {}
output_dictionary = {} # dictionary for the output

for sparse_tup in vectorizers_list:
    # to run get models in parallel, some parameters have to be fixed into a new function
    # so we do that here with partial
    partial_func = partial(get_models, tf=sparse_tup, tup_num=1)
#     try:
        # you will run several topics at a time for each company
    with cf.ProcessPoolExecutor() as e:
        output = list(e.map(partial_func, our_range))
    output_dictionary[sparse_tup[0]] = output
#     except:
#         bad_tup = vectorizers_list.pop(vectorizers_list.index(sparse_tup))
#         bad_output[bad_tup[0]] = bad_tup

CPU times: user 383 ms, sys: 590 ms, total: 973 ms
Wall time: 8min 10s


In [27]:
# type(bad_output)

You will now iterate over the output from above, add each dataset into a list, and then concatenate them all into one dataset.

In [47]:
%%time

# One frame per employer-year, built straight from its list of result tuples,
# then stacked into a single table.
result_columns = ['company', 'topics', 'coherence', 'models']
dfs_list = [
    pd.DataFrame(rows, columns=result_columns)
    for rows in output_dictionary.values()
]

output_dfs = pd.concat(dfs_list)
output_dfs.head(10)

CPU times: user 39 ms, sys: 4.3 ms, total: 43.3 ms
Wall time: 49.5 ms


Unnamed: 0,company,topics,coherence,models
0,Netflix_2014,2,0.1284,LatentDirichletAllocation(learning_method='onl...
1,Netflix_2014,10,0.243416,LatentDirichletAllocation(learning_method='onl...
2,Netflix_2014,50,0.298834,LatentDirichletAllocation(learning_method='onl...
3,Netflix_2014,100,0.251752,LatentDirichletAllocation(learning_method='onl...
4,Netflix_2014,150,0.199517,LatentDirichletAllocation(learning_method='onl...
5,Netflix_2014,200,0.173241,LatentDirichletAllocation(learning_method='onl...
6,Netflix_2014,250,0.142165,LatentDirichletAllocation(learning_method='onl...
7,Netflix_2014,300,0.132144,LatentDirichletAllocation(learning_method='onl...
0,Netflix_2008,2,0.103315,LatentDirichletAllocation(learning_method='onl...
1,Netflix_2008,10,0.191917,LatentDirichletAllocation(learning_method='onl...


The following loop iterates over the new dataframe, searches for the two topic counts with the highest coherence, and appends to a list a tuple containing the company, a tuple with those two topic counts, and the company's sparse matrix from the original `vectorizers_list`.

In [48]:
sorted_topics = []
for comp, tup in zip(unique_ids, vectorizers_list):
    the_data = output_dfs[output_dfs['company'] == comp] # rows of this employer-year only
    # Sort the rows themselves by coherence. The previous version fed the
    # positions returned by `argsort()` to the label-based `.loc`, which is only
    # correct while the index labels happen to equal the row positions.
    # Missing scores are placed first so they are never selected as a top value.
    top_topics = the_data.sort_values('coherence', na_position='first')['topics'].values
    # keep the two best topic counts and the company's sparse matrix
    sorted_topics.append((comp, (top_topics[-2], top_topics[-1]), tup[1]))

Now run the `get_models` function again over the new space of topics. You will need to:
1. sort the tuple with the top two topics.
2. create a linearly spaced array with 10 elements between the top 2 topics, turn it into integers, make the array a set to eliminate any duplicates that might arise if there is a 2 in the top two topics, and then turn that into a list.
3. get your fixed partial function again
4. the output is the same as before

In [49]:
%%time

output_dictionary2 = {}

for tup in sorted_topics:
    start, end = sorted(tup[1]) # since the top 2 topics might not be sorted, sort them first
    the_range = list(set(np.linspace(start, end, 10).astype(int))) 
    partial_func = partial(get_models, tf=tup, tup_num=2)
    with cf.ProcessPoolExecutor() as e:
        output = list(e.map(partial_func, the_range))
    output_dictionary2[tup[0]] = output

CPU times: user 274 ms, sys: 547 ms, total: 821 ms
Wall time: 2min 11s


Create multiple dataframes from dictionaries again and collapse them into 1.

In [50]:
%%time

dfs_list = []
for data in output_dictionary2.keys():
    temp_df = pd.DataFrame.from_dict(output_dictionary2[data])
    dfs_list.append(temp_df)

output_dfs = pd.concat(dfs_list)
output_dfs.columns = ['company', 'topics', 'coherence', 'models']
output_dfs.head(15)

CPU times: user 29.1 ms, sys: 3.19 ms, total: 32.2 ms
Wall time: 30.5 ms


Unnamed: 0,company,topics,coherence,models
0,Netflix_2014,66,0.291851,LatentDirichletAllocation(learning_method='onl...
1,Netflix_2014,100,0.251752,LatentDirichletAllocation(learning_method='onl...
2,Netflix_2014,72,0.274083,LatentDirichletAllocation(learning_method='onl...
3,Netflix_2014,77,0.277688,LatentDirichletAllocation(learning_method='onl...
4,Netflix_2014,50,0.298834,LatentDirichletAllocation(learning_method='onl...
5,Netflix_2014,83,0.259953,LatentDirichletAllocation(learning_method='onl...
6,Netflix_2014,55,0.288206,LatentDirichletAllocation(learning_method='onl...
7,Netflix_2014,88,0.248065,LatentDirichletAllocation(learning_method='onl...
8,Netflix_2014,61,0.272023,LatentDirichletAllocation(learning_method='onl...
9,Netflix_2014,94,0.252153,LatentDirichletAllocation(learning_method='onl...


Search for the best topic based on the new output, and get the top 10 words per topic. At the moment, you are only adding 1 of the topics for each company but you can change this by removing the indexing in `top_topics` below.

In [51]:
%%time

best_topics_model = defaultdict(tuple) # the output goes here

for company, vrizer in zip(unique_ids, vectorizers_list):
    cond = output_dfs['company'] == company # get each company
    filtered_data = output_dfs[cond] # to get a single dataframe
    the_topic = int(filtered_data.loc[filtered_data['coherence'].idxmax(), 'topics']) # get the best topic based on max coherence
    the_model = filtered_data.loc[filtered_data['coherence'].idxmax(), 'models'] # get the best model based on max coherence
    top_topics = show_topics(vrizer[2], the_model, 10) # get the top 10 words for each topic in each company
    best_topics_model[company] = (the_topic, the_model, top_topics[0])

CPU times: user 52.8 ms, sys: 7.65 ms, total: 60.4 ms
Wall time: 60 ms


Check out your output.

In [52]:
# Per employer-year: best topic count, its fitted model and the top words of the first topic.
best_topics_model

defaultdict(tuple,
            {'Netflix_2014': (50,
              LatentDirichletAllocation(learning_method='online', max_iter=100,
                                        n_components=50, random_state=1234),
              array(['posit', 'polit', 'fellow', 'remark', 'sever', 'reason', 'could',
                     'product', 'improv', 'stretch'], dtype='<U13')),
             'Netflix_2008': (50,
              LatentDirichletAllocation(learning_method='online', max_iter=100,
                                        n_components=50, random_state=1234),
              array(['work', 'cultur', 'ego', 'peopl', 'number', 'quickli', 'passion',
                     'talent', 'address', 'respons'], dtype='<U14')),
             'Netflix_2009': (50,
              LatentDirichletAllocation(learning_method='online', max_iter=100,
                                        n_components=50, random_state=1234),
              array(['good', 'custom', 'standard', 'delight', 'ive', 'right', 'wors',
        

Get the probabilities dataframes for each company and add them to a dictionary.

In [53]:
#generate matrix summarizing distribution of docs (reviews) over topics
# Document-topic matrix (one row per review) for every employer-year, obtained
# by passing its sparse matrix through the best model selected for it.
docs_of_probas = defaultdict(pd.DataFrame)

for comp_id, doc_term_matrix, fitted_vectorizer in vectorizers_list:
    best_model = best_topics_model[comp_id][1]
    docs_of_probas[comp_id] = pd.DataFrame(best_model.transform(doc_term_matrix))

Test it with any company.

In [None]:
# Look up a key that actually exists. `docs_of_probas` is a defaultdict, so a
# missing key such as 'Hays' would silently insert an empty DataFrame that the
# measure loops further down would then try to score.
docs_of_probas[unique_ids[0]].head()

### Calculate the measures of interest

In [54]:
%%time

comP_h_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    comP_h_results[company] = comph(proba_df.values)

CPU times: user 4.94 s, sys: 9.17 ms, total: 4.95 s
Wall time: 4.84 s


In [55]:
# `comph` score (mean pairwise `jsd` between document rows) for each employer-year.
comP_h_results

defaultdict(float,
            {'Netflix_2014': 0.6159537428190394,
             'Netflix_2008': 0.6234709516953487,
             'Netflix_2009': 0.6179573645221192,
             'Netflix_2010': 0.6113292241115493,
             'Netflix_2011': 0.5765823150356264,
             'Netflix_2012': 0.5986994562659044,
             'Netflix_2013': 0.5915132386324783,
             'Netflix_2015': 0.5813203068464268,
             'Netflix_2016': 0.604695711450533,
             'Netflix_2017': 0.5739279502998358,
             'Netflix_2018': 0.577656344202352})

In [56]:
%%time


comT_h_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    comT_h_results[company] = conth(proba_df)

CPU times: user 17.2 ms, sys: 0 ns, total: 17.2 ms
Wall time: 17 ms


In [57]:
# `conth` score for each employer-year.
comT_h_results

defaultdict(float,
            {'Netflix_2014': 1.1888427351671123,
             'Netflix_2008': 1.145146248507382,
             'Netflix_2009': 1.1620342844458302,
             'Netflix_2010': 1.1736365931630586,
             'Netflix_2011': 1.1478207553814113,
             'Netflix_2012': 1.192052936611861,
             'Netflix_2013': 1.1880075318092609,
             'Netflix_2015': 1.2756316247431176,
             'Netflix_2016': 1.24874917605445,
             'Netflix_2017': 1.3057914470387517,
             'Netflix_2018': 1.2872432056066183})

In [58]:
def ent_avg(probMatrix):
    """
    Average `scipy.stats.entropy` (natural log) of the entries
    `probMatrix[0] ... probMatrix[len(probMatrix) - 1]`.
    """
    per_row_entropy = [
        sp.stats.entropy(probMatrix[row_idx])
        for row_idx in range(len(probMatrix))
    ]
    return np.mean(per_row_entropy)

In [59]:
%%time

entropy_avg_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    entropy_avg_results[company] = ent_avg(proba_df.values)

CPU times: user 26.8 ms, sys: 247 µs, total: 27.1 ms
Wall time: 26.8 ms


In [60]:
# Mean per-document entropy (`ent_avg`) for each employer-year.
entropy_avg_results

defaultdict(float,
            {'Netflix_2014': 0.5913163664254844,
             'Netflix_2008': 0.4724837369413857,
             'Netflix_2009': 0.5252152141081612,
             'Netflix_2010': 0.5136035961652106,
             'Netflix_2011': 0.42430919282915924,
             'Netflix_2012': 0.5703767278887092,
             'Netflix_2013': 0.5326530746454763,
             'Netflix_2015': 0.7587162691431096,
             'Netflix_2016': 0.713202232190446,
             'Netflix_2017': 0.7769405041498997,
             'Netflix_2018': 0.7687024017462769})

In [61]:
# function to compute the cross-entropy of two probability distributions
def cross_entropy(p, q, eps=1e-12):
    """
    Cross-entropy, in bits, of distribution `q` relative to `p`.

    Parameters
    ----------
    p, q : array-like of equal length
        The two probability vectors. They are never modified.
    eps : float, default 1e-12
        Small constant added to every entry of both vectors so that
        `log2(0)` is never evaluated.

    Returns
    -------
    float
        -sum((p + eps) * log2(q + eps)).
    """
    # Work on shifted copies. The earlier version added `eps` to the caller's
    # arrays element by element, so every call permanently altered the rows of
    # the probability matrix it was given.
    p_shifted = np.asarray(p, dtype=float) + eps
    q_shifted = np.asarray(q, dtype=float) + eps
    return -np.sum(p_shifted * np.log2(q_shifted))

# function to compute the average cross-entropy of a matrix
def avg_crossEnt(probMatrix):
    """
    Mean of `cross_entropy(probMatrix[i], probMatrix[j])` over all ordered
    pairs with i != j.

    Cross-entropy is not symmetric, so both (i, j) and (j, i) contribute.
    """
    n_rows = len(probMatrix)
    crossEntropy_list = [
        cross_entropy(probMatrix[i], probMatrix[j])
        for i in range(n_rows)
        for j in range(n_rows)
        if i != j
    ]
    return np.mean(crossEntropy_list)

In [62]:
%%time

cross_entropy_results = defaultdict(float)

for company, proba_df in docs_of_probas.items():
    cross_entropy_results[company] = avg_crossEnt(proba_df.values)
    
# avg_crossEnt(docs_topics_df.values)

CPU times: user 8.37 s, sys: 0 ns, total: 8.37 s
Wall time: 8.4 s


In [63]:
# Average pairwise cross-entropy (`avg_crossEnt`) for each employer-year.
cross_entropy_results

defaultdict(float,
            {'Netflix_2014': 9.282826670317775,
             'Netflix_2008': 9.75645409295032,
             'Netflix_2009': 9.47084535777085,
             'Netflix_2010': 8.677540106866953,
             'Netflix_2011': 7.655200136887355,
             'Netflix_2012': 8.436132466440842,
             'Netflix_2013': 7.948402588834676,
             'Netflix_2015': 8.71140154771855,
             'Netflix_2016': 9.071257343746264,
             'Netflix_2017': 8.035673195990254,
             'Netflix_2018': 7.935449449271206})