In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from collections import Counter
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import Lasso

# Data Loading and Preprocessing

Load the KLD scores and the metadata for each book.

In [None]:
#KLD scores dataset
df = pd.read_csv("KLDscores.csv")

In [None]:
df

Unnamed: 0,filename,kld_values
0,PG10002,"[0.22391005737243896, 0.24226261808703536, 0.2..."
1,PG10005,"[0.24107767463211327, 0.24747085497572513, 0.2..."
2,PG10003,"[0.2502283960399736, 0.2304129699198611, 0.238..."
3,PG10008,"[0.2576982842724978, 0.2424932127358288, 0.220..."
4,PG10012,"[0.25125974534678364, 0.23622148585532693, 0.2..."
...,...,...
23188,PG9971,"[0.22838257901564088, 0.21120893011566938, 0.2..."
23189,PG9993,"[0.2471592500558816, 0.1970731579466416, 0.291..."
23190,PG9983,"[0.21198661056119145, 0.21617505920334878, 0.2..."
23191,PG996,"[0.24116003079407344, 0.21510152162479515, 0.2..."


In [None]:
df.dtypes

filename      object
kld_values    object
dtype: object

In [None]:
#metadata csv (renamed file in g drive)
df2 = pd.read_csv("metadata.csv")

In [None]:
df2

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
0,PG0,,,,,,,set(),Text
1,PG1,The Declaration of Independence of the United ...,"Jefferson, Thomas",1743.0,1826.0,['en'],604.0,"{'United States -- History -- Revolution, 1775...",Text
2,PG2,The United States Bill of Rights: The Ten Orig...,United States,,,['en'],158.0,"{'Civil rights -- United States -- Sources', '...",Text
3,PG3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,['en'],28.0,{'Presidents -- United States -- Inaugural add...,Text
4,PG4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",1809.0,1865.0,['en'],55.0,{'Consecration of cemeteries -- Pennsylvania -...,Text
...,...,...,...,...,...,...,...,...,...
57708,PG57710,A Son of the State,"Ridge, W. Pett (William Pett)",,1930.0,['en'],0.0,set(),Text
57709,PG57711,Hudson Tercentenary: An historical retrospect ...,"Chamberlain, Frank",,,['en'],0.0,set(),Text
57710,PG57712,Proses moroses,"Gourmont, Remy de",1858.0,1915.0,['fr'],0.0,set(),Text
57711,PG57713,The Animal Parasites of Man,"Theobald, F. V.",,,['en'],0.0,set(),Text


In [None]:
df['filename'].isin(df2['id']).value_counts()

filename
True     18988
False     4205
Name: count, dtype: int64

In [None]:
# Keep only books present in both the KLD scores and the metadata (inner join on the book id).
merged = df.merge(df2, how='inner', left_on='filename', right_on='id')

In [None]:
# 'id' duplicates 'filename' after the join, so it is dropped.
merged = merged.drop(columns=['id'])

In [None]:
#merged metadata and KLD scores
merged

Unnamed: 0,filename,kld_values,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
0,PG10002,"[0.22391005737243896, 0.24226261808703536, 0.2...",The House on the Borderland,"Hodgson, William Hope",1877.0,1918.0,['en'],593.0,{'Science fiction'},Text
1,PG10005,"[0.24107767463211327, 0.24747085497572513, 0.2...",A Voyage to the Moon: With Some Account of the...,"Tucker, George",1775.0,1861.0,['en'],17.0,"{'Space flight to the moon -- Fiction', 'Scien...",Text
2,PG10003,"[0.2502283960399736, 0.2304129699198611, 0.238...","My First Years as a Frenchwoman, 1876-1879","Waddington, Mary King",1833.0,1923.0,['en'],11.0,"{'France -- History -- Third Republic, 1870-19...",Text
3,PG10008,"[0.2576982842724978, 0.2424932127358288, 0.220...",The Mystery,"White, Stewart Edward",1873.0,1946.0,['en'],47.0,{'Science fiction'},Text
4,PG10012,"[0.25125974534678364, 0.23622148585532693, 0.2...",The Mountains of California,"Muir, John",1838.0,1914.0,['en'],93.0,"{'Natural history -- California', 'Mountain ec...",Text
...,...,...,...,...,...,...,...,...,...,...
18983,PG9971,"[0.22838257901564088, 0.21120893011566938, 0.2...","The Dramatic Works of Gerhart Hauptmann, Volume I","Hauptmann, Gerhart",1862.0,1946.0,['en'],25.0,{'German drama -- Translations into English'},Text
18984,PG9993,"[0.2471592500558816, 0.1970731579466416, 0.291...",Captivating Mary Carstairs,"Harrison, Henry Sydnor",1880.0,1930.0,['en'],8.0,{'Fiction'},Text
18985,PG9983,"[0.21198661056119145, 0.21617505920334878, 0.2...",Wylder's Hand,"Le Fanu, Joseph Sheridan",1814.0,1873.0,['en'],28.0,"{'Inheritance and succession -- Fiction', 'Mis...",Text
18986,PG996,"[0.24116003079407344, 0.21510152162479515, 0.2...",Don Quixote,"Cervantes Saavedra, Miguel de",1547.0,1616.0,['en'],4257.0,{'Spain -- Social life and customs -- 16th cen...,Text


# Building book-level measures of KL divergence characteristics
Characteristics considered:

1. **Average KL Divergence:** Represents the average amount of information revealed per section
2. **Standard Deviation of KL Divergence:** This measures the variability of information revelation across sections
3. **Slope KL Divergence:** Refers to the average rate of change in information revelation across different sections of a book.
4. **Maximum KL Divergence:** Identifies the section with the highest information revelation, potentially corresponding to a major plot twist or revelation.
5. **Number of Sections with High KL Divergence:** Counts how many sections fall above a particular threshold, indicating the frequency of significant information reveals.

In [None]:
#list of various characteristics of KL Divergence to be built for each book
avg_kld = []
std_kld = []
slope_kld = []
max_kld = []
num_high_kld = []
book_ids = []

In [None]:
# Empty the accumulators first so re-running this cell does not append a
# second copy of every book.
for accumulator in (book_ids, avg_kld, std_kld, slope_kld, max_kld, num_high_kld):
  accumulator.clear()

# NOTE(review): eval() executes whatever text is stored in the CSV. It is kept
# because the column holds stringified Python lists, but it must only be run on
# trusted files (ast.literal_eval is the safe alternative).
kld_by_book = [np.array(eval(raw_values)) for raw_values in merged['kld_values']]

# A per-book 90th percentile always flags the same share of a book's sections,
# which made num_high_kld the same number for every book. One corpus-wide
# threshold lets the count vary with how often a book has high-KLD sections.
high_kld_threshold = np.percentile(np.concatenate(kld_by_book), 90)

for book_id, kld in zip(merged['filename'], kld_by_book):
  # Slope of a straight line fitted to KLD against section position.
  section_positions = np.arange(len(kld)).reshape(-1, 1)
  trend = LinearRegression().fit(section_positions, kld)

  # Values are appended directly instead of being bound to `id` / `max`,
  # which would shadow the builtins for every later cell.
  book_ids.append(book_id)
  avg_kld.append(np.mean(kld))
  std_kld.append(np.std(kld))
  slope_kld.append(trend.coef_[0])
  max_kld.append(np.max(kld))
  num_high_kld.append(np.sum(kld > high_kld_threshold))

In [None]:
# One row per book: the id plus each summary statistic collected in the loop.
kld_measure_columns = dict(
    filename=book_ids,
    avg_kld=avg_kld,
    std_kld=std_kld,
    slope_kld=slope_kld,
    max_kld=max_kld,
    num_high_kld=num_high_kld,
)
kld_measures = pd.DataFrame(kld_measure_columns)

In [None]:
kld_measures

In [None]:
# Attach the per-book KLD statistics to the merged metadata, matching on the book id.
merged = merged.merge(kld_measures, how='inner', on='filename')

In [None]:
merged

# Relating book-level KLD measures to downloads by regressing log10(downloads + 1) on the measures


Included Language as a control variable, as downloads can also depend on the language the book was written in

In [None]:
# log10(downloads + 1): the +1 keeps books with zero downloads finite (log10(1) = 0).
merged["log_downloads"] = np.log10(1 + merged["downloads"])

In [None]:
merged['language'].value_counts()

language
['en']                18955
['la', 'en']              6
['fr', 'en']              4
['es', 'en']              3
['en', 'la']              2
['en', 'de']              2
['de', 'en']              2
['zh', 'en']              1
['en', 'fr']              1
['en', 'eo']              1
['grc', 'en']             1
['en', 'el']              1
['es', 'en', 'fr']        1
['en', 'pt']              1
['enm']                   1
['en', 'tl', 'es']        1
['en', 'myn']             1
['en', 'cy']              1
['en', 'es']              1
['en', 'kha']             1
['en', 'ang']             1
Name: count, dtype: int64

In [None]:
# Turn the stringified language lists (e.g. "['en', 'fr']") into real lists.
# Values that are already parsed are passed through unchanged, so re-running
# this cell no longer fails with "eval() arg 1 must be a string".
# NOTE(review): eval() executes the cell text as Python; only safe on a trusted
# CSV (ast.literal_eval is the safe alternative).
merged['language'] = merged['language'].apply(
    lambda value: eval(value) if isinstance(value, str) else value
)

In [None]:
merged[['language', 'avg_kld', 'std_kld', 'slope_kld', 'max_kld', 'log_downloads']]

Unnamed: 0,language,avg_kld,std_kld,slope_kld,max_kld,log_downloads
0,[en],0.234033,0.040543,0.000870,0.450747,2.773786
1,[en],0.243351,0.033243,0.000143,0.390158,1.255273
2,[en],0.241611,0.059772,0.001259,0.541167,1.079181
3,[en],0.240153,0.047394,0.000813,0.548940,1.681241
4,[en],0.226469,0.015966,0.000054,0.276054,1.973128
...,...,...,...,...,...,...
18983,[en],0.251995,0.041667,0.000261,0.446346,1.414973
18984,[en],0.234510,0.025177,0.000134,0.338172,0.954243
18985,[en],0.228159,0.028720,0.000612,0.406871,1.462398
18986,[en],0.208989,0.013774,-0.000162,0.252348,3.629206


In [None]:
# encoding the various languages
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(merged['language'])

In [None]:
feature_names = mlb.classes_

In [None]:
# Give the indicator frame the same index as `merged`: concat(axis=1) aligns on
# index labels, so a default 0..n-1 index would only line up while `merged`
# happens to carry the same labels.
one_hot_df = pd.DataFrame(one_hot_encoded, columns=feature_names, index=merged.index)
merged = pd.concat([one_hot_df, merged], axis=1)

In [None]:
merged.columns

Index(['ang', 'cy', 'de', 'el', 'en', 'enm', 'eo', 'es', 'fr', 'grc', 'kha',
       'la', 'myn', 'pt', 'tl', 'zh', 'filename', 'kld_values', 'title',
       'author', 'authoryearofbirth', 'authoryearofdeath', 'language',
       'downloads', 'subjects', 'type', 'avg_kld', 'std_kld', 'slope_kld',
       'max_kld', 'num_high_kld', 'log_downloads'],
      dtype='object')

In [None]:
# The list-valued 'language' column is now represented by the indicator columns.
merged = merged.drop(columns=['language'])

In [None]:
merged

Unnamed: 0,ang,cy,de,el,en,enm,eo,es,fr,grc,...,authoryearofdeath,downloads,subjects,type,avg_kld,std_kld,slope_kld,max_kld,num_high_kld,log_downloads
0,0,0,0,0,1,0,0,0,0,0,...,1918.0,593.0,{'Science fiction'},Text,0.234033,0.040543,0.000870,0.450747,5,2.773786
1,0,0,0,0,1,0,0,0,0,0,...,1861.0,17.0,"{'Space flight to the moon -- Fiction', 'Scien...",Text,0.243351,0.033243,0.000143,0.390158,5,1.255273
2,0,0,0,0,1,0,0,0,0,0,...,1923.0,11.0,"{'France -- History -- Third Republic, 1870-19...",Text,0.241611,0.059772,0.001259,0.541167,5,1.079181
3,0,0,0,0,1,0,0,0,0,0,...,1946.0,47.0,{'Science fiction'},Text,0.240153,0.047394,0.000813,0.548940,5,1.681241
4,0,0,0,0,1,0,0,0,0,0,...,1914.0,93.0,"{'Natural history -- California', 'Mountain ec...",Text,0.226469,0.015966,0.000054,0.276054,5,1.973128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18983,0,0,0,0,1,0,0,0,0,0,...,1946.0,25.0,{'German drama -- Translations into English'},Text,0.251995,0.041667,0.000261,0.446346,5,1.414973
18984,0,0,0,0,1,0,0,0,0,0,...,1930.0,8.0,{'Fiction'},Text,0.234510,0.025177,0.000134,0.338172,5,0.954243
18985,0,0,0,0,1,0,0,0,0,0,...,1873.0,28.0,"{'Inheritance and succession -- Fiction', 'Mis...",Text,0.228159,0.028720,0.000612,0.406871,5,1.462398
18986,0,0,0,0,1,0,0,0,0,0,...,1616.0,4257.0,{'Spain -- Social life and customs -- 16th cen...,Text,0.208989,0.013774,-0.000162,0.252348,5,3.629206


In [None]:
# taking books written in english as the control variable, along with other KLD measure as independent variables
X = merged[["en", "avg_kld", "std_kld", "max_kld", "slope_kld"]]

In [None]:
model = LinearRegression()
model.fit(X, merged["log_downloads"])

In [None]:
model.coef_

array([ -1.79797343,   0.34304126,   2.62231218,  -0.47832542,
       -31.48056701])

In [None]:
print('model intercept :', model.intercept_)
print('model coefficients : ', model.coef_)

model intercept : 3.2302154909425127
model coefficients :  [ -1.79797343   0.34304126   2.62231218  -0.47832542 -31.48056701]


**avg_kld:** *Coefficient: 0.34304126 (positive)*

This positive coefficient suggests that books with a higher average KL divergence tend to have more downloads, on average. This implies that readers might prefer books with a consistent flow of new information revealed throughout the sections

**std_kld:** *Coefficient: 2.62231218 (positive)*

This positive coefficient suggests that books with a higher standard deviation of KL divergence tend to be more popular. This indicates that readers might enjoy books that alternate between sections with high and low information reveal, keeping them engaged and surprised.

**max_kld:** *Coefficient: -0.47832542 (negative)*

This negative coefficient suggests that books with a very high maximum KL divergence (extreme information reveal in one section) might be slightly less popular on average. This could be because such a drastic information shift might disrupt the reading flow for some readers.

**slope_kld:** *Coefficient: -31.48056701 (negative)*

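The slope of KL divergence might be highly correlated with other KLD measures (e.g., average KL divergence) in the data. The size of this coefficient should also be read with the variable's scale in mind: the slope values in the table above are on the order of 0.0001–0.001, so a one-unit change in slope lies far outside the observed range, and the coefficient is not directly comparable in magnitude with those of the other measures.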

# Using LASSO to select the variables that have the largest effect on book downloads

In [None]:
merged['subjects']

0                                      {'Science fiction'}
1        {'Space flight to the moon -- Fiction', 'Scien...
2        {'France -- History -- Third Republic, 1870-19...
3                                      {'Science fiction'}
4        {'Natural history -- California', 'Mountain ec...
                               ...                        
18983        {'German drama -- Translations into English'}
18984                                          {'Fiction'}
18985    {'Inheritance and succession -- Fiction', 'Mis...
18986    {'Spain -- Social life and customs -- 16th cen...
18987                 {'English language -- Dictionaries'}
Name: subjects, Length: 18988, dtype: object

In [None]:
stop_words = {'and', 'of', 'the', 'is', 'was', 'to', 'in', 'on', 'for', 'with', 'by', 'a', 'an' ,'i', 'am', 'th', 'de'}

In [None]:
def tokenize_subjects(subjects, stop_words):
    """Turn a collection of subject strings into a flat list of lowercase word tokens.

    Entries that are not strings are skipped. The remaining strings are joined
    with spaces, every character that is not an ASCII letter or whitespace is
    removed, the text is lowercased and split on whitespace, and tokens found in
    ``stop_words`` are dropped.
    """
    text_entries = (entry for entry in subjects if isinstance(entry, str))
    combined = ' '.join(text_entries)
    letters_only = re.sub(r'[^A-Za-z\s]', '', combined).lower()
    return [word for word in letters_only.split() if word not in stop_words]

In [None]:
# Parse each stringified subject set, flatten to one subject per entry, then tokenize.
# NOTE(review): eval() executes the CSV text as Python; only use on trusted data.
parsed_subject_sets = merged['subjects'].apply(eval)
all_subjects = parsed_subject_sets.explode().tolist()
tokens = tokenize_subjects(all_subjects, stop_words)

In [None]:
common_tokens = Counter(tokens).most_common(50)
print(common_tokens)


[('fiction', 19319), ('juvenile', 5743), ('history', 4097), ('stories', 2866), ('life', 2631), ('england', 2062), ('states', 1940), ('english', 1869), ('century', 1850), ('united', 1815), ('great', 1505), ('britain', 1469), ('travel', 1465), ('social', 1460), ('war', 1346), ('biography', 1280), ('description', 1255), ('customs', 1156), ('literature', 1126), ('periodicals', 992), ('women', 890), ('american', 853), ('france', 795), ('new', 700), ('world', 656), ('adventure', 630), ('conduct', 608), ('mystery', 561), ('translations', 545), ('into', 544), ('america', 521), ('poetry', 516), ('short', 504), ('science', 484), ('detective', 451), ('love', 442), ('criticism', 438), ('scotland', 425), ('young', 415), ('early', 405), ('indians', 396), ('york', 380), ('north', 367), ('politics', 365), ('government', 365), ('works', 354), ('western', 343), ('historical', 342), ('civil', 337), ('south', 336)]


In [None]:
def extract_genres(subjects, common_tokens):
    """Label a book with the common subject tokens that occur in its subjects.

    Args:
        subjects: iterable of subject strings for one book.
        common_tokens: iterable of ``(token, count)`` pairs, e.g. the output of
            ``Counter.most_common``; only the token is used.

    Returns:
        The matching tokens joined by ``', '`` in alphabetical order, or
        ``'Other'`` when no token matches.
    """
    # Compare whole lowercase words, cleaned the same way tokenize_subjects
    # cleans them, so 'war' no longer matches inside 'Edward' and 'america' no
    # longer matches inside 'american'.
    subject_words = set()
    for subject in subjects:
        subject_words.update(re.sub(r'[^A-Za-z\s]', '', subject).lower().split())

    genres = {token for token, _ in common_tokens if token in subject_words}
    # Sort before joining: set iteration order is arbitrary, so the same genre
    # combination could otherwise be spelled in different orders and end up as
    # several distinct labels.
    return ', '.join(sorted(genres)) if genres else 'Other'


In [None]:
# Parse each stringified subject set and map it to its genre label.
# NOTE(review): eval() executes the CSV text as Python; only use on trusted data.
merged['genres'] = [
    extract_genres(eval(raw_subjects), common_tokens)
    for raw_subjects in merged['subjects']
]

In [None]:
genres_dummies = pd.get_dummies(merged["genres"])

In [None]:
# NOTE(review): this rebinds `kld_measures`, which earlier held the per-book
# statistics DataFrame; the name is kept so any later use of the list still works.
kld_measures = ["avg_kld", "std_kld", "max_kld", "slope_kld"]

# Build every measure x genre interaction first and attach them with a single
# concat. Inserting the columns one at a time fragments the frame and makes
# pandas emit a PerformanceWarning for each insert.
interaction_terms = {
  f"{kld_measure}*{genre}": merged[kld_measure] * genres_dummies[genre]
  for kld_measure in kld_measures
  for genre in genres_dummies.columns
}
merged = pd.concat(
  [
    # Drop interaction columns left by an earlier run so the cell stays re-runnable.
    merged.drop(columns=list(interaction_terms), errors="ignore"),
    pd.DataFrame(interaction_terms, index=merged.index),
  ],
  axis=1,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[kld_measure] * genres_dummies[genre]
  merged[f"{kld_measure}*{genre}"] = merged[

Using LASSO for original feature set

In [None]:
 X_1 = merged[["en", "avg_kld", "std_kld", "max_kld", "slope_kld"]]

In [None]:
lasso_model_1 = Lasso(alpha=0.0001)
lasso_model_1.fit(X_1, merged["log_downloads"])

In [None]:
lasso_important_features_kld = [name for name, coef in zip(X_1.columns, lasso_model_1.coef_) if coef != 0]
print("Important Features according to LASSO:", lasso_important_features_kld)

Important Features according to LASSO: ['avg_kld', 'max_kld']


**Without Genre:**

Important Features: LASSO identified "avg_kld" (average information reveal) and "max_kld" (maximum information reveal in a section) as the most important features for predicting log(downloads). This suggests that overall information flow and moments of significant new information introduction might be important for book popularity, regardless of genre.

Using LASSO for KLD measures along with the genres

In [None]:
features = ["en", "avg_kld", "std_kld", "max_kld", "slope_kld"] + list(merged.filter(like='*'))
X = merged[features]

In [None]:
lasso_model = Lasso(alpha=0.0001)
lasso_model.fit(X, merged["log_downloads"])

In [None]:
lasso_important_features = [name for name, coef in zip(X.columns, lasso_model.coef_) if coef != 0]
print("Important Features according to LASSO:", lasso_important_features)

Important Features according to LASSO: ['max_kld', 'avg_kld*english', 'avg_kld*fiction', 'avg_kld*fiction, juvenile', 'avg_kld*periodicals', 'avg_kld*travel, description', 'max_kld*Other', 'max_kld*america, american', 'max_kld*america, indians, north', 'max_kld*american, social, america, biography', 'max_kld*biography', 'max_kld*britain, great', 'max_kld*britain, great, biography', 'max_kld*britain, great, history, conduct, england, fiction, life', 'max_kld*britain, history, great', 'max_kld*civil', 'max_kld*customs, social, life', 'max_kld*england', 'max_kld*english', 'max_kld*english, century', 'max_kld*english, into, translations', 'max_kld*english, poetry', 'max_kld*english, poetry, into, translations', 'max_kld*english, translations, into, literature', 'max_kld*fiction', 'max_kld*fiction, britain, history, great', 'max_kld*fiction, century, english', 'max_kld*fiction, conduct, juvenile, life', 'max_kld*fiction, customs, social, life', 'max_kld*fiction, france', 'max_kld*fiction, h

**Specific Genre Insights:**

***Max_kld Dominance:***

Interestingly, most significant features involve "max_kld" interacting with various genres. This suggests that for many genres (e.g., biography, history, fiction), moments of significant information introduction might be particularly important for downloads.

***Genre-Specific Patterns***:
Some specific interaction terms with "**avg_kld**" might be worth investigating further

 **avg_kld - description, travel**  - This could indicate that a consistent flow of new information is important for travel books with descriptions.

**avg_kld - fiction variations** - These might reveal how information reveal patterns differ across fiction subgenres (juvenile, mystery, etc.).