In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from scipy.stats import skew, kurtosis

In [2]:
# Load datasets
metadata = pd.read_csv('C://Users//riyac//Downloads//data//data//SPGC-metadata-2018-07-18.csv')
kld_scores = pd.read_csv('C:/Users//riyac//Downloads//data//data//KLDscores.csv')

In [3]:
# Display initial information about the datasets
print("Metadata info before handling null values:")
print(metadata.info())
print("\nKLD Scores info before handling null values:")
print(kld_scores.info())

Metadata info before handling null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57713 entries, 0 to 57712
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 57713 non-null  object 
 1   title              57642 non-null  object 
 2   author             55451 non-null  object 
 3   authoryearofbirth  42946 non-null  float64
 4   authoryearofdeath  41850 non-null  float64
 5   language           57711 non-null  object 
 6   downloads          57711 non-null  float64
 7   subjects           57713 non-null  object 
 8   type               57713 non-null  object 
dtypes: float64(3), object(6)
memory usage: 4.0+ MB
None

KLD Scores info before handling null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23193 entries, 0 to 23192
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    23193 non-nul

In [4]:
# Handling null values in metadata
# Drop rows where essential information (id, title) is missing
metadata.dropna(subset=['id', 'title'], inplace=True)

# Fill null values in numerical columns with their mean or median
numerical_cols = ['authoryearofbirth', 'authoryearofdeath', 'downloads']
for col in numerical_cols:
    if col in metadata.columns:
        metadata[col].fillna(metadata[col].median(), inplace=True)

In [5]:
# Handling null values in kld_scores
# Drop rows where essential information (filename, kld_values) is missing
kld_scores.dropna(subset=['filename', 'kld_values'], inplace=True)

# Convert kld_values from string representation of list to actual list
kld_scores['kld_values'] = kld_scores['kld_values'].apply(lambda x: eval(x) if pd.notnull(x) else [])

In [6]:
# Display information after handling null values
print("\nMetadata info after handling null values:")
print(metadata.info())
print("\nKLD Scores info after handling null values:")
print(kld_scores.info())


Metadata info after handling null values:
<class 'pandas.core.frame.DataFrame'>
Index: 57642 entries, 1 to 57712
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 57642 non-null  object 
 1   title              57642 non-null  object 
 2   author             55451 non-null  object 
 3   authoryearofbirth  57642 non-null  float64
 4   authoryearofdeath  57642 non-null  float64
 5   language           57641 non-null  object 
 6   downloads          57642 non-null  float64
 7   subjects           57642 non-null  object 
 8   type               57642 non-null  object 
dtypes: float64(3), object(6)
memory usage: 4.4+ MB
None

KLD Scores info after handling null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23193 entries, 0 to 23192
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    23193 non-null  obj

In [7]:
# Function to calculate book-level measures
def calculate_kld_measures(kld_values):
    kld_array = np.array(kld_values)
    measures = {}
    if len(kld_array) == 0:  # Handle case with empty kld_values
        measures['skewness_kld'] = np.nan
        measures['kurtosis_kld'] = np.nan
        measures['cumulative_kld'] = np.nan
        measures['rolling_mean_kld'] = np.nan
    else:
        measures['skewness_kld'] = skew(kld_array)
        measures['kurtosis_kld'] = kurtosis(kld_array)
        measures['cumulative_kld'] = np.sum(kld_array)
        measures['rolling_mean_kld'] = np.mean(pd.Series(kld_array).rolling(window=5, min_periods=1).mean())
    return measures

In [8]:
# Apply the function to calculate measures for each book
kld_measures = kld_scores['kld_values'].apply(lambda x: pd.Series(calculate_kld_measures(x)))

In [9]:
# Combine the new measures with the original kld_scores dataframe
kld_scores = pd.concat([kld_scores, kld_measures], axis=1)

In [10]:
# Merge with metadata
metadata = metadata.merge(kld_scores, left_on='id', right_on='filename', how='left')

In [11]:
# Display the result
print(metadata)

             id                                              title  \
0           PG1  The Declaration of Independence of the United ...   
1           PG2  The United States Bill of Rights: The Ten Orig...   
2           PG3                John F. Kennedy's Inaugural Address   
3           PG4  Lincoln's Gettysburg Address: Given November 1...   
4           PG5                     The United States Constitution   
...         ...                                                ...   
57637   PG57710                                 A Son of the State   
57638   PG57711  Hudson Tercentenary: An historical retrospect ...   
57639   PG57712                                     Proses moroses   
57640   PG57713                        The Animal Parasites of Man   
57641  PG999999                                      Piccole anime   

                                   author  authoryearofbirth  \
0                       Jefferson, Thomas             1743.0   
1                           Uni

In [12]:
# # Convert kld_values from string representation of lists to actual lists
# def convert_to_list(x):
#     if isinstance(x, str):
#         return np.array(eval(x))
#     return np.array(x)

In [15]:
kld_scores['kld_values'] = kld_scores['kld_values'].apply(convert_to_list)

NameError: name 'convert_to_list' is not defined

In [14]:
kld_scores['kld_values']

0        [0.22391005737243896, 0.24226261808703536, 0.2...
1        [0.24107767463211327, 0.24747085497572513, 0.2...
2        [0.2502283960399736, 0.2304129699198611, 0.238...
3        [0.2576982842724978, 0.2424932127358288, 0.220...
4        [0.25125974534678364, 0.23622148585532693, 0.2...
                               ...                        
23188    [0.22838257901564088, 0.21120893011566938, 0.2...
23189    [0.2471592500558816, 0.1970731579466416, 0.291...
23190    [0.21198661056119145, 0.21617505920334878, 0.2...
23191    [0.24116003079407344, 0.21510152162479515, 0.2...
23192    [0.20625582936128445, 0.20509986575511333, 0.1...
Name: kld_values, Length: 23193, dtype: object

In [12]:
# Calculate book-level measures
kld_summary = kld_scores.copy()
kld_summary['avg_KLD'] = kld_summary['kld_values'].apply(np.mean)
kld_summary['var_KLD'] = kld_summary['kld_values'].apply(np.var)
kld_summary['slope_KLD'] = kld_summary['kld_values'].apply(lambda x: np.polyfit(range(len(x)), x, 1)[0])

In [23]:
# Function to calculate rolling statistics
def rolling_statistics(kld_values, window=5):
    return pd.Series(kld_values).rolling(window).mean().tolist()

In [24]:
# Calculate skewness, kurtosis, cumulative KLD, and rolling statistics
def calculate_kld_metrics(kld_scores):
    kld_scores['skewness'] = kld_scores['kld_values'].apply(lambda x: skew(x))
    kld_scores['kurtosis'] = kld_scores['kld_values'].apply(lambda x: kurtosis(x))
    kld_scores['cumulative_kld'] = kld_scores['kld_values'].apply(lambda x: np.sum(x))
    kld_scores['rolling_mean_kld'] = kld_scores['kld_values'].apply(lambda x: rolling_statistics(x, window=5))
    return kld_scores

In [25]:
# Apply the function to your KLD scores DataFrame
kld_scores = calculate_kld_metrics(kld_scores)

In [16]:
# # Merge with metadata
# data = pd.merge(metadata, kld_summary[['id', 'avg_KLD', 'var_KLD', 'slope_KLD']], on='id')

KeyError: "['id'] not in index"

In [26]:
# Merging with metadata (assuming 'filename' is the common key)
full_data = kld_scores.merge(metadata, left_on='filename', right_on='id')

In [29]:
# Display the updated DataFrame with new metrics
print(full_data.head())

  filename                                         kld_values  skewness  \
0  PG10002  [0.22391005737243896, 0.24226261808703536, 0.2...  3.214062   
1  PG10005  [0.24107767463211327, 0.24747085497572513, 0.2...  1.876079   
2  PG10003  [0.2502283960399736, 0.2304129699198611, 0.238...  4.132600   
3  PG10008  [0.2576982842724978, 0.2424932127358288, 0.220...  5.612184   
4  PG10012  [0.25125974534678364, 0.23622148585532693, 0.2...  0.209351   

    kurtosis  cumulative_kld  \
0  14.083175       11.467601   
1   5.665046       11.924212   
2  16.760701       11.838955   
3  33.819186       11.767495   
4   0.697714       11.096964   

                                    rolling_mean_kld       id  \
0  [nan, nan, nan, nan, 0.23940459513169365, 0.23...  PG10002   
1  [nan, nan, nan, nan, 0.23598490054014315, 0.23...  PG10005   
2  [nan, nan, nan, nan, 0.23609171502086213, 0.22...  PG10003   
3  [nan, nan, nan, nan, 0.23145401581378824, 0.22...  PG10008   
4  [nan, nan, nan, nan, 0.23683

In [30]:
# Adding log(downloads) to the DataFrame
full_data['log_downloads'] = np.log(full_data['downloads'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [31]:
# Preparing the data for regression
X = full_data[['skewness', 'kurtosis', 'cumulative_kld']]  # Include other controls if available
X = sm.add_constant(X)  # Adds a constant term to the predictors
y = full_data['log_downloads']

In [32]:
# Running the regression
model = sm.OLS(y, X).fit()

In [33]:
# Display the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          log_downloads   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Fri, 28 Jun 2024   Prob (F-statistic):                nan
Time:                        17:35:40   Log-Likelihood:                    nan
No. Observations:               18988   AIC:                               nan
Df Residuals:                   18984   BIC:                               nan
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const                 nan        nan        

  return np.sum(weights * (model.endog - mean)**2)


In [37]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import skew, kurtosis

# Load datasets
metadata = pd.read_csv('C://Users//riyac//Downloads//data//data//SPGC-metadata-2018-07-18.csv')
kld_scores = pd.read_csv('C:/Users//riyac//Downloads//data//data//KLDscores.csv')



In [38]:
# Rename the 'filename' column in kld_scores to 'id' for consistency
kld_scores.rename(columns={'filename': 'id'}, inplace=True)

In [39]:
# Merge datasets on the 'id' column
merged_data = pd.merge(metadata, kld_scores, on='id', how='left')

In [40]:
print(merged_data.columns)
print(merged_data.shape)

Index(['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath',
       'language', 'downloads', 'subjects', 'type', 'kld_values'],
      dtype='object')
(57713, 10)


In [41]:
# Load the new dataset
additional_data = pd.read_csv('C://Users//riyac//Downloads//data//data//extra_controls.csv')  

# Merge with existing data
new_merged_data = pd.merge(merged_data, additional_data, on='id', how='left')

# Ensure all columns are numeric
new_merged_data = new_merged_data.apply(pd.to_numeric, errors='coerce')

print(new_merged_data.columns)
print(new_merged_data.shape)


Index(['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath',
       'language', 'downloads', 'subjects', 'type', 'kld_values', 'subj2_war',
       'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance',
       'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction',
       'subj2_action', 'subj2_thriller', 'subj2_western', 'subj2_horror',
       'subj2_mystery', 'subj2_crime', 'subj2_history', 'subj2_periodicals',
       'subj2_others', 'speed', 'sentiment_avg', 'sentiment_vol', 'wordcount'],
      dtype='object')
(57713, 32)


In [49]:
genre_columns = [
    'subj2_war', 'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance', 
    'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction', 'subj2_action', 
    'subj2_thriller', 'subj2_western', 'subj2_horror', 'subj2_mystery', 'subj2_crime', 
    'subj2_history', 'subj2_periodicals', 'subj2_others'
]

# Create a mask for the rows where 'downloads' is missing
missing_downloads_mask = new_merged_data['downloads'].isnull()

# Define the function to calculate the mean downloads for the genres
def get_genre_mean(row):
    genres = [col for col in genre_columns if row[col] == 1]
    if genres:
        genre_mean = new_merged_data[new_merged_data[genres].sum(axis=1) > 0]['downloads'].mean()
        return genre_mean
    else:
        return new_merged_data['downloads'].mean()
    
# Apply the function to rows with missing downloads
new_merged_data.loc[missing_downloads_mask, 'downloads'] = new_merged_data[missing_downloads_mask].apply(get_genre_mean, axis=1)

# Check for any remaining missing values
print(new_merged_data['downloads'].isnull().sum())

# Verify the imputed values
print(new_merged_data[missing_downloads_mask][['id', 'downloads']])

# # Define the function to calculate the mean downloads for the genres
# def get_genre_mean(row):
#     genres = [col for col in genre_columns if row[col] == 1]
#     if genres:
#         genre_mean = new_merged_data[new_merged_data[genres].sum(axis=1) > 0]['downloads'].mean()
#         return genre_mean
#     else:
#         return new_merged_data['downloads'].mean()

# # Apply the function to rows with missing downloads
# new_merged_data.loc[missing_downloads_mask, 'downloads'] = new_merged_data[missing_downloads_mask].apply(get_genre_mean, axis=1)

# # Check for any remaining missing values
# print(new_merged_data['downloads'].isnull().sum())

# # Verify the imputed values
# print(new_merged_data[missing_downloads_mask][['id', 'downloads']])

0
Empty DataFrame
Columns: [id, downloads]
Index: []


In [43]:
# Function to calculate KLD measures
def calculate_kld_measures(kld_list):
    kld_list = eval(kld_list)  # Convert string representation of list to actual list
    skewness_kld = skew(kld_list)
    kurtosis_kld = kurtosis(kld_list)
    cumulative_kld = np.sum(kld_list)
    rolling_mean_kld = pd.Series(kld_list).rolling(window=5, min_periods=1).mean().mean()
    
    return skewness_kld, kurtosis_kld, cumulative_kld, rolling_mean_kld

In [44]:
# Apply the function to calculate measures for each book
kld_measures = new_merged_data['kld_values'].dropna().apply(lambda x: calculate_kld_measures(x)).apply(pd.Series)
kld_measures.columns = ['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld']

In [48]:
# Concatenate the KLD measures with the merged data
new_merged_data = pd.concat([new_merged_data, kld_measures], axis=1)

print(new_merged_data.columns)

Index(['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath',
       'language', 'downloads', 'subjects', 'type', 'kld_values', 'subj2_war',
       'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance',
       'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction',
       'subj2_action', 'subj2_thriller', 'subj2_western', 'subj2_horror',
       'subj2_mystery', 'subj2_crime', 'subj2_history', 'subj2_periodicals',
       'subj2_others', 'speed', 'sentiment_avg', 'sentiment_vol', 'wordcount',
       'kld_values', 'kld_values', 'kld_values', 'kld_values'],
      dtype='object')


In [25]:
# Ensure all columns used in the regression are numeric
numeric_cols = ['downloads', 'skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 'authoryearofbirth', 'authoryearofdeath','speed','sentiment_avg', 'sentiment_vol', 'wordcount']
merged_data_clean = new_merged_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

KeyError: "['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld'] not in index"

In [10]:
# # Drop rows with any NaN values in the columns of interest
# merged_data_clean = merged_data_clean.dropna() 


# merge missing download data with mean genre download value for the book

In [11]:
# Log-transform the 'downloads' column
merged_data_clean['log_downloads'] = np.log(merged_data_clean['downloads'] + 1)  # Adding 1 to avoid log(0)

In [36]:
merged_data_clean.shape

(14478, 8)

In [12]:
#Define the dependent variable and independent variables
y = merged_data_clean['log_downloads']
X = merged_data_clean[['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 'authoryearofbirth', 'authoryearofdeath']]

In [10]:
# # Convert categorical variable 'language' to dummy variables
# language_dummies = pd.get_dummies(merged_data_clean['language'].apply(lambda x: x[0] if isinstance(x, list) else x), drop_first=True)
# X = pd.concat([X, language_dummies], axis=1)

In [13]:
# Add a constant to the independent variables
X = sm.add_constant(X)

In [14]:
# Check for non-numeric columns in X
print("Data types of X before conversion:")
print(X.dtypes)

Data types of X before conversion:
const                float64
skewness_kld         float64
kurtosis_kld         float64
cumulative_kld       float64
rolling_mean_kld     float64
authoryearofbirth    float64
authoryearofdeath    float64
dtype: object


In [15]:
# Convert all columns to numeric (if any are object type)
X = X.apply(pd.to_numeric, errors='coerce')

In [16]:
# Check for any remaining non-numeric columns
print("Data types of X after conversion:")
print(X.dtypes)

Data types of X after conversion:
const                float64
skewness_kld         float64
kurtosis_kld         float64
cumulative_kld       float64
rolling_mean_kld     float64
authoryearofbirth    float64
authoryearofdeath    float64
dtype: object


In [17]:
# Drop rows with any NaN values that may have been introduced during conversion
X = X.dropna()
# Ensure y matches the index of X after cleaning
y = y.loc[X.index]

In [18]:
# Print the first few rows of X and y to verify
print(X.head())
print(y.head())

     const  skewness_kld  kurtosis_kld  cumulative_kld  rolling_mean_kld  \
100    1.0      0.317161      0.013578       11.963364          0.246172   
102    1.0      1.397116      2.060044       13.668580          0.276907   
105    1.0      1.508529      3.563027       11.227382          0.227560   
106    1.0      1.671049      4.457829       11.352629          0.231256   
107    1.0      0.596233     -0.053075       11.220667          0.228607   

     authoryearofbirth  authoryearofdeath  
100             1564.0             1616.0  
102             1835.0             1910.0  
105             1775.0             1817.0  
106             1875.0             1950.0  
107             1840.0             1928.0  
100    8.229778
102    6.519147
105    7.929846
106    5.420535
107    6.320768
Name: log_downloads, dtype: float64


In [19]:
# Fit the regression model
model = sm.OLS(y, X).fit()

In [20]:
# Print the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          log_downloads   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     92.99
Date:                Mon, 01 Jul 2024   Prob (F-statistic):          4.80e-115
Time:                        11:55:55   Log-Likelihood:                -21745.
No. Observations:               14478   AIC:                         4.350e+04
Df Residuals:                   14471   BIC:                         4.356e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 5.2682      0.17

In [30]:
# Load the new dataset
additional_data = pd.read_csv('C://Users//riyac//Downloads//data//data//extra_controls.csv')  

# Merge with existing data
new_merged_data = pd.merge(merged_data, additional_data, on='id', how='left')

# Ensure all columns are numeric
new_merged_data = new_merged_data.apply(pd.to_numeric, errors='coerce')

print(new_merged_data.columns)


Index(['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath',
       'language', 'downloads', 'subjects', 'type', 'kld_values',
       'skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld',
       'subj2_war', 'subj2_adventure', 'subj2_comedy', 'subj2_biography',
       'subj2_romance', 'subj2_drama', 'subj2_fantasy', 'subj2_family',
       'subj2_sciencefiction', 'subj2_action', 'subj2_thriller',
       'subj2_western', 'subj2_horror', 'subj2_mystery', 'subj2_crime',
       'subj2_history', 'subj2_periodicals', 'subj2_others', 'speed',
       'sentiment_avg', 'sentiment_vol', 'wordcount'],
      dtype='object')


In [31]:
# Prepare the data
# Log-transform the 'downloads' column
new_merged_data['log_downloads'] = np.log(new_merged_data['downloads'] + 1)  # Adding 1 to avoid log(0)

X = new_merged_data[['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 
                 'authoryearofbirth', 'authoryearofdeath', 'speed', 'sentiment_avg', 
                 'sentiment_vol', 'wordcount'] + [col for col in new_merged_data.columns if col.startswith('subj2_')]]
X = sm.add_constant(X)  # Add constant term
y = new_merged_data['log_downloads']


In [33]:
# Check for inf or NaN values in X and handle them
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)

# Ensure y matches the index of X
y = y[X.index]

#impute to average value of downloads genrewise

In [34]:
# Fit the OLS regression model
model_with_additional_data = sm.OLS(y, X).fit()
print(model_with_additional_data.summary())

                            OLS Regression Results                            
Dep. Variable:          log_downloads   R-squared:                       0.167
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     54.77
Date:                Mon, 01 Jul 2024   Prob (F-statistic):          5.64e-258
Time:                        13:01:05   Log-Likelihood:                -10366.
No. Observations:                7134   AIC:                         2.079e+04
Df Residuals:                    7107   BIC:                         2.097e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    6.7752 

In [35]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform LASSO regression with cross-validation
lasso = LassoCV(cv=5).fit(X_scaled, y)
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns)

print("LASSO Coefficients:")
print(lasso_coefficients)

# Identify important predictors
important_predictors = lasso_coefficients[lasso_coefficients != 0].index.tolist()
print("Important Predictors identified by LASSO:")
print(important_predictors)


LASSO Coefficients:
const                   0.000000
skewness_kld           -0.110706
kurtosis_kld           -0.000000
cumulative_kld         -0.000000
rolling_mean_kld        0.000000
authoryearofbirth      -0.000000
authoryearofdeath      -0.132805
speed                  -0.077627
sentiment_avg          -0.141678
sentiment_vol           0.140141
wordcount               0.045047
subj2_war              -0.018928
subj2_adventure        -0.024160
subj2_comedy            0.000000
subj2_biography        -0.000000
subj2_romance           0.026423
subj2_drama             0.000000
subj2_fantasy           0.106527
subj2_family            0.000482
subj2_sciencefiction    0.136933
subj2_action           -0.000000
subj2_thriller          0.000000
subj2_western           0.000000
subj2_horror            0.103583
subj2_mystery           0.037017
subj2_crime             0.000000
subj2_history          -0.000000
subj2_periodicals       0.018021
subj2_others           -0.113777
dtype: float64
Importan

In [None]:
# *************************************************************************

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import skew, kurtosis



In [2]:
# Load datasets
metadata = pd.read_csv('C://Users//riyac//Downloads//data//data//SPGC-metadata-2018-07-18.csv')
kld_scores = pd.read_csv('C:/Users//riyac//Downloads//data//data//KLDscores.csv')



In [3]:
# Rename the 'filename' column in kld_scores to 'id' for consistency
kld_scores.rename(columns={'filename': 'id'}, inplace=True)
# Merge datasets on the 'id' column
merged_data = pd.merge(metadata, kld_scores, on='id', how='left')


In [53]:
# # Load the new dataset
# additional_data = pd.read_csv('C://Users//riyac//Downloads//data//data//extra_controls.csv')  

# # Merge with existing data
# new_merged_data = pd.merge(merged_data, additional_data, on='id', how='left')

# # Ensure all columns are numeric
# new_merged_data = new_merged_data.apply(pd.to_numeric, errors='coerce')



In [54]:
# genre_columns = [
#     'subj2_war', 'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance', 
#     'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction', 'subj2_action', 
#     'subj2_thriller', 'subj2_western', 'subj2_horror', 'subj2_mystery', 'subj2_crime', 
#     'subj2_history', 'subj2_periodicals', 'subj2_others'
# ]

# # Create a mask for the rows where 'downloads' is missing
# missing_downloads_mask = new_merged_data['downloads'].isnull()

# # Define the function to calculate the mean downloads for the genres
# def get_genre_mean(row):
#     genres = [col for col in genre_columns if row[col] == 1]
#     if genres:
#         genre_mean = new_merged_data[new_merged_data[genres].sum(axis=1) > 0]['downloads'].mean()
#         return genre_mean
#     else:
#         return new_merged_data['downloads'].mean()

# # Apply the function to rows with missing downloads
# new_merged_data.loc[missing_downloads_mask, 'downloads'] = new_merged_data[missing_downloads_mask].apply(get_genre_mean, axis=1)

# # Check for any remaining missing values
# print(new_merged_data['downloads'].isnull().sum())

# # Verify the imputed values
# print(new_merged_data[missing_downloads_mask][['id', 'downloads']])



0
       id  downloads
0     NaN  50.717194
57712 NaN  50.717194


In [4]:
# Function to calculate KLD measures
def calculate_kld_measures(kld_list):
    kld_list = eval(kld_list)  # Convert string representation of list to actual list
    skewness_kld = skew(kld_list)
    kurtosis_kld = kurtosis(kld_list)
    cumulative_kld = np.sum(kld_list)
    rolling_mean_kld = pd.Series(kld_list).rolling(window=5, min_periods=1).mean().mean()
    
    return skewness_kld, kurtosis_kld, cumulative_kld, rolling_mean_kld



In [5]:
# Apply the function to calculate measures for each book
kld_measures = merged_data['kld_values'].dropna().apply(lambda x: calculate_kld_measures(x)).apply(pd.Series)
kld_measures.columns = ['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld']



In [6]:
# Drop the repeated 'kld_values' column to avoid conflicts
merged_data.drop(columns=['kld_values'], inplace=True)



In [10]:
# Concatenate the new measures with the original DataFrame
merged_data = pd.concat([merged_data, kld_measures], axis=1)

# Display the updated DataFrame with new metrics
print(merged_data.columns)

# Drop duplicate 'kld_values' columns
merged_data = merged_data.loc[:, ~merged_data.columns.duplicated()]


Index(['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath',
       'language', 'downloads', 'subjects', 'type', 'skewness_kld',
       'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld'],
      dtype='object')


In [12]:
# Ensure all columns used in the regression are numeric
numeric_cols = ['downloads', 'skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 'authoryearofbirth', 'authoryearofdeath']
merged_data_clean = merged_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [17]:
# Load the new dataset
additional_data = pd.read_csv('C://Users//riyac//Downloads//data//data//extra_controls.csv')  

# Merge with existing data
new_merged_data = pd.merge(merged_data, additional_data, on='id', how='left')

# Ensure all columns are numeric
new_merged_data = new_merged_data.apply(pd.to_numeric, errors='coerce')

print(new_merged_data.columns)
print(new_merged_data.shape)


Index(['id', 'title', 'author', 'authoryearofbirth', 'authoryearofdeath',
       'language', 'downloads', 'subjects', 'type', 'skewness_kld',
       'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 'subj2_war',
       'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance',
       'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction',
       'subj2_action', 'subj2_thriller', 'subj2_western', 'subj2_horror',
       'subj2_mystery', 'subj2_crime', 'subj2_history', 'subj2_periodicals',
       'subj2_others', 'speed', 'sentiment_avg', 'sentiment_vol', 'wordcount'],
      dtype='object')
(57713, 35)


In [18]:
genre_columns = [
    'subj2_war', 'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance', 
    'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction', 'subj2_action', 
    'subj2_thriller', 'subj2_western', 'subj2_horror', 'subj2_mystery', 'subj2_crime', 
    'subj2_history', 'subj2_periodicals', 'subj2_others'
]

# Create a mask for the rows where 'downloads' is missing
missing_downloads_mask = new_merged_data['downloads'].isnull()

# Define the function to calculate the mean downloads for the genres
def get_genre_mean(row):
    genres = [col for col in genre_columns if row[col] == 1]
    if genres:
        genre_mean = new_merged_data[new_merged_data[genres].sum(axis=1) > 0]['downloads'].mean()
        return genre_mean
    else:
        return new_merged_data['downloads'].mean()

# Apply the function to rows with missing downloads
new_merged_data.loc[missing_downloads_mask, 'downloads'] = new_merged_data[missing_downloads_mask].apply(get_genre_mean, axis=1)

# Check for any remaining missing values
print(new_merged_data['downloads'].isnull().sum())

# Verify the imputed values
print(new_merged_data[missing_downloads_mask][['id', 'downloads']])



0
       id  downloads
0     NaN  50.717194
57712 NaN  50.717194


In [21]:
# Prepare the data
# Log-transform the 'downloads' column
new_merged_data['log_downloads'] = np.log(new_merged_data['downloads'] + 1)  # Adding 1 to avoid log(0)

X = new_merged_data[['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 
                 'authoryearofbirth', 'authoryearofdeath', 'speed', 'sentiment_avg', 
                 'sentiment_vol', 'wordcount'] + [col for col in new_merged_data.columns if col.startswith('subj2_')]]
X = sm.add_constant(X)  # Add constant term
y = new_merged_data['log_downloads']

In [22]:
# Check for inf or NaN values in X and handle them
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)

# Ensure y matches the index of X
y = y[X.index]

In [23]:
# Fit the OLS regression model
model_with_additional_data = sm.OLS(y, X).fit()
print(model_with_additional_data.summary())

                            OLS Regression Results                            
Dep. Variable:          log_downloads   R-squared:                       0.167
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     54.77
Date:                Mon, 01 Jul 2024   Prob (F-statistic):          5.64e-258
Time:                        22:11:05   Log-Likelihood:                -10366.
No. Observations:                7134   AIC:                         2.079e+04
Df Residuals:                    7107   BIC:                         2.097e+04
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    6.7752 

In [28]:
# Remove unwanted columns
columns_to_remove = ['authoryearofbirth', 'authoryearofdeath', 'wordcount','type','subj2_war', 'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance', 
    'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction', 'subj2_action', 
    'subj2_thriller', 'subj2_western', 'subj2_horror', 'subj2_mystery', 'subj2_crime', 
    'subj2_history', 'subj2_periodicals', 'subj2_others']
data_for_model = new_merged_data.drop(columns=columns_to_remove)

In [29]:
print(data_for_model.columns)

Index(['id', 'title', 'author', 'language', 'downloads', 'subjects',
       'skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld',
       'speed', 'sentiment_avg', 'sentiment_vol', 'log_downloads'],
      dtype='object')


In [36]:
X = data_for_model[['skewness_kld', 'kurtosis_kld', 'cumulative_kld', 'rolling_mean_kld', 
                 'speed', 'sentiment_avg', 'sentiment_vol']]
X = sm.add_constant(X)  # Add constant term

# Ensure y matches the index of X
y = data_for_model['log_downloads']

In [38]:
# Check for inf or NaN values in X and handle them
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)

# Ensure y matches the index of X
y = y[X.index]

In [39]:
# Fit the OLS regression model
model_with_additional_data = sm.OLS(y, X).fit()
print(model_with_additional_data.summary())

                            OLS Regression Results                            
Dep. Variable:          log_downloads   R-squared:                       0.084
Model:                            OLS   Adj. R-squared:                  0.083
Method:                 Least Squares   F-statistic:                     111.4
Date:                Mon, 01 Jul 2024   Prob (F-statistic):          6.17e-157
Time:                        22:26:39   Log-Likelihood:                -12741.
No. Observations:                8534   AIC:                         2.550e+04
Df Residuals:                    8526   BIC:                         2.555e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                4.2951      0.244  

In [40]:
#LASSO Regularization

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform LASSO regression with cross-validation
lasso = LassoCV(cv=5).fit(X_scaled, y)
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns)

print("LASSO Coefficients:")
print(lasso_coefficients)

# Identify important predictors
important_predictors = lasso_coefficients[lasso_coefficients != 0].index.tolist()
print("Important Predictors identified by LASSO:")
print(important_predictors)


LASSO Coefficients:
const               0.000000
skewness_kld       -0.114902
kurtosis_kld       -0.000000
cumulative_kld     -0.000000
rolling_mean_kld    0.007440
speed              -0.101239
sentiment_avg      -0.203645
sentiment_vol       0.202774
dtype: float64
Important Predictors identified by LASSO:
['skewness_kld', 'rolling_mean_kld', 'speed', 'sentiment_avg', 'sentiment_vol']
