In [2]:
import pandas as pd
import ast

In [4]:
# Load in three subsets
subset_1 = pd.read_csv("data/fmc_subsets/subset_1.csv")
print(subset_1.shape)

subset_2 = pd.read_csv("data/fmc_subsets/subset_2.csv")
print(subset_2.shape)

subset_3 = pd.read_csv("data/fmc_subsets/subset_3.csv")
print(subset_3.shape)

(3050, 4)
(10950, 4)
(4004, 4)


In [8]:
# Concatenate subsets to one big data set
df_fmc_all = pd.concat([subset_1, subset_2, subset_3], ignore_index=True)
df_fmc_all

Unnamed: 0,title,author,answer,citation
0,Jackie Me Baseball Card Adventure 2,"Gutman, Dan","[0, """"]",['https://dangutman.com/dans-books/baseball-ca...
1,Honus Me A Baseball Card Adventure 1,"Gutman, Dan","[0, """"]",['https://www.goodreads.com/book/show/1000164....
2,Wildflower Hill,"Freeman, Kimberley","[1, ""Beattie Blaxland""]",['https://www.goodreads.com/book/show/10002296...
3,Oracle Night,"Auster, Paul","[1, ""Grace""]","['https://en.wikipedia.org/wiki/Oracle_Night',..."
4,As a Driven Leaf,"Steinberg, Milton","[0, """"]",['https://reformjudaism.org/driven-leaf-milton...
...,...,...,...,...
17999,Truly Madly Deeply,"Kazi, Faraaz","[1, ""Seema""]",['https://www.goodreads.com/book/show/9996645'...
18000,Sloop of War Richard Bolitho 6,"Kent, Alexander","[0, """"]",['https://www.goodreads.com/book/show/999733.S...
18001,Purification Autumn 3,"Moody, David","[0, """"]",['http://everythingalyce.blogspot.com/2016/04/...
18002,The American Heiress,"Goodwin, Daisy","[1, ""Cora Cash""]",['https://www.bookishwayfarer.com/blog/review-...


In [None]:
# Remove broken row
print(df_fmc_all["answer"].iloc[11449])
df_fmc_all = df_fmc_all.drop(index=11449).reset_index(drop=True)
print(df_fmc_all.shape)

(18003, 4)


In [None]:
# Parse answers
df_fmc_all["answer"] = df_fmc_all["answer"].apply(ast.literal_eval) # reponses column is string, convert to list

In [18]:
# Remove 3 invalid rose ('title' included several books so answers included several fmcs)

# Check which rows are NOT lists with exactly 2 items
invalid_rows_mask = df_fmc_all["answer"].apply(lambda x: not (isinstance(x, list) and len(x) == 2))

# View those rows
invalid_rows = df_fmc_all[invalid_rows_mask]

# Print or inspect
print(invalid_rows)

# Keeop only data where answer list has exactly 2 elements
df_fmc_all = df_fmc_all[df_fmc_all["answer"].apply(lambda x: isinstance(x, list) and len(x) == 2)]
print(df_fmc_all.shape)

                                                  title         author  \
3589  Looking for Alaska  An Abundance of Katherines...    Green, John   
4883                                       Auros paadas   Gary, Romain   
8485  Foundation  Foundation and Empire  Second Foun...  Asimov, Isaac   

                                                 answer  \
3589  [[1, Alaska Young], [0, ], [1, Margot Roth Spi...   
4883                                                [1]   
8485  [[0, ], [1, Bayta Darell], [1, Arcadia Darell]...   

                                               citation  
3589  ['https://www.sparknotes.com/lit/looking-for-a...  
4883                                                 []  
8485  ['http://phylobotanist.blogspot.com/2015/09/is...  
(18000, 4)


In [19]:
# Unnest answer column
df_fmc_all[["fmc_present", "fmc_name"]] = pd.DataFrame(df_fmc_all["answer"].tolist(), index=df_fmc_all.index)
df_fmc_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fmc_all[["fmc_present", "fmc_name"]] = pd.DataFrame(df_fmc_all["answer"].tolist(), index=df_fmc_all.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fmc_all[["fmc_present", "fmc_name"]] = pd.DataFrame(df_fmc_all["answer"].tolist(), index=df_fmc_all.index)


Unnamed: 0,title,author,answer,citation,fmc_present,fmc_name
0,Jackie Me Baseball Card Adventure 2,"Gutman, Dan","[0, ]",['https://dangutman.com/dans-books/baseball-ca...,0,
1,Honus Me A Baseball Card Adventure 1,"Gutman, Dan","[0, ]",['https://www.goodreads.com/book/show/1000164....,0,
2,Wildflower Hill,"Freeman, Kimberley","[1, Beattie Blaxland]",['https://www.goodreads.com/book/show/10002296...,1,Beattie Blaxland
3,Oracle Night,"Auster, Paul","[1, Grace]","['https://en.wikipedia.org/wiki/Oracle_Night',...",1,Grace
4,As a Driven Leaf,"Steinberg, Milton","[0, ]",['https://reformjudaism.org/driven-leaf-milton...,0,
...,...,...,...,...,...,...
17998,Truly Madly Deeply,"Kazi, Faraaz","[1, Seema]",['https://www.goodreads.com/book/show/9996645'...,1,Seema
17999,Sloop of War Richard Bolitho 6,"Kent, Alexander","[0, ]",['https://www.goodreads.com/book/show/999733.S...,0,
18000,Purification Autumn 3,"Moody, David","[0, ]",['http://everythingalyce.blogspot.com/2016/04/...,0,
18001,The American Heiress,"Goodwin, Daisy","[1, Cora Cash]",['https://www.bookishwayfarer.com/blog/review-...,1,Cora Cash


In [None]:
# Keep rows where fmc_name is not an empty string
df_fmc_final = df_fmc_all[df_fmc_all["fmc_name"].str.strip() != ""]
print(df_fmc_final.shape)
df_fmc_final

(14645, 6)

In [21]:
# Save data frame with fmcs only as csv file
df_fmc_final.to_csv('data/df_fmc_final.csv', index = False)