# Analysis: Level of Abstraction

Using Wordnet, I will calculate the average level of abstraction of each description (estimated by the number of hypernyms of each word), and see if this changes across different time period or different culture.

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
pd.options.display.max_columns = None
%matplotlib inline

In [2]:
df = pd.read_pickle("pickles/cleaned_df.pkl")

# Cleaning: macro culture

Adding a new column to categorize the culture

In [12]:
cnd = [
    df.culture.isin(['Korean', 'Japanese', 'Chinese', 'Tibetan', 'Thai']),
    (df.culture.str.contains("Italian") | df.culture.isin(['Minoan', 'German', 'British', 'Roman', 'French', 
                                                           'Spanish', 'Flemish?', 'European', 
                                                          'Dutch', 'Greek', 'Irish', 'English', 'Russian'])),
    df.culture.isin(['American', 'Canadian']), 
    df.culture.isin(['Indian', 'Mughal'])]
vals = ['East_Asian', 'European', 'North_American', 'South_Asian']
df['macro_culture'] = np.select(cnd, vals, default='others')

In [13]:
df.macro_culture.value_counts('macro_culture')

South_Asian       0.318565
East_Asian        0.308017
North_American    0.225738
European          0.105485
others            0.042194
Name: macro_culture, dtype: float64

# Cleaning: periods
Cleaning up time periods

In [14]:
# dealing with '0' values in date end
cnd = [
    ((df.dateend == 0) & (df.century.isin(["19th-20th century", "20th century"]))),
    ((df.dateend == 0) & (df.century.isin(["19th century", "18th-19th century"]))),
    ((df.dateend == 0) & (df.century.isin(["18th century"]))),
    ((df.dateend == 0) & (df.century.isin(["17th-16th century BCE", "17th century", "16th-17th century"]))),
    ((df.dateend == 0) & (df.century.isin(["16th century"]))),
    ((df.dateend == 0) & (df.century.isin(["15th century"]))), 
    ((df.dateend == 0) & (df.century.isin(["2nd century CE"])))]
vals = [1900, 1800, 1700, 1600, 1500, 1500, 200]
df['date'] = np.select(cnd, vals, default=df.dateend)

In [23]:
# re-binning for each time period
bins = [-2000, 1500, 1700, 1800, 1860, 1900, 2000, 2020]
df['date_bins'] = pd.cut(df['date'], bins, labels = ["Pre_15th", "16th-17th", "18th", "19th_1st", "19th_2nd", "20th", "21st"])

In [26]:
df.date_bins.value_counts('date_bins')

19th_2nd     0.221519
20th         0.217300
16th-17th    0.175105
18th         0.154008
19th_1st     0.118143
Pre_15th     0.073840
21st         0.040084
Name: date_bins, dtype: float64

# Hypenym Level
Calculating the level of hypernym for each unique word using Wordnet corpus

## Hypernym count data
First, making a dictionary with the number of hypernym

In [42]:
allwords = set([x for sub in df.description for x in sub]) # all unique value of nested list
len(allwords)

4385

In [43]:
from nltk.corpus import wordnet as wn

In [206]:
def hypernym (x):
    return x.hypernyms()

def find_hyp_level(word):
    skipped = {}
    n1 = wn.synsets(word, pos=wn.NOUN) # noun only
    if len(n1) == 0 : # if the word doesn't exist in Wordnet, return -1        
        skipped[word] = wn.morphy(word) # skipped words and its morphed form
        return np.nan
    
    elif len(n1) == 1 : # if there's only one synset, take the length of all hypernym
        return len(list(n1[0].closure(hypernym)))
    
    else : # if there's more than one synset, take an average of all options 
        min_ = min([len(list(x.closure(hypernym))) for x in n1])
        if min_ == 0: # if 0, it's likely they are proper nouns, therefore most concrete
            return -1
        else: return min_


In [207]:
testwords = ['apple', 'fruit', 'food', 'matter', 'chocolate', 'cake', 'coin', 
             'cloud', 'happiness', 'sky', 'joy', 'mountain', 'landscape', 'alps', 'abstraction', 'light']
{k : find_hyp_level(k) for k in testwords}

{'apple': 13,
 'fruit': 7,
 'food': 4,
 'matter': 2,
 'chocolate': 5,
 'cake': 6,
 'coin': 8,
 'cloud': 3,
 'happiness': 5,
 'sky': 6,
 'joy': 6,
 'mountain': 5,
 'landscape': 7,
 'alps': -1,
 'abstraction': 1,
 'light': 5}

In [327]:
hyp_dict = {k : find_hyp_level(k) for k in allwords}

In [328]:
hyp_df = pd.DataFrame.from_dict(hyp_dict, 'index', columns = ['value']) # for easy data manipulation

In [329]:
hyp_df = hyp_df.replace(-1, max(hyp_df.value)+1)
#hyp_df.reset_index(inplace = True)
#hyp_df.rename(columns = {'index': 'words'}, inplace = True)
#hyp_df = hyp_df[hyp_df.value != -1]

In [330]:
def hyp_score(list_):
    return np.nansum([hyp_df.loc[word] for word in list_])/len(list_)
    #return np.nansum([hyp_df.value[hyp_df.words == word] for word in list_])/len(list_)

In [331]:
df['hyp_score'] = df.description.apply(lambda x: hyp_score(x))

### Limitations / future direction
Currently I'm just averaging all occurrence in WordNet, this can be problematic for words that have multiple meanings especially if the abstraction level of each meaning is far apart. I also used only nouns, since different parts of speech have different level of hyper/hyponyms to gauge level of categorization accurately. In this process, some verbs that take same form as the noun cannot be filtered, possibly adding noise to our data.

# Average abstraction per culture
Hypothetically the level of hypernym is on average less for East Asian culture.

In [332]:
df.groupby('macro_culture').hyp_score.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
macro_culture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
East_Asian,146.0,5.323014,1.194164,2.0,4.740439,5.319338,5.896259,10.142857
European,42.0,5.445204,1.770812,1.666667,4.37766,5.45,6.4,9.5
North_American,100.0,5.71129,2.721985,0.0,3.875,5.565034,7.145833,13.0
South_Asian,151.0,5.933817,0.936742,3.254237,5.375,5.784615,6.583333,8.171429
others,20.0,4.957926,1.271037,2.75,4.370629,4.701389,5.583333,7.142857


In [333]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats as sts

In [334]:
df_cl = df.dropna(subset = ['hyp_score', 'macro_culture'], axis = 0)
lm = ols('hyp_score ~ C(macro_culture)', data=df_cl).fit()

sm.stats.anova_lm(lm, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
C(macro_culture),38.511221,4.0,3.550301,0.00725
Residual,1231.169797,454.0,,


In [335]:
tukey_ = sts.multicomp.pairwise_tukeyhsd(df_cl.hyp_score, df_cl.macro_culture)
tukey_._results_table

group1,group2,meandiff,p-adj,lower,upper,reject
East_Asian,European,0.1222,0.9,-0.6675,0.9119,False
East_Asian,North_American,0.3883,0.366,-0.1972,0.9737,False
East_Asian,South_Asian,0.6108,0.0129,0.0873,1.1343,True
East_Asian,others,-0.3651,0.8774,-1.4404,0.7103,False
European,North_American,0.2661,0.9,-0.5632,1.0954,False
European,South_Asian,0.4886,0.4361,-0.2982,1.2754,False
European,others,-0.4873,0.7872,-1.7126,0.738,False
North_American,South_Asian,0.2225,0.8104,-0.359,0.804,False
North_American,others,-0.7534,0.3365,-1.8581,0.3514,False
South_Asian,others,-0.9759,0.0945,-2.0491,0.0973,False


# Average abstraction per time period
Hypothetically the level of hypernym should get much lower around the industrial revolution and on when the abstract art started to happen.

In [336]:
df.groupby('date_bins').hyp_score.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date_bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Pre_15th,34.0,4.982248,1.622381,1.666667,3.97,5.28908,5.91315,7.666667
16th-17th,78.0,5.440267,1.305792,2.0,4.853821,5.463186,5.970486,10.142857
18th,73.0,5.772908,1.394005,1.5,4.980392,5.626263,6.823529,9.5
19th_1st,54.0,6.0017,1.184926,3.666667,5.372262,6.061947,6.622712,8.357143
19th_2nd,104.0,5.538377,1.964474,0.0,4.5,5.481959,6.690382,13.0
20th,97.0,5.742986,2.028446,1.5,4.5,5.5,6.387097,11.666667
21st,19.0,5.254755,0.888468,3.826087,4.990253,5.386256,5.521021,7.956522


In [337]:
df_cl = df.dropna(subset = ['hyp_score', 'date_bins'], axis = 0)
lm = ols('hyp_score ~ C(date_bins)', data=df_cl).fit()

sm.stats.anova_lm(lm, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
C(date_bins),30.497311,6.0,1.854014,0.087213
Residual,1239.183707,452.0,,


In [338]:
tukey_2 = sts.multicomp.pairwise_tukeyhsd(df_cl.hyp_score, df_cl.date_bins)
tukey_2._results_table

group1,group2,meandiff,p-adj,lower,upper,reject
16th-17th,18th,0.3326,0.8733,-0.4659,1.1312,False
16th-17th,19th_1st,0.5614,0.4725,-0.3067,1.4295,False
16th-17th,19th_2nd,0.0981,0.9,-0.6364,0.8326,False
16th-17th,20th,0.3027,0.8918,-0.4431,1.0485,False
16th-17th,21st,-0.1855,0.9,-1.4401,1.0691,False
16th-17th,Pre_15th,-0.458,0.8075,-1.4658,0.5497,False
18th,19th_1st,0.2288,0.9,-0.6514,1.109,False
18th,19th_2nd,-0.2345,0.9,-0.9833,0.5142,False
18th,20th,-0.0299,0.9,-0.7897,0.7299,False
18th,21st,-0.5182,0.8842,-1.7811,0.7448,False


# Check
Since we don't yet have enough data to infer the level of abstraction of each artwork, group artworks based on the average level of hypernyms and sample a few from each group and look at the images.