# Exploratory Data Analyses
Here I explore the text data we cleaned in Harvard_data. 

In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
# Load the cleaned dataset; the first CSV column becomes the row index.
df = pd.read_csv('data/clean_df.csv', index_col=0)


In [30]:
# Keep only the columns in which fewer than half of the rows are missing
# (a column that is exactly 50% missing is dropped as well).
null_counts = df.isna().sum()
half_rows = len(df) * 0.5
df = df.loc[:, (null_counts < half_rows).to_numpy()]

In [36]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [41]:
# Drop the remaining information we don't need by keeping an explicit column list.
keep_cols = [
    'id', 'objectnumber', 'title', 'description', 'people',
    'colorcount', 'colors', 'datebegin', 'dateend', 'century',
    'medium', 'culture', 'department', 'division',
]
df = df[keep_cols]

# Cultural differences in description
First, let's see whether the words used to describe paintings from different cultures are distinctive.

In [55]:
#df.culture.value_counts('culture')

To keep enough data within each bin, I'll combine some of these cultures into larger groups.


In [53]:
# Collapse the individual cultures into a handful of broader groups.
# np.select takes the label of the first condition that is True for a row;
# rows matching no condition get the default label 'others'.
european_cultures = ['Minoan', 'German', 'British', 'Roman', 'French',
                     'Spanish', 'Flemish?', 'European',
                     'Dutch', 'Greek', 'Irish', 'English']
cnd = [
    df.culture.isin(['Korean', 'Japanese', 'Chinese', 'Tibetan', 'Thai']),
    # Any culture mentioning "Italian" counts as European. na=False makes a
    # missing culture evaluate to False, so the mask is a plain boolean Series
    # rather than an object Series containing NaN.
    df.culture.str.contains("Italian", na=False) | df.culture.isin(european_cultures),
    df.culture.isin(['American', 'Canadian']),
    df.culture.isin(['Indian', 'Mughal']),
]
vals = ['East_Asian', 'European', 'North_American', 'Indian']
df['macro_culture'] = np.select(cnd, vals, default='others')

In [57]:
# Share of rows falling in each macro-culture group (relative frequencies).
df.macro_culture.value_counts(normalize=True)

East_Asian        0.341113
Indian            0.314183
North_American    0.195691
European          0.086176
others            0.062837
Name: macro_culture, dtype: float64

## Word Frequencies
I'll create a few functions to count the word frequencies and to plot out a word cloud.

In [None]:
from wordcloud import WordCloud

In [None]:
def count_words(list_):
    """
    Count how often each keyword occurs, most frequent first.

    input: list of preprocessed strings (refer to data preprocessing)
    output: dict mapping each word to its number of occurrences, ordered by
        descending count; words with equal counts keep the order in which
        they first appear in the keyword list.

    NOTE(review): `set_keywords` is not defined in this part of the notebook;
    it is assumed to turn `list_` into an iterable of hashable words -- confirm.
    """
    wordlist = set_keywords(list_)
    # Counter tallies in a single pass; most_common() sorts by count
    # (descending) with a stable sort, so the result is deterministic.
    return dict(Counter(wordlist).most_common())
    

In [None]:
# Keep only the rows whose description contains 'abstract'.
# NOTE(review): `clean_df` is not defined in the visible cells (the frame
# loaded above is named `df`) -- presumably it comes from preprocessing; confirm.
has_abstract = clean_df.description.apply(lambda tokens: 'abstract' in tokens)
abstract_df = clean_df[has_abstract]

In [None]:
# Flatten the per-row token lists into one flat list of words.
# A nested comprehension is linear in the total number of words, whereas
# sum(lists, []) re-copies the growing accumulator for every row.
# NOTE(review): assumes every description is a list of tokens -- confirm.
words = [word for tokens in abstract_df.description for word in tokens]
unique = set(words)
# Tally how many times each word occurs.
counts = dict(Counter(words))

In [None]:
# Re-order the tallies from the most to the least frequent word.
counts = dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))