In this notebook I practice some plotnine techniques. Maybe you will learn something from it too.

In [None]:
import pandas as pd
import numpy as np
from plotnine import *

In [None]:
# Read the two CSV tables of the museum-collection dataset into DataFrames.
artists, artworks = map(
    pd.read_csv,
    (
        '../input/museum-collection/artists.csv',
        '../input/museum-collection/artworks.csv',
    ),
)

In [None]:
# Unify the lowercase spelling 'male' with 'Male'; every other value (NaN included) is kept as is.
artists['Gender'] = artists['Gender'].mask(artists['Gender'] == 'male', 'Male')

In [None]:
# Print, for each table, the percentage of missing values in every column.
separator = '| | | | | | |'
for heading, frame in (('Artists lost data:', artists), ('Artworks lost data:', artworks)):
    if frame is artworks:
        # Three divider lines between the two reports.
        print('\n'.join([separator] * 3))
    print(heading)
    row_count = frame.shape[0]
    for column in frame.columns:
        print(column, f'{100*frame[column].isnull().sum()/row_count:.2f}%')


## Let's work with the artists table

In [None]:
# Preview the first five rows of the artists table.
artists.head(n=5)

In [None]:
# Age = number of years between the birth year and the death year.
artists['Age'] = artists['Death Year'].sub(artists['Birth Year'])

In [None]:
# Scatter plot of Age against Birth Year, using only rows without any missing value.
artists_no_missing = artists.dropna()
(
    ggplot(artists_no_missing, aes(x='Birth Year', y='Age'))
    + geom_point()
    + ggtitle('The dependence of age on birth year')
)

Some people lived more than 100 years. Who are they?

In [None]:
# The five largest ages among entries that reached 100 years or more.
artists.query('Age >= 100').sort_values('Age', ascending=False).iloc[:5]

We can see that the Union Paper Bag Machine Company existed for 130 years. After it come artists who also lived long lives.

What if we want to see the same plot colored by the top nationalities?

In [None]:
# Names of the five most frequent nationalities among the complete rows.
top_nationalities = artists_no_missing['Nationality'].value_counts().head(5).index.tolist()

In [None]:
# Restrict to artists of the top nationalities and colour the scatter by nationality.
is_top_nationality = artists_no_missing['Nationality'].isin(top_nationalities)
artists_nationalities = artists_no_missing[is_top_nationality]
(
    ggplot(artists_nationalities, aes(x='Birth Year', y='Age', color='Nationality'))
    + geom_point()
    + ggtitle('Dependence of age on nationality')
)

In [None]:
# facet_wrap splits a single plot into a grid of panels, one per level of the
# faceting variable. Here: Age vs Birth Year, with one panel per nationality.
(
    ggplot(artists_nationalities, aes(x='Birth Year', y='Age', color='Nationality'))
    + geom_point()
    + facet_wrap('~Nationality')
    # Title typo ('depdnence') fixed; this string is rendered on the figure.
    + ggtitle('Age and Birth year dependence on nationality')
)

In [None]:
# Age vs Birth Year again, now coloured by gender and split into one panel per gender.
(
    ggplot(artists_nationalities, aes(x='Birth Year', y='Age', color='Gender'))
    + geom_point()
    + facet_wrap('~Gender')
    + ggtitle('Age gender dependence')
)

### We can look at which ages are most common for each gender. We will need to select the top countries again for that.

In [None]:
# Complete rows whose nationality is one of the top nationalities.
artists_nationalities_new = artists_no_missing.loc[
    artists_no_missing['Nationality'].isin(top_nationalities)
]

In [None]:
# Stacked bar chart: number of artists per gender, bars filled by nationality.
(
    ggplot(artists_nationalities_new, aes(x='Gender', fill='Nationality'))
    + geom_bar()
    + ggtitle('Proportion of male and female artists based on nationality')
)

In [None]:
# Violin plot of the Age distribution for each gender.
(
    ggplot(artists_nationalities_new, aes(x='Gender', y='Age', fill='Gender'))
    + geom_violin()
    + ggtitle('Most found age of artists')
)

In [None]:
# Number of artists at each Age value, one panel per gender.
(
    ggplot(artists_no_missing, aes(x='Age', fill='Gender'))
    + geom_bar()
    + facet_wrap('~Gender')
    + ggtitle('Male and Female Counts')
)

In [None]:
# Mean Age of the complete rows, printed for 'Female' first and 'Male' second.
for gender in ('Female', 'Male'):
    print(artists_no_missing.loc[artists_no_missing['Gender'] == gender, 'Age'].mean())

#### As we can see, the average age of female artists is 3.03 years higher than that of male artists.

## Artworks set

In [None]:
# Keep a snapshot of the column names, then show the (rows, columns) size of the table.
artwork_columns = artworks.columns.tolist()
artworks.shape

In [None]:
# Percentage of missing values in every artworks column.
total_rows = len(artworks)
for column in artworks.columns:
    missing_share = 100*artworks[column].isnull().sum()/total_rows
    print(column, f'{missing_share:.2f}%')

Six features are missing more than 91% of their values. Let's drop them.

In [None]:
# Drop the six sparsely populated measurement columns.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# makes a second execution of this cell a no-op instead of a KeyError.
sparse_columns = [
    'Diameter (cm)', 'Circumference (cm)', 'Length (cm)',
    'Depth (cm)', 'Weight (kg)', 'Duration (s)',
]
artworks = artworks.drop(columns=sparse_columns, errors='ignore')

In [None]:
# Number of artworks per Classification value (missing values are not counted).
artworks['Classification'].value_counts(dropna=True)

In [None]:
# Approach borrowed from Lj Miranda:
# https://www.kaggle.com/ljvmiranda/dimensions-of-2d-artworks-in-moma
# Three Classification values (Photograph, Painting, Drawing) are selected and
# stacked into a single table whose characteristics are plotted below.
artworks_photo, artworks_painting, artworks_drawing = (
    artworks.loc[artworks['Classification'].eq(kind)]
    for kind in ('Photograph', 'Painting', 'Drawing')
)

all_2D = [artworks_photo, artworks_painting, artworks_drawing]
artworks_2D = pd.concat(all_2D, axis=0, ignore_index=True)

In [None]:
# Keep the title, class and the two dimensions, shorten the dimension names,
# and discard rows where any of these four fields is missing.
artworks_2D = (
    artworks_2D
    .loc[:, ['Title', 'Classification', 'Height (cm)', 'Width (cm)']]
    .rename(columns={'Height (cm)': 'Height', 'Width (cm)': 'Width'})
    .dropna()
)

In [None]:
# Sanity checks before the log transform. A cell renders only its last
# expression, so the intermediate checks are printed explicitly instead of
# being evaluated and silently discarded.
print('Rows with Height == 0:', artworks_2D[artworks_2D['Height'] == 0].shape[0])
print('Minimum Height:', artworks_2D['Height'].min())
print('Missing values per column:')
print(artworks_2D.isnull().sum())
artworks_2D.head()

In [None]:
# Remove rows with a zero Width or Height (log10 of 0 is undefined).
artworks_2D = artworks_2D[artworks_2D['Width'].ne(0) & artworks_2D['Height'].ne(0)]

In [None]:
# Height and Width span a broad range, so both are put on a log10 scale.
width = np.log10(artworks_2D['Width'])
# NOTE(review): a Width of exactly 1 gives log10 == 0 and therefore an infinite
# or undefined ratio - confirm the data contains no such rows before clustering.
ratio = np.log10(artworks_2D['Height']) / width
# assign() returns a new frame; inserting columns in place into a frame that came
# out of boolean filtering triggers pandas' SettingWithCopyWarning.
artworks_2D = artworks_2D.assign(log_ratio=ratio, log_width=width)
# Values of the log-ratio for a 4 x 3 and a 3 x 4 format (used as guide lines in the plot).
four_thirds = np.log10(4) / np.log10(3)
three_fourths = np.log10(3) / np.log10(4)

In [None]:
# log_ratio against log_width, one panel per Classification, with horizontal
# guide lines at 1 (black) and at the two reference ratios (purple).
(
    ggplot(artworks_2D, aes(x='log_width', y='log_ratio', color='Classification'))
    + geom_point(show_legend=True)
    + facet_wrap('~Classification')
    + geom_hline(yintercept=1, color='black')
    + geom_hline(yintercept=three_fourths, color='purple')
    + geom_hline(yintercept=four_thirds, color='purple')
    + xlim(0.5, 3.5)  # the warning shown comes from these axis limits
    + ylim(0, 5)
    + ggtitle('Dimension of 2D artworks')
)

## Clustering

#### Elbow method

In [None]:
# Code is taken from here 
#https://medium.com/analytics-vidhya/how-to-determine-the-optimal-k-for-k-means-708505d204eb

In [None]:
from sklearn.cluster import KMeans

# function returns WSS score for k values from 1 to kmax
def calculate_WSS(points, kmax, random_state=None):
    """Return the within-cluster sum of squares (WSS) for k = 1 .. kmax.

    One KMeans model is fitted per k and its WSS is read from the fitted
    model's ``inertia_`` attribute (sum of squared distances of the samples
    to their closest cluster centre). Unlike a hand-written loop over the
    first two columns, this covers every feature of ``points``.

    Parameters
    ----------
    points : array-like of shape (n_samples, n_features)
        Observations to cluster.
    kmax : int
        Largest number of clusters to try; values below 1 yield an empty list.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``KMeans``. Pass an int to make the scores reproducible.

    Returns
    -------
    list of float
        ``result[k - 1]`` is the WSS obtained with ``k`` clusters.
    """
    sse = []
    for k in range(1, kmax + 1):
        kmeans = KMeans(n_clusters=k, random_state=random_state).fit(points)
        sse.append(float(kmeans.inertia_))
    return sse

In [None]:
# WSS for k = 1 .. 15 on the two log-scaled features.
print(calculate_WSS(artworks_2D[['log_width', 'log_ratio']].to_numpy(), 15))

In [None]:
# WSS values for k = 1 .. 15 (presumably pasted from the output of the previous cell).
# Stored as a flat list: the former extra pair of brackets produced a list of
# length 1, which cannot be paired element-wise with the 15 values in k.
wss = [
    5771.555393946746, 3768.0482355463746, 1853.333445240101,
    1255.4784780958505, 929.4386468336834, 726.9918020824192, 601.4803151716745,
    526.726248942173, 452.2677930553403, 401.4337739897589, 368.03253374178723,
    336.26386585668473, 309.026796026544, 285.7220886081334, 264.3795627457794,
]
# One k per WSS value, starting at k = 1.
k = list(range(1, len(wss) + 1))

In [None]:
# Line plot of WSS against the number of clusters.
(
    ggplot(mapping=aes(x=k, y=wss))
    + geom_line()
    + ggtitle('Elbow method')
)

#### As the elbow is not clearly visible, we'll also use the silhouette method

In [None]:
from sklearn.metrics import silhouette_score

sil = []
kmax = 10
x = np.array(artworks_2D[['log_width','log_ratio']])
# The silhouette score is undefined for a single cluster, so start at 2 clusters.
# The loop variable is deliberately not called `k`: that name already holds the
# list of cluster counts used by the elbow plot and must not be overwritten.
for n_clusters in range(2, kmax + 1):
    # Fixed seed so that the scores are the same on every run.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x)
    labels = kmeans.labels_
    sil.append(silhouette_score(x, labels, metric='euclidean'))

In [None]:
# Silhouette score against the number of clusters. The x values are derived from
# kmax (the bound used when computing `sil`) instead of a hard-coded range, so
# the two cannot drift apart.
(
    ggplot()
    + aes(list(range(2, kmax + 1)), sil)
    + geom_line()
    + ggtitle('Silhouette method')
)

### Prediction

In [None]:
# Final model with 3 clusters; the seed keeps the cluster numbering stable between runs.
kmeans = KMeans(n_clusters=3, random_state=42)

# Fit the model and get the cluster index of every row of x.
label = kmeans.fit_predict(x)

# assign() builds a new frame, avoiding SettingWithCopyWarning on the filtered table.
artworks_2D = artworks_2D.assign(label=label)

In [None]:
#Here need to finish the plot of kmeans

In [None]:
# Scatter of the two clustering features, coloured by k-means cluster.
# The previous matplotlib version could not run: `plt` is never imported and a
# DataFrame cannot be indexed as frame[mask, 0]. plotnine is already imported.
# factor() makes plotnine treat the integer labels as discrete categories.
(
    ggplot(artworks_2D, aes(x='log_width', y='log_ratio', color='factor(label)'))
    + geom_point()
    + labs(color='Cluster')
    + ggtitle('K-means clusters of 2D artworks')
)