In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#association rules
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

#clustering
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from IPython.display import display

## 1. Checking whether there is any relationship between information about superheroes and superpowers they have

### Data preparation

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(root, file_name)
        print(full_path)


In [None]:
# Read the hero information table and the hero powers table, in that order.
heroes_info, super_powers = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/superhero-set/heroes_information.csv',
        '/kaggle/input/superhero-set/super_hero_powers.csv',
    )
)

In [None]:
# Inspect the column names of the hero information table.
heroes_info.columns

In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index that was written into
# the CSV - TODO confirm). errors='ignore' keeps the cell re-runnable: a second
# execution would otherwise raise KeyError because the column is already gone.
heroes_info = heroes_info.drop(columns='Unnamed: 0', errors='ignore')
heroes_info.head()

In [None]:
# Preview the first rows of the powers table.
super_powers.head()

In [None]:
# Change the column name so that it can be used as id for datasets merge
# Rename the first column of the powers table to 'name' so that it can serve as
# the join key when merging with heroes_info.
# rename() builds a new column index instead of mutating the array behind the
# existing Index in place (an Index is meant to be immutable).
super_powers = super_powers.rename(columns={super_powers.columns[0]: 'name'})

In [None]:
# Encoding of nulls and surely incorrect data
# Encode placeholders as proper missing values:
#   * -99.0 in Weight / Height
#   * '-' anywhere in the table
# The results are assigned back instead of calling replace(..., inplace=True) on
# a column selection: that form operates on a possibly temporary Series and
# leaves heroes_info untouched under pandas Copy-on-Write.
measure_cols = ['Weight', 'Height']
heroes_info[measure_cols] = heroes_info[measure_cols].replace(-99.0, np.nan)
heroes_info = heroes_info.replace('-', np.nan)

In [None]:
# Summarise dtypes and non-null counts per column to see how much data is missing.
heroes_info.info()

Variable *Skin color* is deleted, as it contains too many null values.

In [None]:
# Remove the sparsely populated 'Skin color' column.
# errors='ignore' keeps the cell re-runnable (a second run would otherwise
# raise KeyError because the column no longer exists).
heroes_info = heroes_info.drop(columns='Skin color', errors='ignore')

In [None]:
# Join hero information with the powers table on the shared 'name' column
# (inner join, which is the pandas default), then summarise the result.
data = heroes_info.merge(super_powers, how='inner', on='name')
data.info()

In [None]:
# Number of missing values in Weight and Height (printed in that order).
for measure in ('Weight', 'Height'):
    print(data[measure].isna().sum())

#### Fill in the missing data on height and weight - done based on the mean value of race and gender of the specific hero

In [None]:
# Fill missing Height / Weight with the mean of the hero's (Race, Gender) group.
group_keys = ['Race', 'Gender']
for measure in ('Height', 'Weight'):
    group_mean = data.groupby(group_keys)[measure].transform('mean')
    data[measure] = data[measure].fillna(group_mean)

In [None]:
# Re-check how many Weight and Height values are still missing after the fill.
for measure in ('Weight', 'Height'):
    print(data[measure].isna().sum())

In [None]:
# Show the first rows whose Height is still missing.
data[data['Height'].isna()].head()
# The same check for Weight (disabled): data[data['Weight'].isna()]

In [None]:
# Disabled diagnostic: per-column non-null counts restricted to rows that have
# fewer than 3 missing values.
# data[data.isnull().sum(axis=1) < 3].count()

Missing data still exists and, as can be seen above, most of the affected heroes lack a large part of their information. With that in mind, I decided to drop those rows and analyse only the heroes with complete information.

In [None]:
# Keep only the heroes for which every column is populated.
data = data.dropna(axis=0, how='any')

In [None]:
# Preview the table after dropping rows with missing values.
data.head()

In [None]:
# Summarise the remaining rows and column dtypes.
data.info()

#### Converting categorical variables to binary ones

In [None]:
# One-hot encode the categorical hero attributes. drop_first=True removes one
# level per variable to avoid perfectly collinear indicator columns.
# dtype=bool makes the indicators boolean directly. The previous frame-wide
# replace(0 -> False) / replace(1 -> True) would also have rewritten any Height
# or Weight value equal to 0 or 1.
categorical_cols = ['Gender', 'Eye color', 'Race',
                    'Hair color', 'Publisher', 'Alignment']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True, dtype=bool)

In [None]:
# Preview the table after one-hot encoding.
data.head()

### Pearson Correlation

In [None]:
# Absolute Pearson correlation between all numeric / boolean columns.
# The string column 'name' must be excluded explicitly: since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
numeric_data = data.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_data.corr().abs()

# Hide the upper triangle and the diagonal so that every pair appears only once
# in tri_df (as row = later column, column = earlier column).
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

In [None]:
# Display the absolute correlation matrix.
corr_matrix

In [None]:
# Names of the superpower columns in the merged table.
# They are taken from the powers table rather than from a hard-coded slice
# (previously data.columns[3:168]), which silently goes stale when the number
# of power columns differs from what the slice assumes.
# Relies on super_powers still being the powers DataFrame loaded above.
_power_names = set(super_powers.columns) - {'name'}
power_cols = [col for col in data.columns if col in _power_names]

In [None]:
# Print every (hero attribute, superpower) pair whose absolute correlation
# exceeds 0.5, strongest first.
# * Series.items() replaces iteritems(), which was removed in pandas 2.0.
# * dropna() discards the masked-out cells explicitly, so the result does not
#   depend on whether stack() drops NaN itself.
strong_pairs = (
    corr_matrix[tri_df > 0.5]
    .stack()
    .dropna()
    .sort_values(ascending=False)
)
for idx, x in strong_pairs.items():
    # idx is (row label, column label); keep pairs where the column is a power
    # and the row is not, i.e. attribute-vs-power pairs only.
    if idx[1] in power_cols and idx[0] not in power_cols:
        print(idx, x)

As you can see above, correlations between the information about a superhero and their superpowers do exist - this is especially visible when it comes to the race of the hero.

## 2. Can we cluster the superheroes based on their superpowers only?

In [None]:
# Reload the raw powers table. Note: this rebinds `data`, replacing the merged
# hero-information frame used in section 1.
data = pd.read_csv('/kaggle/input/superhero-set/super_hero_powers.csv')

In [None]:
# Preview the reloaded powers table.
data.head()

### Association rules induction

In [None]:
# Mine itemsets of superpowers that occur together in at least 5% of heroes.
# The hero name column is not an item and is removed first.
frequent_itemsets = apriori(
    data.drop(columns=['hero_names']),
    min_support=0.05,
    use_colnames=True,
)

In [None]:
# Derive association rules from the frequent itemsets, keeping rules whose
# lift is at least 1.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1,
)

#### Sorting by  *confidence*

In [None]:
# First five rules when ordered by confidence, highest first.
rules.sort_values(by='confidence', ascending=False).head()

#### Sorting by *lift*

In [None]:
# First five rules when ordered by lift, highest first.
rules.sort_values(by='lift', ascending=False).head()

#### Sorting by *leverage*

In [None]:
# First five rules when ordered by leverage, highest first.
rules.sort_values(by='leverage', ascending=False).head()

#### Conclusion

The number of superheroes is relatively small compared to the number of superpowers, which makes the *support* of some superpowers, and therefore of the itemsets containing them, very low. If we treat the available data as only a sample and want to generalize to the entire population, this makes it difficult to draw meaningful conclusions. This can be seen, for example, in the *leverage* column, whose values are very low (around 0.05), while the same rules have a very high *lift*, a measure that promotes strong connections even when support is small.

However, assuming that the data you have is the entire superhero population, you can draw many conclusions with 100% certainty - for example, many sets of superpowers clearly indicate the simultaneous occurrence of *Super Strength*, and if you sort the rules by the *lift* coefficient, you can see that it has a high value for different sets of superpowers.

To sum up, association rules confirm the dependencies in the occurrence of individual superpowers, but due to the properties of the data (a large number of parameters, a small number of observations), I believe that they should be viewed with a certain distance.

### Correlation

In [None]:
# Pearson correlation between superpower columns only.
# 'hero_names' is a string column and makes DataFrame.corr() raise in
# pandas >= 2.0. 'Cluster' and 'powers_count' are helper columns added further
# down; they are excluded (if present) so that re-running this cell still
# yields a power-vs-power matrix.
corr = data.drop(
    columns=['hero_names', 'Cluster', 'powers_count'], errors='ignore'
).corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap='Blues', center=0, linewidths=.1)
plt.title('Correlation between superpowers')
plt.show()

In the above graph, darker areas can be seen in spots, which indicates the correlation of some superpowers, but due to their number, it is unreadable and does not contain all the variables.

In [None]:
# 20 most positively correlated pairs of distinct superpowers.
# Reuses `corr` from the heatmap cell instead of recomputing data.corr().
# Only cells strictly above the diagonal are kept, so every pair is listed once
# and self-correlations are excluded. The previous drop_duplicates() removed
# entries by *value*, discarding distinct pairs that share a coefficient.
# dropna() removes the masked cells and undefined correlations.
pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr.where(pair_mask).unstack().dropna().sort_values(ascending=False).head(20)

In [None]:
# Five lowest (most negative) correlations between pairs of distinct superpowers.
# Reuses `corr` from the heatmap cell. Only cells strictly above the diagonal
# are kept so every pair is listed once; dropna() removes the masked cells and
# undefined correlations, which would otherwise be sorted to the end and show
# up in tail().
pair_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr.where(pair_mask).unstack().dropna().sort_values(ascending=False).tail()

The study of the correlation value shows that a large proportion of the superpowers are positively correlated to some extent. In the case of several superpowers visible above, this is a strong correlation, but moderate and low relationships are much more common. Negative correlations are very weak or absent.

I believe that correlation analysis, like the analysis of association rules, confirms the existence of dependencies and supports the premise that we are able to group superheroes according to their superpowers.

### Clustering

In [None]:
# Elbow method: fit KMeans for k = 1..59 and record the inertia (sum of squared
# distances of samples to their closest cluster centre) for each k.
# * random_state makes the curve reproducible across runs.
# * n_init is set explicitly so the result does not depend on the scikit-learn
#   version's default.
# * Only power columns are used as features; helper columns added further down
#   ('Cluster', 'powers_count') are excluded so the cell can be re-run safely.
power_features = data.drop(
    columns=['hero_names', 'Cluster', 'powers_count'], errors='ignore'
)

squared_dist_sum = []
k_list = range(1, 60)
for k in k_list:
    km_model = KMeans(n_clusters=k, n_init=10, random_state=1)
    km_model = km_model.fit(power_features)
    squared_dist_sum.append(km_model.inertia_)

In [None]:
# Elbow curve: inertia as a function of the number of clusters k.
# (The y-axis label is Polish for "sum of distances".)
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(k_list, squared_dist_sum)
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Suma odległości')
plt.show()

The *Elbow method* does not clearly indicate the best number of clusters - you cannot see a clear bend point in the graph, therefore I decided to choose 16 as the number of clusters. Should a problem indicate the need for more homogeneous groups, the number of clusters should be increased.


In [None]:
# Final model with 16 clusters.
# random_state pins the initialisation: cluster ids are otherwise arbitrary and
# change from run to run, which breaks any analysis that refers to a specific
# cluster number. n_init is set explicitly so the result does not depend on the
# scikit-learn version's default.
# Helper columns added further down are excluded so the cell can be re-run.
km_model = KMeans(n_clusters=16, n_init=10, random_state=1)
km_model = km_model.fit(
    data.drop(columns=['hero_names', 'Cluster', 'powers_count'], errors='ignore')
)

In [None]:
# Label every hero with its cluster and preview the members of cluster 0.
clusters = km_model.predict(data.drop(columns=['hero_names']))
data['Cluster'] = clusters
data.loc[data['Cluster'] == 0].head()

#### t-SNE visualisation

In [None]:
# 2-D t-SNE embedding of the heroes, coloured by their KMeans cluster.
# The embedding must be computed from the power columns only. 'Cluster' was
# added to `data` above; feeding it to t-SNE leaks the label being visualised
# into the distances and makes the clusters look better separated than they are.
# 'powers_count' is excluded as well in case the cell is re-run later.
# random_state on the estimator replaces the global np.random.seed(1).
tsne_clusters = TSNE(perplexity=27, verbose=1, random_state=1)
tsne_results = tsne_clusters.fit_transform(
    data.drop(columns=['hero_names', 'Cluster', 'powers_count'], errors='ignore')
)

plt.figure(figsize=(16, 16))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=km_model.labels_, cmap='plasma')
plt.title('t-SNE embedding of superheroes coloured by KMeans cluster')
plt.show()

Size of individual groups.

In [None]:
# Share of heroes per cluster, shown as a pie chart with rounded percentages.
cluster_sizes = data['Cluster'].value_counts()
cluster_sizes.plot(kind='pie', figsize=(10, 10), autopct='%1.f%%')

For every cluster:
- 5 representatives
- 5 most common skills + percentage of occurrence

In [None]:
# For every cluster print up to five member names and the five most frequent
# superpowers together with the share of members that have them.
# Changes compared with the previous version:
#   * iterates over the cluster labels actually present instead of assuming
#     they are exactly 0..n-1;
#   * uses len() for the member count (count()[0] relied on positional
#     indexing of a label-indexed Series, which pandas has deprecated);
#   * no longer rebinds `super_powers`, which is the powers DataFrame loaded in
#     section 1;
#   * ignores the helper column 'powers_count' if it already exists.
for cluster_id in sorted(data['Cluster'].unique()):
    print("Cluster no: {}".format(cluster_id))

    group_members = data[data['Cluster'] == cluster_id]
    group_members_count = len(group_members)

    print("Representatives: ", end=' ')
    # head(5) yields fewer names automatically for clusters smaller than five.
    for hero_name in group_members['hero_names'].head(5):
        print(hero_name, end='; ')
    print('')

    # Number of members having each power; sorted() is stable, so powers with
    # equal counts keep their column order.
    power_counts = (
        group_members
        .drop(columns=['hero_names', 'Cluster', 'powers_count'], errors='ignore')
        .sum()
        .to_dict()
    )
    ranked_powers = sorted(power_counts.items(), key=lambda item: item[1], reverse=True)

    print('Most common skills with percentage of occurance:')
    for power_name, holders in ranked_powers[:5]:
        print(power_name + ' --> ' + format((holders / group_members_count), '.0%'))
    print('##############################################################################')

You can see from the above printout that most of the groups have been correctly identified on the basis of the dominant superpowers of their members. Many groups were distinguished very precisely.

Group 2 definitely stands out from the other groups: it contains 27% of the superheroes, namely those who could not be classified into any of the other groups.

#### Cluster 2

In [None]:
# Number of superpowers per hero = row-wise sum over the power columns only.
# The previous data.sum(axis=1) also added the integer 'Cluster' label to every
# count, and it raises on the string column 'hero_names' in pandas >= 2.0.
# 'powers_count' itself is excluded so that re-running the cell does not
# double-count.
data['powers_count'] = data.drop(
    columns=['hero_names', 'Cluster', 'powers_count'], errors='ignore'
).sum(axis=1)

In [None]:
# Average number of powers in the largest cluster.
# The cluster is looked up by size rather than by the hard-coded id 2, because
# KMeans cluster ids depend on the initialisation and are not stable between
# runs (assumes the cluster of interest is the largest one, as the surrounding
# text describes).
largest_cluster = data['Cluster'].value_counts().idxmax()
data.loc[data['Cluster'] == largest_cluster, 'powers_count'].mean()

In [None]:
# Average number of powers across all heroes outside the largest cluster.
# The cluster is looked up by size rather than by the hard-coded id 2, because
# KMeans cluster ids depend on the initialisation and are not stable between
# runs (assumes the cluster of interest is the largest one, as the surrounding
# text describes).
largest_cluster = data['Cluster'].value_counts().idxmax()
data.loc[data['Cluster'] != largest_cluster, 'powers_count'].mean()

As you can see, on average, superheroes from group 2 have fewer than 5 superpowers, while for the rest the average is over 17 superpowers. This was probably the main reason behind the formation of such a large group of rather dissimilar superheroes.