### This notebook is going to apply ANN, SVM and ANOVA on the tags_all.csv file, which is the result of Google Cloud Vision tagging out dataset.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

Read csv files and preprocess irregular data.

In [2]:
tags_df=pd.DataFrame.from_csv('/Users/mindachen/PycharmProjects/7390_Final_Project/tags_all.csv')
# fill nan with space and merge several classes to maintain genre bias
tags_df=tags_df.fillna(' ').replace('punk',value='rock').replace('metal',value='rock').replace('soul',value='rock')

# redefine indexes
tags_df.index=list(range(7620))
tags_df.head()

Unnamed: 0,Label0,Label1,Label2,Label3,Label4,Label5,Label6,Label7,Label8,Label9,...,possibility0,possibility1,possibility2,possibility3,possibility4,possibility5,possibility6,possibility7,possibility8,possibility9
0,text,yellow,font,graphic design,logo,product,design,line,illustration,graphics,...,0.955804,0.944085,0.889693,0.771218,0.748817,0.741762,0.69811,0.667353,0.64845,0.633883
1,face,child,nose,infant,eating,cheek,chin,mouth,lip,toddler,...,0.956208,0.929703,0.9149,0.884643,0.865238,0.863425,0.783846,0.762918,0.738106,0.686199
2,yellow,font,circle,graphics,computer wallpaper,graphic design,,,,,...,0.954908,0.6848,0.540338,0.534659,0.514166,0.505993,,,,
3,black,darkness,pianist,silhouette,string instrument,musical instrument,backlighting,musical instrument accessory,angle,midnight,...,0.953919,0.896526,0.806206,0.775262,0.70309,0.635771,0.619506,0.591932,0.550074,0.54217
4,girl,product,,,,,,,,,...,0.570585,0.508661,,,,,,,,


Collect all unique tags

In [4]:
tags_set=set()

label_count=0
for idx,row in tags_df.iterrows():
    for item in row.iloc[0:10].iteritems():  
        tags_set.add(item[1])
        if isinstance(item[1],str):           
            label_count+=1

tags_set=list(tags_set)
print('Total labels(unique): ', len(tags_set), '\nOverlapping rate: ', (label_count-len(tags_set))/label_count)

Total labels(unique):  1984 
Overlapping rate:  0.9739632545931759


Format the labels so that they don't contain any spaces, which may confuse the anova models.

In [5]:
tags_formatted=[]

for tag in tags_set:
    tags_formatted.append(str.join('_',tag.split()))

Label encode and one-hot encode tags

In [6]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

le=LabelEncoder()
tags_encoded = le.fit_transform(tags_set)
ohe=OneHotEncoder()
ohe.fit(np.expand_dims(tags_encoded,axis=1))

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

Encode all 'tag' columns in the dataframe

In [55]:
encoded=[]

for idx,row in tags_df.iterrows():
    # label encode each row
    _=le.transform(row[:10])
    # one-hot encode each row
    _=ohe.transform(np.expand_dims(_,axis=1)).toarray()
    # merge the encoded tags into a 1-d vector
    _=np.array(_).sum(axis=0)
    encoded.append(_)

# drop nan column, remain 1984 tags
encoded=pd.DataFrame(encoded, columns=tags_formatted)
# we set all values in tag ' ' to zero
encoded.iloc[:,0]=0
print(encoded.shape)

(7620, 1984)


One-hot encode genres.

In [84]:
le_genre=LabelEncoder()
ohe_genre=OneHotEncoder()
genres=tags_df['genre']
genres_leEncoded=le_genre.fit_transform(genres)
genres_oheEncoded=ohe_genre.fit_transform(np.expand_dims(genres_leEncoded,axis=1))

Concatenate encoded labels, genres and filenames

In [85]:
genre_encoded=pd.DataFrame(genres_oheEncoded.toarray(),
                           columns=[i+'_genre' for i in le_genre.inverse_transform(range(7))])

encoded_withLabel=pd.concat([encoded,
                             genre_encoded,
                             tags_df['file_name']],
                            axis=1)

  if diff:


In [26]:
from sklearn.model_selection import train_test_split

# split train set and test set
X_train, X_test, y_train, y_test=train_test_split(encoded,genre_encoded,test_size=0.3)

In [49]:
import keras

model=keras.Sequential()
model.add(keras.layers.Dense(input_shape=[1984],units=256,activation=keras.activations.relu))
model.add(keras.layers.Dense(1024,activation=keras.activations.relu))
model.add(keras.layers.Dropout(0.4))
model.add(keras.layers.Dense(4096,activation=keras.activations.relu))
model.add(keras.layers.Dense(1024,activation=keras.activations.relu))
model.add(keras.layers.Dropout(0.4))
model.add(keras.layers.Dense(256,activation=keras.activations.relu))
model.add(keras.layers.Dense(7,activation=keras.activations.softmax))

model.compile(optimizer=keras.optimizers.Adadelta(lr=0.001, epsilon=1e-05),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_55 (Dense)             (None, 256)               508160    
_________________________________________________________________
dense_56 (Dense)             (None, 1024)              263168    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_57 (Dense)             (None, 4096)              4198400   
_________________________________________________________________
dense_58 (Dense)             (None, 1024)              4195328   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_59 (Dense)             (None, 256)               262400    
__________

In [50]:
model.fit(X_train, y_train, 
          batch_size=512,
          epochs=10,
          validation_data=[X_test,y_test])

Train on 5334 samples, validate on 2286 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1245979e8>

Apply SVM on tags

In [61]:
from sklearn import svm
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(encoded,genres_leEncoded,test_size=0.3)

clf=svm.SVC()

clf.fit(X=X_train, y=y_train)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Calculate the accuracy of SVM model

In [62]:
from sklearn.metrics import accuracy_score

predicitons=clf.predict(X_test)
acc_svm=accuracy_score(y_true=y_test,y_pred=predicitons)
print(acc_svm)

0.39195100612423445


Apply ANOVA on the tags

In [86]:
from scipy import stats

# create an empty dic for p values
p_list={}

for genre in encoded_withLabel.columns[1984:1991]:
    # in each genre
    temp={}
    for tag in encoded_withLabel.columns[:1984]:
        # in each tag
        # cut out the slice we need from 'encoded_withLabel'
        slice=pd.concat([encoded_withLabel[tag], encoded_withLabel[genre]],axis=1)
        
        # use one-way ANOVA to analyze if containing certain tag has significant influence on certain genre
        F, p = stats.f_oneway(slice[slice[tag]==0],
                              slice[slice[tag]==1])
        temp[tag]=p[1]
    p_list[genre]=temp

p_values_df=pd.DataFrame(p_list)

  ssbn += _square_of_sums(a - offset) / float(len(a))
  f = msb / msw


In [87]:
p_values_df.head()

Unnamed: 0,ambient_genre,dubstep_genre,folk_genre,hiphop_rap_genre,jazz_genre,pop_genre,rock_genre
,0.742224,0.740377,0.73653,0.731464,0.73435,0.742594,0.216531
3d_modeling,0.742224,0.740377,0.002954,0.731464,0.73435,0.742594,0.418465
abbey,0.742224,0.740377,0.73653,0.731464,0.73435,0.742594,0.216531
abdomen,0.641802,0.057482,0.634183,0.62742,0.631271,0.054761,0.25249
academic_certificate,0.510566,0.507429,0.500919,0.492398,0.33487,0.511196,0.147208


Define a filter function

In [88]:
def tag_filter(genre, p_threshold=0.05, min_count=10):
    global p_values_df
    # filter out tags that have p value lower than threshold, sorted by p value
    slice=p_values_df[genre]
    tags_mega=slice[slice<p_threshold].sort_values(ascending=True).index.tolist()
    
    # filter out tags that rarely appear
    slice=encoded_withLabel[tags_mega]   
    tags_mega=slice.columns[slice.sum()>min_count].tolist()
    return tags_mega

Now we can, in some way, answer the question  ---

Which are the most 'influential' tags on certain genre? It's really interesting.

(Tags are sorted by p value in ascending order)

In [109]:
results=pd.DataFrame([])

for genre in genres.unique():
    results=pd.concat([results,pd.DataFrame(tag_filter(genre+'_genre',
                                                       p_threshold=0.01, 
                                                       min_count=30),
                                            columns=[genre])],
              axis=1)

results=results.transpose()
results.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,77
pop,outdoor_shoe,geological_phenomenon,goggles,road,battleship,mangaka,nose,love,musical_theatre,astronaut,...,,,,,,,,,,
rock,kitten,larch,sky,automotive_exterior,tower,bicycle_part,red_meat,prophet,prunus,liquid,...,,,,,,,,,,
hiphop_rap,clock,sliced_bread,electronic_device,number,kitten,postage_stamp,mellophone,living_room,tartan,boxing_glove,...,,,,,,,,,,
ambient,sofa_bed,cupboard,violone,supersonic_transport,kitten,room,estate,fedora,dreadlocks,mellophone,...,liquid_bubble,church,romance,desert,green_algae,trumpet,screen,seahorse,pancake,number
folk,viol,mangaka,car_subwoofer,kitten,metropolis,bulldog,cliff,male,musk_deer,flock,...,,,,,,,,,,


Save the result.

In [110]:
results.to_csv('/Users/mindachen/ANOVA_result.csv',header=True)

# License
This project is licensed under the MIT License - see the file [LICENSE.md](https://github.com/qiuminzhang/discogs_scrapy/blob/master/LICENSE) for details

# Citation
This project uses licensed open source python framework Scrapy - see the file [LICENSE.md](https://github.com/scrapy/scrapy/blob/master/LICENSE) for details.

For keras built-in Inception_v3 model, please refer to this https://keras.io/applications/#inceptionv3