# Cosine Similarity

### Set up

In [1]:
import lib.database_module as db
import lib.encoding_module as en
import lib.download_mine as down
import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
import unidecode
import string

## Cosine Similarity, based on the full text of the category.

`category_text` is results of a SQL query to return a category_id and the text all pages in that category.  The results are split into a training and test set.  The training set is merged into a single category vector.  The test set is left as individual pages.  The training sets are used to generate a vectorizor and the vectors of each category are returned for comparison.

In [2]:
category_text = db.fetch_all_category_pages('local')

train_text, test_text = train_test_split(category_text,test_size = .15)

cate_train_dict = {}
for p in train_text:
    cate_train_dict[p[0]] = cate_train_dict.get(p[0],p[1])+ ' '+p[1]   

cate_text_dict = {}
for k, text in tqdm.tqdm_notebook(cate_train_dict.items()):
    cate_text_dict[k] = en.clean_page_text(text)

categories, texts = zip(*cate_text_dict.items())

cat_vectorizer, cat_vectors = en.build_vectorizer(texts)




In [3]:
len(train_text)

4029

In [4]:
def cos_similarity_compare(page_vector, cate_vectors, category_list):
    cos = cosine_similarity(cate_vectors,page_vector)
    best = cos.argmax()
    match1 = category_list[best]
    cos = np.delete(cos,best)
    category_list = np.delete(category_list,best)
    penultimate = cos.argmax()
    match2 = category_list[penultimate]
    return match1, match2

In [5]:
def test_cos_similarity(cate_text_list, vectorizer, category_vectors, categories):
    temp_categories= []
    temp_texts=[]
    temp_results = []
    for category, text in tqdm.tqdm_notebook(cate_text_list):
        temp_categories.append(category)
        temp_texts.append(en.clean_page_text(text))

    text_vectors = vectorizer.transform(temp_texts)
    
    test_vectors = zip (temp_categories, text_vectors)

    for t in tqdm.tqdm_notebook(test_vectors):
        pred, pred2 = cos_similarity_compare(t[1].reshape(1,-1),
                                      category_vectors,
                                      np.array(categories))
        actual = t[0]

    #    print actual, pred, pred2
        if pred == actual:# or pred2 == actual: #Include 2 best matches
            temp_results.append(int(1))
        else:
            temp_results.append(int(0))

    accuracy = float(sum(temp_results))/len(cate_text_list)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [6]:
print "train Score", test_cos_similarity(train_text,cat_vectorizer,cat_vectors,categories)
print "Test Score", test_cos_similarity(test_text,cat_vectorizer,cat_vectors,categories)

train Score

 Accuracy: 94.12%
None
Test Score

 Accuracy: 92.28%
None


In [7]:
Subcategories_dict = {'Sandwiches':'American sandwiches',
                'desserts':'cookies',
                'chemistry':'Industrial gases',
                'physics':'physicists',
                'sports cars': 'muscle cars',
                'psychology':'Popular psychology',
                'Arcade games':'Cancelled arcade games',
                'machine learning':'Classification algorithms',
                'cat breeds':'Natural cat breeds',
                'Earth sciences':'Geochemistry',
                'Breads':'Bread dishes',
                'Automotive technologies':'Vehicle dynamics',
                'belief':'Ignorance',
                'hygiene':'Ritual purification',
                'sports terminology':'martial arts terminology',
                'shoes':'Sneaker culture',
                'influenza':'Influenza researchers',
                'Children':'Sons of Odin',
                'physical quantities':'Density',
                'Physical exercise':'hydrotherapy',
                'health care':'medical robotics'
                }

In [8]:
Subcategory_number_dict = {}

for k,v in tqdm.tqdm_notebook(Subcategories_dict.items()):
    parent = down.wikipedia_get_page(k, category=True)
    child =  down.wikipedia_get_page(v, category=True)
    Subcategory_number_dict[v] ={'parent_name':k,
                                 'parent_id':int(parent.keys()[0]) , 
                                 'category_id': int(child.keys()[0]),
                                 'pages' :{p['pageid']: down.get_text(p['pageid']) 
                                           for p in down.wikipedia_get_pages_for_category(v) 
                                           if p['ns'] == 0}}

    




In [9]:
# down.get_text(k)
validation_text_list = []
for k, v in Subcategory_number_dict.items():
    for key,text in v['pages'].items():
        validation_text_list.append([v['parent_id'], en.clean_page_text(text)])
print "Validation Score", test_cos_similarity(validation_text_list,cat_vectorizer,cat_vectors,categories)

Validation Score

 Accuracy: 72.31%
None


In [None]:
validation_text_list[3]


In [10]:
X_train_temp = [en.clean_page_text(t[1]) for t in tqdm.tqdm_notebook(train_text)]
y_train_tf = [t[0] for t in tqdm.tqdm_notebook(train_text)]
X_test_temp = [en.clean_page_text(t[1]) for t in tqdm.tqdm_notebook(test_text)]
y_test_tf = [t[0] for t in tqdm.tqdm_notebook(test_text)]
X_val_temp = [en.clean_page_text(t[1]) for t in tqdm.tqdm_notebook(validation_text_list)]
y_val_tf = [t[0] for t in tqdm.tqdm_notebook(validation_text_list)]









In [11]:
vectorizer, X_tf_train = en.build_tfidf_vectorizer(X_train_temp)

In [12]:
X_tf_test = vectorizer.transform(X_test_temp)
X_tf_validation = vectorizer.transform(X_val_temp)

In [None]:
len(vectorizer.get_feature_names())

131072/2



In [18]:
import sklearn.neural_network as nn

In [None]:
mlpc = nn.MLPClassifier(verbose =3, 
                        max_iter=2000, 
                        activation = 'logistic',
                        solver='adam',
                        hidden_layer_sizes=(131072,256) ,
                        tol=.001)
                    
mlpc.fit(X_tf_train, y_train_tf)

In [1]:
print "Training Set Accuracy: {:.1%}".format(mlpc.score(X_tf_train,y_train_tf))
print "Testing Set Accuracy: {:.1%}".format(mlpc.score(X_tf_test,y_test_tf))
print "Validation Set Accuracy: {:.1%}".format(mlpc.score(X_tf_validation,y_val_tf))

NameError: name 'mlpc' is not defined