# 2020-02-05

Validating the similarity between a search text and a comment is largely subjective. Nevertheless, I will use manual (subjective) labelling here to obtain an estimate of the accuracy of the model.

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import pandas as pd
import numpy as np
import spacy
import json
import openpyxl

In [2]:
# Root of the local project checkout; every data path below is built from it.
insight_dir = 'C:\\Users\\Ronald Maj\\Documents\\GitHub\\InsightDataProject\\'

# Load the three cleaned tables (channels, videos, comments). The first CSV
# column of each file becomes the frame's index.
chan_info_df, vids_df, comms_df = (
    pd.read_csv(insight_dir + rel_path, index_col=0)
    for rel_path in (
        'data\\cleaned\\all_channels_info.csv',
        'data\\cleaned\\all_videos_dup_na_clean_df.csv',
        'data\\cleaned\\all_comments_dup_na_clean_df.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# spaCy pipeline whose document vectors are used to embed the search terms.
nlp = spacy.load("en_core_web_lg")

# Pre-computed matrix that search vectors are compared against.
# NOTE(review): presumably one row per row of comms_df, in the same order
# (the similarity scores are assigned straight onto comms_df) - confirm.
# Built from insight_dir so the project root is configured in one place only.
database_mat = np.load(insight_dir + 'data\\cleaned\\comms_mat_all.npy')

# Turning large numbers into 'human' form
def human_format(num):
    """Abbreviate a number to three significant figures with a K/M/B/T suffix.

    Examples: 42 -> '42', 1234 -> '1.23K', 1500000 -> '1.5M'.

    Parameters
    ----------
    num : int or float
        The value to format.

    Returns
    -------
    str
        The abbreviated number. Values with a magnitude of 1e15 or more keep
        the largest suffix ('T'), e.g. 1e15 -> '1000T', instead of indexing
        past the end of the suffix table.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']

    # Round to 3 significant figures *before* scaling so that a value such as
    # 999999 rolls over to '1M'.
    num = float('{:.3g}'.format(num))

    magnitude = 0
    # Stop at the last available suffix; anything larger stays expressed in 'T'.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0

    # '{:f}' avoids scientific notation; drop trailing zeros, then a dangling '.'.
    digits = '{:f}'.format(num).rstrip('0').rstrip('.')
    return digits + suffixes[magnitude]

In [4]:
# Change the function slightly from before in order to choose the number of comments displayed:

from itertools import islice

def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list (fewer if it runs out)."""
    first_n = islice(iterable, n)
    return [item for item in first_n]

def get_comment_channel_results(search_term, num_comms, num_disp):
    """Find the comments most similar to ``search_term`` and summarise their channels.

    The search term is embedded with the module-level ``nlp`` pipeline and
    compared (cosine similarity) against every row of ``database_mat``. The
    ``num_comms`` best-scoring rows of ``comms_df`` are joined to ``vids_df``
    (on ``VidID``) and then to ``chan_info_df`` (on ``ChannelID``), keeping one
    row per ``CommID``.

    Side effect: the scores are written to ``comms_df['sim_score']``,
    overwriting whatever a previous call left there.

    Parameters
    ----------
    search_term : str
        Free text to search for.
    num_comms : int
        Number of top-scoring comments to keep before joining.
    num_disp : int
        Maximum number of channels and of comments to describe in the output.

    Returns
    -------
    chan_dict_list : list
        Element 0 is the human-formatted number of distinct channels among the
        results. It is followed by one dict per channel for the ``num_disp``
        channels with the largest summed similarity score. Per-video ratios in
        that dict come from the channel's best-scoring result row. The
        'Comment Fraction' entry holds the (human-formatted) summed similarity
        score of the channel.
    comm_dict_list : list
        Element 0 is the number of result comments. It is followed by one dict
        per comment for the first ``num_disp`` comments, in descending order of
        similarity.
    """
    # Embed the query; cosine_similarity needs a 2-D (1, n_features) array.
    search_vec = nlp(search_term).vector.reshape(1, -1)
    sim_vec = cosine_similarity(search_vec, database_mat)

    # NOTE: this writes into the module-level comms_df (kept as-is so that any
    # code reading comms_df['sim_score'] after a call keeps working).
    # Assumes database_mat has exactly one row per row of comms_df.
    comms_df['sim_score'] = sim_vec[0]
    rel_comms_df = comms_df.sort_values(by='sim_score', ascending=False).head(num_comms)

    comment_cols = [
        'CommID',
        'authorChannelUrl',
        'authorDisplayName',
        'authorProfileImageUrl',
        'parentId',
        'publishedAt',
        'textDisplay',
        'videoId',
        'sim_score',
    ]

    results_info_df = (
        rel_comms_df[comment_cols]
        # The video key is called 'VidID' in vids_df.
        .rename(columns={'videoId': 'VidID'})
        .merge(right=vids_df, how='left', on='VidID', suffixes=('_comm', '_vid'))
        .drop_duplicates(subset='CommID')
        # Make sure the comment timestamp carries the '_comm' suffix even when
        # vids_df had no clashing column, so the channel merge cannot relabel it.
        .rename(columns={'publishedAt': 'publishedAt_comm'})
        .merge(right=chan_info_df, how='left', on='ChannelID', suffixes=('_vid', '_chan'))
        .drop_duplicates(subset='CommID')
        # drop_duplicates leaves gaps in the index; renumber so that labels and
        # positions agree again.
        .reset_index(drop=True)
    )

    # Summed similarity per channel, best channel first. groupby leaves out
    # rows without a ChannelID (comments whose video is not in vids_df).
    channel_scores = (
        results_info_df.groupby('ChannelID')['sim_score']
        .sum()
        .sort_values(ascending=False)
    )
    top_chans = take(num_disp, channel_scores.items())

    chan_dict_list = [human_format(len(channel_scores))]

    for chan_id, score_sum in top_chans:
        # Rows are still in descending similarity order, so the first row of a
        # channel is its best-scoring comment (and that comment's video).
        chan_result = results_info_df[results_info_df['ChannelID'] == chan_id].iloc[0]

        # The thumbnails value appears to be a dict rendered with single
        # quotes; swapping the quote character makes it parseable as JSON.
        # NOTE(review): this breaks if any value contains an apostrophe.
        thumb_dict = json.loads(chan_result['thumbnails'].replace("'", '"'))

        chan_dict_list.append({
            'Channel Name': chan_result['title'],
            'Chan_url': 'https://www.youtube.com/channel/' + chan_id,
            'Comment Fraction': human_format(score_sum),
            'No. Subscribers': human_format(chan_result['subscriberCount']),
            'No. Views': human_format(chan_result['viewCount_chan']),
            'Likes/Views (on video)': round(
                chan_result['likeCount'] / chan_result['viewCount_vid'], 5),
            'Comments/Views (on video)': round(
                chan_result['commentCount_vid'] / chan_result['viewCount_vid'], 5),
            'Chan Profile Pic': thumb_dict['default']['url'],
        })

    comm_dict_list = [len(results_info_df)]

    # Walk the first num_disp rows directly. The previous version fed index
    # labels to .iloc, which picked the wrong rows (or raised IndexError)
    # whenever drop_duplicates had removed a row.
    for _, comm in results_info_df.head(num_disp).iterrows():
        comm_dict_list.append({
            'Name': comm['authorDisplayName'],
            'Comment': comm['textDisplay'],
            'Sim Score': round(comm['sim_score'], 2),
            'Profile Pic': comm['authorProfileImageUrl'],
            'Vid_url': 'https://www.youtube.com/watch?v=' + comm['VidID'],
            'Vid_title': comm['VidTitle'],
            'Channel Name': comm['title'],
            'Chan_url': 'https://www.youtube.com/channel/' + comm['ChannelID'],
        })

    return chan_dict_list, comm_dict_list

The plan is to pick a number of topics, produce the list of most similar comments for each one, and decide by hand whether each comment is or is not related to the given input text.

The percentage of related comments gives an indication of the accuracy of the methodology.

In [5]:
# Multi-word brand search terms used for the first validation run.
brands = ['apple technology','canon camera','tesla car','nivea','burger king']
# Emotion words that get paired with every brand to form longer phrases.
emotions = ['interested','excited','blessed','happy','comfortable']

In [6]:
# Every "<brand> <emotion>" phrase, grouped by brand.
# NOTE(review): this list is only displayed; the validation loop further down
# searches on `brands` alone.
brand_emotions = [brand + ' ' + emotion for brand in brands for emotion in emotions]

In [7]:
brand_emotions

['apple technology interested',
 'apple technology excited',
 'apple technology blessed',
 'apple technology happy',
 'apple technology comfortable',
 'canon camera interested',
 'canon camera excited',
 'canon camera blessed',
 'canon camera happy',
 'canon camera comfortable',
 'tesla car interested',
 'tesla car excited',
 'tesla car blessed',
 'tesla car happy',
 'tesla car comfortable',
 'nivea interested',
 'nivea excited',
 'nivea blessed',
 'nivea happy',
 'nivea comfortable',
 'burger king interested',
 'burger king excited',
 'burger king blessed',
 'burger king happy',
 'burger king comfortable']

In [8]:
# Columns of the validation table; 'Actually similar?' is left empty here and
# filled in by hand later.
validation_columns = ['Brand', 'Comment', 'Sim_score', 'Vid', 'Chan', 'Actually similar?']
brands_comments_df = pd.DataFrame(columns=validation_columns)

In [9]:
num_comms = 25  # top-scoring comments fetched per search term
num_disp = 20   # comments per search term kept for manual labelling

# Collect plain dicts and build the frame once at the end. Growing a DataFrame
# row by row with DataFrame.append is quadratic, and the method was removed in
# pandas 2.0. Building from scratch also makes the cell safe to re-run: it no
# longer appends a second copy of every row.
validation_records = []
for brand in brands:
    search_term = brand
    chan_dict_list, comm_dict_list = get_comment_channel_results(search_term, num_comms, num_disp)
    # comm_dict_list[0] is the total comment count; the per-comment dicts follow.
    for comm in comm_dict_list[1:]:
        validation_records.append({
            'Brand': brand,
            'Comment': comm['Comment'],
            'Sim_score': comm['Sim Score'],
            'Vid': comm['Vid_title'],
            'Chan': comm['Channel Name'],
            'Actually similar?': None,  # to be filled in by hand
        })

brands_comments_df = pd.DataFrame(
    validation_records,
    columns=['Brand', 'Comment', 'Sim_score', 'Vid', 'Chan', 'Actually similar?'],
)

In [10]:
brands_comments_df

Unnamed: 0,Brand,Comment,Sim_score,Vid,Chan,Actually similar?
0,apple technology,Apple,0.79,LAST TO LAUGH CHALLENGE!!,Alexa Rivera,
1,apple technology,Apple 🍎 🤣,0.79,APPLE product for the first time 😍,Dimpu's Vlogs,
2,apple technology,And Apple,0.78,The Mountains Won't Remember Me,Peter McKinnon,
3,apple technology,APPLE APPLE\n\nJOIN YOUR HEARTS LIKE AN APPLE,0.75,We Had A McDonald's Wedding In Hong Kong,Safiya Nygaard,
4,apple technology,"“APPLE, APPLE, JOIN YOUR HEARTS LIKE AN APPLE” 😂",0.74,We Had A McDonald's Wedding In Hong Kong,Safiya Nygaard,
...,...,...,...,...,...,...
95,burger king,What if burger king did a super bowl commercia...,0.71,ONE LAST ATTEMPT..,CaseyNeistat,
96,burger king,bread queen YYAASSD,0.71,If this video gets 100k likes my editor gets a...,RoomieOfficial,
97,burger king,bread queen 🍞🥖🥨🥯🥐,0.71,If this video gets 100k likes my editor gets a...,RoomieOfficial,
98,burger king,Zepyro the king!,0.71,i'm going back to NYC.,CaseyNeistat,


In [11]:
#brands_comments_df.to_csv('Validation_Set_Unranked.csv')

In [12]:
#!pip install openpyxl

In [13]:
#brands_comments_df.to_excel("Validation_Set_Unranked.xlsx")  

In [14]:
#!pip install xlrd

In [15]:
# Load the hand-labelled validation table ('Actually similar?' holds True/False).
# NOTE(review): presumably the exported Validation_Set_Unranked.xlsx, labelled
# by hand and saved under this name - confirm.
brands_ranked_df = pd.read_excel("Validation_Set_Ranked.xlsx")

In [16]:
brands_ranked_df

Unnamed: 0.1,Unnamed: 0,Brand,Comment,Sim_score,Vid,Chan,Actually similar?
0,0,apple technology,Apple,0.79,LAST TO LAUGH CHALLENGE!!,Alexa Rivera,True
1,1,apple technology,Apple 🍎 🤣,0.79,APPLE product for the first time 😍,Dimpu's Vlogs,True
2,2,apple technology,And Apple,0.78,The Mountains Won't Remember Me,Peter McKinnon,True
3,3,apple technology,APPLE APPLE\n\nJOIN YOUR HEARTS LIKE AN APPLE,0.75,We Had A McDonald's Wedding In Hong Kong,Safiya Nygaard,False
4,4,apple technology,"“APPLE, APPLE, JOIN YOUR HEARTS LIKE AN APPLE” 😂",0.74,We Had A McDonald's Wedding In Hong Kong,Safiya Nygaard,False
...,...,...,...,...,...,...,...
95,95,burger king,What if burger king did a super bowl commercia...,0.71,ONE LAST ATTEMPT..,CaseyNeistat,True
96,96,burger king,bread queen YYAASSD,0.71,If this video gets 100k likes my editor gets a...,RoomieOfficial,False
97,97,burger king,bread queen 🍞🥖🥨🥯🥐,0.71,If this video gets 100k likes my editor gets a...,RoomieOfficial,False
98,98,burger king,Zepyro the king!,0.71,i'm going back to NYC.,CaseyNeistat,False


In [17]:
# Total number of comments judged similar, over all brands (True counts as 1).
sum(brands_ranked_df['Actually similar?'])

38

In [18]:
# Number of comments judged similar for the 'apple technology' search term only.
sum(brands_ranked_df[brands_ranked_df['Brand'] == 'apple technology']['Actually similar?'])

10

In [19]:
# Group the labelled rows by search term for the per-brand summary.
grouped = brands_ranked_df.groupby('Brand')

In [20]:
# Percentage of each brand's labelled comments that were judged similar.
# Dividing by the actual group size (instead of a hard-coded 20) keeps the
# figure correct when a brand has more or fewer rows, and selecting the column
# before summing avoids aggregating the text columns.
(grouped['Actually similar?'].sum() * 100 / grouped.size()).to_frame('Actually similar?')

Unnamed: 0_level_0,Actually similar?
Brand,Unnamed: 1_level_1
apple technology,50.0
burger king,30.0
canon camera,30.0
nivea,45.0
tesla car,35.0


So we see that the results, as a percentage, aren't that great for longer search terms.

Let's try shorter search terms:

In [21]:
# Single-word versions of the brand search terms.
# NOTE(review): 'mcdonalds' takes the place of 'burger king' from the first
# list, so that pair is not a like-for-like comparison.
brands_short = ['apple','canon','tesla','nivea','mcdonalds']

In [22]:
def val_table_unranked(topics, num_comms, num_disp):
    """Build an unlabelled validation table for a list of search terms.

    For every term in ``topics`` the comments returned by
    ``get_comment_channel_results(term, num_comms, num_disp)`` are collected,
    one row per comment.

    Parameters
    ----------
    topics : iterable of str
        Search terms to query.
    num_comms : int
        Passed through to ``get_comment_channel_results``.
    num_disp : int
        Passed through to ``get_comment_channel_results``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Brand', 'Comment', 'Sim_score', 'Vid', 'Chan' and
        'Actually similar?'. The last column is empty (None), ready to be
        filled in by hand. The frame has these columns even when no comments
        were found.
    """
    columns = ['Brand', 'Comment', 'Sim_score', 'Vid', 'Chan', 'Actually similar?']

    # Collect plain dicts and build the frame once. DataFrame.append in a loop
    # is quadratic and was removed in pandas 2.0.
    records = []
    for brand in topics:
        _, comm_dict_list = get_comment_channel_results(brand, num_comms, num_disp)
        # Element 0 is the total comment count; the per-comment dicts follow.
        records.extend(
            {
                'Brand': brand,
                'Comment': comm['Comment'],
                'Sim_score': comm['Sim Score'],
                'Vid': comm['Vid_title'],
                'Chan': comm['Channel Name'],
                'Actually similar?': None,
            }
            for comm in comm_dict_list[1:]
        )
    return pd.DataFrame(records, columns=columns)

In [23]:
# Same settings as the first run: fetch 25 comments per term, keep 20 of them.
short_brands_df = val_table_unranked(brands_short,25,20)

In [24]:
short_brands_df

Unnamed: 0,Brand,Comment,Sim_score,Vid,Chan,Actually similar?
0,apple,Apple,1.00,LAST TO LAUGH CHALLENGE!!,Alexa Rivera,
1,apple,Apple 🍎 🤣,1.00,APPLE product for the first time 😍,Dimpu's Vlogs,
2,apple,And Apple,0.88,The Mountains Won't Remember Me,Peter McKinnon,
3,apple,Temima Apple no,0.85,Noob vs Pro Unicorn Cake Challenge! w/ Rosanna...,Joey Graceffa,
4,apple,Apple < PC,0.84,Mac Pro and Pro Display XDR Unboxing!,iJustine,
...,...,...,...,...,...,...
95,mcdonalds,KFC for me 🤦🏻‍♀️ lol,0.57,What I Eat In A Day As A Model // Romee Strijd,Romee Strijd,
96,mcdonalds,So is Carls JR Hardees?,0.57,CHICKEN SANDWICH CHALLENGE!,Rosanna Pansino,
97,mcdonalds,MATHEW MCDONALD nnuunnniî,0.56,The Couples of Pinterest,Cody Ko,
98,mcdonalds,Wassup Pepsi,0.55,SECURITY WAS CALLED!!,Shammi Vlogs,


In [25]:
#short_brands_df.to_csv('Validation_Set_Short_Unranked.csv')

In [26]:
#short_brands_df.to_excel("Validation_Set_Short_Unranked.xlsx") 

In [27]:
# Load the hand-labelled validation table for the short search terms.
# NOTE(review): presumably the exported Validation_Set_Short_Unranked.xlsx,
# labelled by hand and saved under this name - confirm.
short_brands_ranked_df  = pd.read_excel("Validation_Set_Short_Ranked.xlsx")

In [28]:
# Percentage of each brand's labelled comments that were judged similar.
# Dividing by the actual group size (instead of a hard-coded 20) keeps the
# figure correct when a brand has more or fewer rows, and selecting the column
# before summing avoids aggregating the text columns.
short_grouped = short_brands_ranked_df.groupby('Brand')
(short_grouped['Actually similar?'].sum() * 100 / short_grouped.size()).to_frame('Actually similar?')

Unnamed: 0_level_0,Actually similar?
Brand,Unnamed: 1_level_1
apple,45.0
canon,100.0
mcdonalds,50.0
nivea,45.0
tesla,70.0


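So we can see that the use of shorter terms produces a higher accuracy for most brands: canon rose from 30% to 100% and tesla from 35% to 70%, although apple dropped slightly (50% to 45%) and nivea, whose search term did not change, stayed at 45%. Note that 'mcdonalds' replaced 'burger king' in the short list, so that pair is not a like-for-like comparison. More specificity (especially for brand names) is important to get the comments most related to the search term.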