<h1> <center> Table of Contents </center> </h1>

<div class="alert alert-block alert-info" style="margin-top: 20px">
    <ol>
        <li><a href="#1.-Required-Libraries"> Required Libraries</a></li>
        <li><a href="#2.-Importing-the-Data"> Importing the Data</a></li>
        <li><a href="#3.-Creating-Corpus"> Creating Corpus</a> </li>
        <li><a href="#4.-Using-Fuzzywuzzy-to-get-best-matched-dashboards"> Using Fuzzywuzzy to get best matched dashboards</a></li> 
        <li><a href="#5.-Measuring-the-accuracy-of-the-code"> Measuring the accuracy of the code</a></li>
    </ol>
</div>

In [1]:
%%HTML
<style type="text/css">
/* Force a 1px solid black border and black text on every cell (td/th) of
   tables carrying the "dataframe" class; !important overrides theme CSS. */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

# 1. Required Libraries 

## pip installs

In [2]:
#!pip install python-Levenshtein
#!pip install "fuzzywuzzy==0.18.0"
#!pip install rapidfuzz
#!pip install spacy
#!pip install gensim
#!pip install rank_bm25
#!python -m spacy download en_core_web_lg
#!pip install fast-autocomplete

## Imports 

In [3]:
import pandas as pd
from pandas.core.common import flatten
import numpy as np

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import rapidfuzz
from rapidfuzz import process, utils

import timeit
import time

import warnings
warnings.filterwarnings('ignore')



## 2. Importing the Data

In [4]:
# Load the three CSV inputs. The paths are relative, so the files must sit in
# the notebook's working directory.
# dfdashboards: passed to preprocess_datafiles, which reads its `dashboards`,
#               `caption` and `Global_usage` columns.
# dfmetrics:    passed to preprocess_datafiles, which reads its `metric_name`
#               and `descriptors` columns.
# dfuserquey:   sample user searches (name kept as spelled); not referenced by
#               the cells visible in this part of the notebook.
dfdashboards = pd.read_csv('microstrategy_and_dashboard.csv')
dfmetrics = pd.read_csv('tableau_metric.csv')
dfuserquey = pd.read_csv('sample_user_searchs.csv')

In [5]:
def preprocess_datafiles(dashboard , metric):
    """Build the long-format dashboard / metric / descriptor lookup table.

    Parameters
    ----------
    dashboard : pandas.DataFrame
        Must contain the columns ``dashboards``, ``caption`` (metric names
        joined with ``|``) and ``Global_usage`` (numeric, or numeric strings).
    metric : pandas.DataFrame
        Must contain the columns ``metric_name`` and ``descriptors``
        (descriptors joined with ``|``).

    Returns
    -------
    pandas.DataFrame
        Columns ``dashboard_names``, ``metric_names``, ``descriptor_names``.
        One row per (dashboard, metric, descriptor) combination for every
        metric that is listed in a dashboard caption AND present in
        ``metric.metric_name`` (inner join). All values are lower-cased strings.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``Global_usage`` holds values ``pd.to_numeric`` cannot parse.

    The input frames are not modified.
    """
    # Lower-case every cell so the later joins and fuzzy lookups are
    # case-insensitive.
    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan', which then flows into the result like any other token.
    dashboard = dashboard.apply(lambda col: col.astype(str).str.lower())
    metric = metric.apply(lambda col: col.astype(str).str.lower())

    # `assign` returns a new frame, so no column is ever written onto a slice
    # of another frame (the source of SettingWithCopyWarning).
    dashboard = dashboard[['dashboards', 'caption', 'Global_usage']].assign(
        # Not part of the result; converting it keeps the numeric validation.
        Global_usage=lambda frame: pd.to_numeric(frame['Global_usage']),
        metric_names=lambda frame: frame['caption'].str.split('|'),
    )
    metric = metric.assign(descriptors=metric['descriptors'].str.split('|'))

    # One row per (dashboard, metric) and per (metric, descriptor): `explode`
    # expands each list element into its own row.
    dashboard_metrics = (
        dashboard[['dashboards', 'metric_names']]
        .explode('metric_names')
        .rename(columns={'metric_names': 'caption'})
        .reset_index(drop=True)
    )
    metric_descriptors = (
        metric[['metric_name', 'descriptors']]
        .explode('descriptors')
        .rename(columns={'descriptors': 'descriptor'})
        .reset_index(drop=True)
    )

    # Inner join keeps only metrics known to both files.
    master = dashboard_metrics.merge(
        metric_descriptors,
        how='inner',
        left_on='caption',
        right_on='metric_name',
    )[['dashboards', 'metric_name', 'descriptor']]

    return master.rename(columns={
        'dashboards': 'dashboard_names',
        'metric_name': 'metric_names',
        'descriptor': 'descriptor_names',
    })



In [6]:
#"This is the final dataframe we would be using."
master = preprocess_datafiles(dfdashboards,dfmetrics)

## 3. Creating Corpus

In [8]:

# Unique values of each master column, in order of first appearance.
dashboard_names_list, metric_names_list, descriptor_names_list = (
    list(master[column].unique())
    for column in ('dashboard_names', 'metric_names', 'descriptor_names')
)

# Search corpus: dashboards, then metrics, then descriptors, with falsy
# entries (e.g. empty strings) removed.
mastercorpus = [
    entry
    for entry in dashboard_names_list + metric_names_list + descriptor_names_list
    if entry
]

## 4. Using Fuzzywuzzy to get best matched dashboards

FuzzyWuzzy is a Python library for fuzzy string matching, i.e. finding strings that approximately match a given pattern. It uses the <b>Levenshtein distance</b> to quantify the difference between two sequences.


In [9]:
#Logic to get the dashboards

def fuzzywuzzy_suggestion(corpus, input_query, limit=10):
    """Return the best fuzzy matches for ``input_query`` within ``corpus``.

    Parameters
    ----------
    corpus : collection of str
        Candidate strings to match against.
    input_query : str
        The user's search text.
    limit : int, default 10
        Maximum number of matches to return.

    Returns
    -------
    Whatever ``process.extract`` returns for these arguments.

    Note: ``process`` is the module-level name. This notebook imports it from
    fuzzywuzzy and then again from rapidfuzz, so the later import is the one
    actually used.
    """
    return process.extract(input_query, corpus, limit=limit)

def dashboard_names_suggestion(master, suggestions):
    """Translate matched corpus entries into dashboard names.

    Parameters
    ----------
    master : pandas.DataFrame
        Lookup table with the string columns ``dashboard_names``,
        ``metric_names`` and ``descriptor_names``.
    suggestions : iterable of str
        Matched corpus entries, best match first.

    Returns
    -------
    list of str
        For each suggestion, in order: the suggestion itself when it is a
        dashboard name; otherwise the dashboards of every ``master`` row whose
        metric (if the suggestion is a metric name) or descriptor (in all
        other cases) equals it. Duplicates are kept; de-duplication is left
        to the caller.
    """
    # Membership sets come from `master` itself, so the function does not
    # depend on notebook-level lists and each lookup is O(1).
    known_dashboards = set(master.dashboard_names)
    known_metrics = set(master.metric_names)

    # Lower-cased comparison keys are loop-invariant: compute them once.
    metric_keys = master.metric_names.str.lower()
    descriptor_keys = master.descriptor_names.str.lower()

    dashboard_names = []
    for suggestion in suggestions:
        if suggestion in known_dashboards:
            dashboard_names.append(suggestion)
        elif suggestion in known_metrics:
            dashboard_names.extend(
                master.loc[metric_keys == suggestion, 'dashboard_names'].to_list()
            )
        else:
            # Anything else is treated as a descriptor; unknown strings simply
            # match no rows.
            dashboard_names.extend(
                master.loc[descriptor_keys == suggestion, 'dashboard_names'].to_list()
            )

    return dashboard_names

In [10]:
# Shared score table: corpus entry -> token_set_ratio score for the most
# recent search. Filled by fuzzywuzzy_scorers_suggestions.
token_set_ratio_sugg = {}

def fuzzywuzzy_scorers_suggestions(usersearch, corpus):
    """Score every corpus entry against ``usersearch``.

    The scores are written into the module-level ``token_set_ratio_sugg``
    dict (entry -> ``fuzz.token_set_ratio`` score). The dict is emptied first
    so entries from an earlier call with a different corpus cannot leak into
    the ranking of the current search.

    Parameters
    ----------
    usersearch : str
        The query text to compare against.
    corpus : iterable of str
        Candidate strings to score.

    Returns
    -------
    None
    """
    # Clear in place (do not rebind) so other code holding a reference to the
    # dict keeps seeing the current scores.
    token_set_ratio_sugg.clear()
    for token in corpus:
        token_set_ratio_sugg[token] = fuzz.token_set_ratio(usersearch, token)

        
def fuzzywuzzy_scorers_similiarity(scorers,suggestions_count,sort_scorers):
    """Rank the scored corpus entries and return the best ones.

    Reads the module-level ``token_set_ratio_sugg`` dict (entry -> score).

    Parameters
    ----------
    scorers : list of str
        Columns to keep, chosen from ``'suggestion'`` and
        ``'token_set_ratio_sugg'``.
    suggestions_count : int
        Number of top rows to return.
    sort_scorers : str or list of str
        Column(s) to sort by, in descending order.

    Returns
    -------
    pandas.DataFrame
        The ``suggestions_count`` highest-ranked rows, restricted to
        ``scorers``.
    """
    # A one-row frame (entries as columns), transposed so that every entry
    # becomes a row, then the entry index is turned into a regular column.
    score_table = pd.DataFrame([token_set_ratio_sugg]).transpose().reset_index()
    score_table.columns = ['suggestion', 'token_set_ratio_sugg']

    ranked = score_table[scorers].sort_values(by=sort_scorers, ascending=False)
    return ranked.head(suggestions_count)

In [15]:
# Read the search text interactively. This cell blocks until the user types a
# query, so the downstream results depend on what is entered here.
input_query = input()

Macro style values 11 CT 22 OZ bottle Dollar sales


In [16]:
# Score every corpus entry against the lower-cased query; the scores land in
# the module-level `token_set_ratio_sugg` dict.
fuzzywuzzy_scorers_suggestions(input_query.lower(), mastercorpus)

fuzzyScorers = ['suggestion', 'token_set_ratio_sugg']
scorersSorting = ['token_set_ratio_sugg']

# Top 15 corpus entries. The helper already returns them sorted by descending
# score, so no further sort is needed.
scoreSortedDf = fuzzywuzzy_scorers_similiarity(
    scorers=fuzzyScorers,
    suggestions_count=15,
    sort_scorers=scorersSorting,
)

# Map the 10 best corpus entries to the dashboards they belong to.
fuzzywuzzyDashboardsList = list(
    dashboard_names_suggestion(master, scoreSortedDf['suggestion'][:10])
)

# dict.fromkeys drops duplicate dashboards while keeping first-seen order.
fuzzywuzzySearchOutput = list(dict.fromkeys(fuzzywuzzyDashboardsList))

# One inner list per scorer; only token_set_ratio is used here.
allScorersOutput = [fuzzywuzzySearchOutput[:10]]

# Ranked corpus entries as a single column named after the scorer.
scorersSuggestionsDf = scoreSortedDf[['suggestion']].reset_index(drop=True)
scorersSuggestionsDf.columns = ['token_set_ratio_sugg']

print('User Search : ' + str(input_query))

column_names = ['Dashboard Suggestions']
# Transpose so each scorer's dashboard list becomes one column.
finalSuggestionDF = pd.DataFrame(allScorersOutput).transpose().set_axis(column_names, axis=1)
print("Displaying the dashboards names by each scorer methods.")
display(finalSuggestionDF)

User Search : Macro style values 11 CT 22 OZ bottle Dollar sales
Displaying the dashboards names by each scorer methods.


Unnamed: 0,Dashboard Suggestions
0,ad analysis
1,r geography over time
2,category and segment analysis
3,competitive set
4,dimensions over time
5,line geogs over time
6,market share
7,package analysis
8,rankers
9,style analysis
