## This is an exercise to find the most frequent words using CountVectorizer and TfidfTransformer

#### This could be a form of feature engineering if needed

In [1]:
import numpy as np
import pandas as pd

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import cufflinks as cf

# Initialise Plotly's notebook mode and put cufflinks into offline mode.
# NOTE(review): no plot is produced in the visible part of this notebook —
# confirm this setup (and the plotly/cufflinks imports) is still needed.
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
import nltk  # Imports the library

# Download only the corpora this notebook uses (stopwords and WordNet).
# A bare nltk.download() opens the interactive downloader, which blocks
# "Restart & Run All" until someone responds to it.
# NOTE(review): 'omw-1.4' is included because some NLTK releases need it to
# load WordNet — confirm against the installed NLTK version.
for resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### I am using the slogan dataset from Kaggle

In [4]:
# Load the slogan CSV from the notebook's working directory (relative path).
# NOTE(review): some values display as mojibake in the output below
# (e.g. 'TÃ­o Pepe') — check the file's encoding / how it was exported.
df = pd.read_csv('sloganlist.csv')

In [5]:
df.head(10)

Unnamed: 0,Company,Slogan
0,Costa Coffee,For coffee lovers.
1,Evian,Evian. Live young.
2,Dasani,Designed to make a difference.
3,Heineken,It's all about the beer.
4,Gatorade,The Legend Continues.
5,TÃ­o Pepe,Good food tastes better after a glass of Tio Pepe
6,Tetley's Brewery,Don't Do Things By Halves.
7,Batemans Brewery,Good Honest Ales
8,Jones Soda,Run with the little guyâ¦ create some change.
9,Grapette,Thirsty or Not.


In [6]:
df.describe()

Unnamed: 0,Company,Slogan
count,1162,1162
unique,569,568
top,Domino's Pizza,Taste The Feeling.
freq,31,31


---

### From describe(), it looks like the CSV file contains many duplicate rows, so I'll remove all the duplicates

In [7]:
# Remove fully duplicated rows and rebind `df` to the de-duplicated frame.
# Rows that share a slogan but differ in company are kept (see the
# 'Exquisite wodka.' check further down).
df = df.drop_duplicates() 
df

Unnamed: 0,Company,Slogan
0,Costa Coffee,For coffee lovers.
1,Evian,Evian. Live young.
2,Dasani,Designed to make a difference.
3,Heineken,It's all about the beer.
4,Gatorade,The Legend Continues.
...,...,...
1136,Hardee's,Where the food's the star.
1137,Burger King,Have it your way.
1138,Kit Kat,"Have a Break, Have a Kit Kat."
1139,Subway,Eat Fresh.


In [8]:
df.describe()

Unnamed: 0,Company,Slogan
count,569,569
unique,569,568
top,Del Taco,Exquisite wodka.
freq,1,2


In [9]:
df[df['Slogan'] == 'Exquisite wodka.']

Unnamed: 0,Company,Slogan
334,Wyborowa Vodka,Exquisite wodka.
448,Stolichnaya vodka,Exquisite wodka.


### Two different brands of vodka with the same slogan so I'll let it be

---

### Let's process the slogans

In [10]:
# Informal abbreviations and apostrophe-less contractions; text_process
# discards any word that appears in this list.
abbrv_list = [
    'lol',
    'lmao',
    'rofl',
    'ive',
    'youve',
    'brb',
    'ttyl',
    'im',
]
special_chars_list = ['â', '', '', 'Ã', '©']

In [11]:
import string
from nltk.corpus import stopwords, wordnet


def text_process(msg):
    """
    Clean one piece of text and return it as a single space-separated string.

    Steps, in order:
    1. Drop every character found in ``string.punctuation`` or in
       ``special_chars_list``.
    2. Split the remainder on whitespace and lowercase each word.
    3. Discard English stopwords and the abbreviations in ``abbrv_list``
       (they are removed, not expanded).
    4. Replace each remaining word with the base form returned by
       ``wordnet.morphy``; words for which it returns None are kept as-is.

    Parameters
    ----------
    msg : str
        Raw text, e.g. a single slogan.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' if nothing survives).
    """
    # Build the lookup sets once per call. The stopword list does not change
    # between words, so fetching it inside the word loop was repeated work.
    dropped_chars = set(string.punctuation) | set(special_chars_list)
    dropped_words = set(stopwords.words('english')) | set(abbrv_list)

    # NOTE: punctuation is stripped before the stopword check, so a
    # contraction such as "Don't" becomes "dont" and is not filtered out.
    stripped = ''.join(ch for ch in msg if ch not in dropped_chars)

    cleaned_words = []
    for word in stripped.split():
        lowered = word.lower()
        if lowered in dropped_words:
            continue
        # morphy() gives None when it has no base form for the word.
        lemma = wordnet.morphy(lowered)
        cleaned_words.append(lowered if lemma is None else lemma)

    return ' '.join(cleaned_words)

In [12]:
# Name of the DataFrame column holding the slogan text; the cleaning and
# vectorization cells below read the column through this variable.
col = 'Slogan'

In [16]:
# Build the cleaned frame as a separate object so `df` keeps the original
# slogans. `cleaned_df = df` only aliased the same DataFrame, so the column
# assignment also overwrote `df`, and re-running this cell cleaned text that
# had already been cleaned.
cleaned_df = df.copy()
cleaned_df[col] = df[col].apply(text_process)

---

### Here are my cleaned slogans

In [17]:
cleaned_df[col]

0                 coffee lover
1             evian live young
2       design make difference
3                         beer
4              legend continue
                 ...          
1136                 food star
1137                       way
1138             break kit kat
1139                 eat fresh
1141                     lovin
Name: Slogan, Length: 569, dtype: object

---

### Vectorization of Slogans

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Tokenize the cleaned slogans and learn the vocabulary: a mapping from each
# distinct token to its column index (printed in the next cell).
count_vectorizer = CountVectorizer().fit(cleaned_df[col])

In [20]:
# summarize
print(count_vectorizer.vocabulary_)

{'coffee': 133, 'lover': 424, 'evian': 219, 'live': 415, 'young': 808, 'design': 180, 'make': 433, 'difference': 183, 'beer': 46, 'legend': 403, 'continue': 143, 'good': 292, 'food': 261, 'taste': 691, 'better': 53, 'glass': 288, 'tio': 718, 'pepe': 515, 'dont': 190, 'things': 708, 'half': 316, 'honest': 342, 'ale': 13, 'run': 598, 'little': 414, 'guy': 312, 'create': 160, 'change': 111, 'thirsty': 711, 'juice': 378, 'jiffy': 375, 'bring': 84, 'crazy': 157, 'genki': 280, 'hatsuratsu': 327, 'hey': 338, 'culligan': 165, 'belvedere': 49, 'always': 14, 'go': 289, 'smoothly': 649, 'pure': 553, 'spirit': 661, 'vodka': 763, 'take': 687, 'white': 784, 'horse': 345, 'anywhere': 19, 'friend': 271, 'old': 490, 'parr': 504, 'royal': 593, 'race': 561, 'place': 529, 'strong': 679, 'healthier': 330, 'gum': 311, 'drink': 194, 'fanta': 229, 'stay': 669, 'bamboocha': 36, 'great': 300, 'head': 329, 'first': 249, 'pizza': 528, 'deliver': 178, 'expert': 223, 'peace': 511, 'love': 423, 'ice': 352, 'cream': 

In [21]:
# Encode every cleaned slogan as a row of token counts using the fitted
# vocabulary. The printed entries read "(row, vocabulary index)  count".
all_msgs_vector = count_vectorizer.transform(cleaned_df[col])
print(all_msgs_vector)

  (0, 133)	1
  (0, 424)	1
  (1, 219)	1
  (1, 415)	1
  (1, 808)	1
  (2, 180)	1
  (2, 183)	1
  (2, 433)	1
  (3, 46)	1
  (4, 143)	1
  (4, 403)	1
  (5, 53)	1
  (5, 261)	1
  (5, 288)	1
  (5, 292)	1
  (5, 515)	1
  (5, 691)	1
  (5, 718)	1
  (6, 190)	1
  (6, 316)	1
  (6, 708)	1
  (7, 13)	1
  (7, 292)	1
  (7, 342)	1
  (8, 111)	1
  :	:
  (557, 341)	1
  (558, 261)	1
  (558, 323)	1
  (558, 434)	1
  (558, 514)	1
  (559, 16)	1
  (559, 261)	1
  (560, 184)	1
  (560, 653)	1
  (561, 269)	1
  (561, 641)	1
  (562, 585)	1
  (562, 691)	1
  (563, 92)	1
  (563, 501)	1
  (563, 709)	1
  (564, 261)	1
  (564, 667)	1
  (565, 775)	1
  (566, 79)	1
  (566, 382)	1
  (566, 389)	1
  (567, 201)	1
  (567, 268)	1
  (568, 425)	1


---

### Applying TF-IDF algorithm

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer

In [23]:
# Fit the TF-IDF transformer on the count matrix; the fitted object exposes
# the per-term weights as `idf_`, which a later cell reads.
tfidf_transformer = TfidfTransformer().fit(all_msgs_vector)

In [24]:
# Apply the fitted transformer to the same count matrix. The printed entries
# read "(row, vocabulary index)  weighted score".
messages_tfidf = tfidf_transformer.transform(all_msgs_vector)
print(messages_tfidf)

  (0, 424)	0.8120610477121847
  (0, 133)	0.583572493173375
  (1, 808)	0.6170194344433083
  (1, 415)	0.4884404109392671
  (1, 219)	0.6170194344433083
  (2, 433)	0.3996201195064444
  (2, 183)	0.6275069936630012
  (2, 180)	0.6682355370598582
  (3, 46)	1.0
  (4, 403)	0.7071067811865476
  (4, 143)	0.7071067811865476
  (5, 718)	0.4643575542189262
  (5, 691)	0.27533002873032236
  (5, 515)	0.4643575542189262
  (5, 292)	0.27533002873032236
  (5, 288)	0.4643575542189262
  (5, 261)	0.32371314078907243
  (5, 53)	0.31098671765630925
  (6, 708)	0.5847631354795332
  (6, 316)	0.6227173750031979
  (6, 190)	0.5198799344591708
  (7, 342)	0.6103629191218752
  (7, 292)	0.4039941472967685
  (7, 13)	0.6813558805139794
  (8, 598)	0.4714330618940757
  :	:
  (557, 84)	0.6817816882288547
  (558, 514)	0.5127066510144902
  (558, 434)	0.5946055526513928
  (558, 323)	0.460169742610059
  (558, 261)	0.41451168228148844
  (559, 261)	0.5718758868245777
  (559, 16)	0.8203401550994579
  (560, 653)	0.7071067811865476
  (56

In [25]:
print(messages_tfidf.shape)

(569, 813)


#### Going by the IDF score should be the best bet, since a lower IDF score means the word appears in more slogans (i.e. it is more frequent).
#### I will turn the IDF score list into a hashmap (vocabulary index -> IDF score) and then use the keys, sorted by value, to find the words

In [26]:
# Map each position in idf_ (the vocabulary index) to its IDF weight.
idf_score_dict = dict(enumerate(tfidf_transformer.idf_))

In [27]:
import operator
# Rebuild the mapping with its entries ordered by ascending IDF value
# (itemgetter(1) picks the value out of each (index, idf) pair), so the
# lowest-IDF vocabulary indices come first.
sorted_idf_score_dict = dict(sorted(idf_score_dict.items(), key=operator.itemgetter(1)))

In [28]:
# Iterating a dict yields its keys in insertion order, so the first ten are
# the vocabulary indices with the lowest IDF scores.
top_10_idf_score_list = list(sorted_idf_score_dict)[:10]
top_10_idf_score_list

[292, 691, 433, 46, 53, 283, 50, 261, 409, 695]

In [29]:
# Look up the word behind each vocabulary index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(count_vectorizer, "get_feature_names_out"):
    unique_word_list = count_vectorizer.get_feature_names_out()
else:
    unique_word_list = count_vectorizer.get_feature_names()

for i in top_10_idf_score_list:
    try:
        print(unique_word_list[i], idf_score_dict[i])
    except (IndexError, KeyError):
        # Only a missing index/key means "no such word"; any other error
        # should surface instead of being silently swallowed.
        print("no word matching the index")
good 3.9444389791664403
taste 3.9444389791664403
make 3.9783405308421216
beer 4.455264602932431
better 4.455264602932431
get 4.573047638588815
best 4.637586159726386
food 4.637586159726386
life 4.637586159726386
tea 4.706579031213337


### From the result, it looks like the dataset has a lot of slogans for edible items

### Verifying my results for some of these words

In [30]:
# Match "good" as a whole word: CountVectorizer counts whole tokens, whereas a
# plain substring test would also hit longer words that merely contain it.
cleaned_df[cleaned_df['Slogan'].str.contains(r"\bgood\b")]

Unnamed: 0,Company,Slogan
5,TÃ­o Pepe,good food taste better glass tio pepe
7,Batemans Brewery,good honest ale
34,KFC(Kentucky Fried Chicken),finger lickin good
120,Sunkist (soft drink),good vibration
177,United Breweries,king good times
296,Labatt Brewing Company,good things brewing
321,Guinness,guinness good
360,Jamba Juice,good tidings blend
368,Lavazza,good karma great coffee
400,Yogi Tea,good feel


In [31]:
# Match "taste" as a whole word: CountVectorizer counts whole tokens, whereas
# a plain substring test would also hit longer words that merely contain it.
cleaned_df[cleaned_df['Slogan'].str.contains(r"\btaste\b")]

Unnamed: 0,Company,Slogan
5,TÃ­o Pepe,good food taste better glass tio pepe
39,Coca-Cola,taste feeling
47,Campa Cola,great indian taste
56,Crystal Pepsi,never see taste like
83,Dr. Brown's,taste town
90,Lilt,totally tropical taste
92,Fresca,nothing taste like fresca
99,Jolly Cola,free taste
127,Sutter Home Winery,taste commitment
133,Staropramen Brewery,get taste prague


In [32]:
# Match "food" as a whole word: CountVectorizer counts whole tokens, whereas a
# plain substring test would also hit longer words that merely contain it.
cleaned_df[cleaned_df['Slogan'].str.contains(r"\bfood\b")]

Unnamed: 0,Company,Slogan
5,TÃ­o Pepe,good food taste better glass tio pepe
95,Horlicks,food drink night
587,Dan-D Foods,fine food earth
588,Canyon Creek Food Company,favorite food make easy
920,Nestles,nestle good food good life
945,Heinz,good food every day
969,Winky's,fast food cheap
1013,Brewers Fayre,pub food
1055,Checkers and Rally's,crazy good food
1059,Village Inn,good food good feelings


### These words are very frequent in the slogans so my results are correct