# Collection research - Search categories

## Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# importing decomposition modules
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

# importing Natural Language Toolkit and other NLP modules
import nltk
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob, Word
from nltk.corpus import stopwords
import re

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Firstview

In [3]:
df = pd.read_csv('/Users/rajathadri_as/Documents/FAD - searchTerms.csv')

In [4]:
pd.set_option('display.max_rows',None)

In [6]:
df.head()

Unnamed: 0,Search term,Total users
0,Massage,14814
1,massage,4554
2,Escort,3290
3,Escorts,2920
4,Sex,1874


In [7]:
df1 = df.copy()

In [9]:
df = df.drop('Total users', axis =1)

In [10]:
df.head()

Unnamed: 0,Search term
0,Massage
1,massage
2,Escort
3,Escorts
4,Sex


In [12]:
df.shape

(87733, 1)

## Search Terms manipulation

In [13]:
list(df['Search term'])

['Massage',
 'massage',
 'Escort',
 'Escorts',
 'Sex',
 'Adult',
 'Brighton',
 'Adult massage',
 'Kittens',
 'Mature',
 'Massage ',
 'Eastbourne',
 'Trans',
 'Crawley',
 'Hastings',
 'Kent',
 'London',
 'Erotic massage',
 'Hove',
 'Canterbury',
 'escort',
 'Worthing',
 'Kitten',
 'Incall',
 'Maidstone',
 'Sofa',
 'Adult services',
 'Bbw',
 'escorts',
 'Asian',
 'adult',
 'Thai',
 'Southampton',
 'Free',
 'Puppies',
 'Milf',
 'Escorts ',
 'Dogs',
 'Portsmouth',
 'Gay',
 'adult massage',
 'Brighton ',
 'Ts',
 'Indian',
 'Double bed',
 'Croydon',
 'Bristol',
 'Eastbourne ',
 'sex',
 'Women',
 'Chichester',
 'Independent',
 'Jumble sale',
 'Shemale',
 'Puppy',
 'Dartford',
 'Liverpool',
 'Aldershot',
 'Rochester',
 'Cars',
 'Chatham',
 'Jobs',
 'Anal',
 'Margate',
 'Male massage',
 'Tv',
 'Mass',
 'Bed',
 'Black',
 'Escort ',
 'Gravesend',
 'Redhill',
 'Tantric',
 'Bareback',
 'Caravan',
 'adult services',
 'Folkestone',
 'Ashford',
 'Es',
 'Sensual massage',
 'Surrey',
 'Cats',
 'Dog',
 '

In [20]:
corpus_temp = []

text = list(df['Search term'])

for i in range(len(text)):
    r = re.sub('[^a-zA-Z]', ' ', str(text[i]))
    r = r.lower()
    r = r.split()
#     r = [word for word in r if word not in stopwords.words('english')]
#     r = [lemmatizer.lemmatize(word) for word in r]
    r = ' '.join(r)
    corpus_temp.append(r)

In [22]:
corpus_temp

['massage',
 'massage',
 'escort',
 'escorts',
 'sex',
 'adult',
 'brighton',
 'adult massage',
 'kittens',
 'mature',
 'massage',
 'eastbourne',
 'trans',
 'crawley',
 'hastings',
 'kent',
 'london',
 'erotic massage',
 'hove',
 'canterbury',
 'escort',
 'worthing',
 'kitten',
 'incall',
 'maidstone',
 'sofa',
 'adult services',
 'bbw',
 'escorts',
 'asian',
 'adult',
 'thai',
 'southampton',
 'free',
 'puppies',
 'milf',
 'escorts',
 'dogs',
 'portsmouth',
 'gay',
 'adult massage',
 'brighton',
 'ts',
 'indian',
 'double bed',
 'croydon',
 'bristol',
 'eastbourne',
 'sex',
 'women',
 'chichester',
 'independent',
 'jumble sale',
 'shemale',
 'puppy',
 'dartford',
 'liverpool',
 'aldershot',
 'rochester',
 'cars',
 'chatham',
 'jobs',
 'anal',
 'margate',
 'male massage',
 'tv',
 'mass',
 'bed',
 'black',
 'escort',
 'gravesend',
 'redhill',
 'tantric',
 'bareback',
 'caravan',
 'adult services',
 'folkestone',
 'ashford',
 'es',
 'sensual massage',
 'surrey',
 'cats',
 'dog',
 'tantr

In [25]:
len(corpus_temp)

87733

### Dropping Duplicates

In [27]:
clean_list = list(set(corpus_temp))

In [28]:
clean_list

['',
 'single bed base with drawers',
 'cast iron post',
 'momen retorts',
 'bengali girl',
 'gravel stones shingle',
 'van vault tipper',
 'rent gay boys',
 'iron gates',
 'bedford properties to rent',
 'purebred yorkshire terrier pups for sale',
 'exercise bike cross trainer',
 'auto car',
 'gi',
 'motor car',
 'stair lifts',
 'boulders for sale',
 'lkj flooring services ltd',
 'philips light bulbs',
 'callaway',
 'cheeky',
 'incqll',
 'pc keyboard',
 'adult bicycles',
 'zimmer frames',
 'loft',
 'knee high boots',
 'kodi boxes',
 'your local oriental spice chinese escort massage service plumstead se',
 'free blowjob',
 'tap push on hose',
 'black francolin',
 'driver warehou',
 'ailia',
 'honda cc',
 'yacht craddle',
 'prudence',
 'skoda rapid',
 'exult',
 'children table and chairs',
 'fxify',
 'south ealing gay',
 'double ended bath x',
 'magical',
 'toyota granvia',
 'skate shoes',
 'garden cart',
 'best if british board game',
 'kids wardrobe',
 'resonator',
 'fire surrounde',
 

## Vectorizer and term_matrix

In [29]:
stpwrd = nltk.corpus.stopwords.words('english')

In [30]:
count_vectorizer = CountVectorizer(stop_words = stpwrd, max_features = 4000)

In [31]:
document_term_matrix = count_vectorizer.fit_transform(clean_list)

In [32]:
document_term_matrix

<65424x4000 sparse matrix of type '<class 'numpy.int64'>'
	with 112630 stored elements in Compressed Sparse Row format>

In [33]:
print('Content after vectorization: \n{}'.format(document_term_matrix))

Content after vectorization: 
  (1, 3160)	1
  (1, 256)	1
  (1, 224)	1
  (1, 1032)	1
  (2, 561)	1
  (2, 1777)	1
  (2, 2685)	1
  (4, 1443)	1
  (5, 1489)	1
  (5, 3343)	1
  (6, 3747)	1
  (6, 3564)	1
  (7, 2866)	1
  (7, 1415)	1
  (7, 389)	1
  (8, 1777)	1
  (8, 1412)	1
  (9, 2866)	1
  (9, 258)	1
  (9, 2734)	1
  (10, 3990)	1
  (10, 3521)	1
  (10, 2757)	1
  (10, 2981)	1
  (11, 1208)	1
  :	:
  (65408, 3305)	1
  (65408, 3325)	1
  (65409, 1674)	1
  (65412, 1349)	1
  (65412, 2083)	1
  (65412, 3837)	1
  (65413, 1329)	1
  (65413, 3645)	1
  (65414, 1174)	1
  (65414, 820)	1
  (65416, 2977)	1
  (65416, 1411)	1
  (65416, 1650)	1
  (65417, 1220)	1
  (65417, 867)	1
  (65417, 2042)	1
  (65418, 3835)	1
  (65419, 3674)	1
  (65419, 1496)	1
  (65420, 565)	1
  (65422, 673)	1
  (65422, 1668)	1
  (65423, 1410)	1
  (65423, 372)	1
  (65423, 2061)	1


## LDA

In [34]:
lda = LatentDirichletAllocation(n_components=5, random_state=42) 
lda.fit(document_term_matrix)

In [35]:
for i,topic in enumerate(lda.components_): 
    print(f'Topic #{i}:') 
    print([count_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print('\n')

Topic #0:
['call', 'home', 'double', 'escort', 'girl', 'garden', 'girls', 'bed', 'free', 'escorts']


Topic #1:
['body', 'service', 'electric', 'tv', 'women', 'ford', 'table', 'sex', 'escort', 'massage']


Topic #2:
['sale', 'sussex', 'kittens', 'sofa', 'old', 'jobs', 'black', 'bike', 'puppies', 'male']


Topic #3:
['kitchen', 'automatic', 'van', 'services', 'room', 'chairs', 'cars', 'wanted', 'dog', 'adult']


Topic #4:
['wood', 'machine', 'men', 'caravan', 'bedroom', 'house', 'flat', 'car', 'sale', 'rent']


