In [1]:
import toLog
# Record the start of the run through the project-local `toLog` helper and keep
# its return value in `log` (presumably a logger/handle — TODO confirm in toLog).
log = toLog.log('Categories extraction starting')

In [2]:
from pprint import pprint

In [3]:
from pyspark.sql import SparkSession

# Create the SparkSession used by every later cell, or reuse one that already
# exists in this JVM (that is what getOrCreate does).
session_builder = SparkSession.builder.appName('Python spark')
session_builder = session_builder.config('spark.some.config.option', 'some-value')
spark = session_builder.getOrCreate()

In [4]:
# Load the cleaned reviews CSV from HDFS. The first line of the file is used
# as the header and Spark infers the column types from the data.
cleaned_csv_path = "hdfs://masterbig-1.itc.unipi.it:54310/user/student18/df_cleaned.csv"
csv_read_options = dict(header=True, inferSchema=True)
df_cleaned = spark.read.csv(cleaned_csv_path, **csv_read_options)

# Row count, shown as the cell output as a quick sanity check on the load.
df_cleaned.count()

505009

In [5]:
from pyspark.mllib.feature import Word2Vec

In [12]:
# Turn every non-null review into a list of word tokens; the resulting RDD of
# token lists is what gets passed to Word2Vec.fit further down.
#
# str.split() with no argument splits on any run of whitespace, so repeated
# spaces, tabs or newlines no longer produce empty-string tokens that would
# otherwise end up in the Word2Vec vocabulary and context windows.
rdd_tokens = (df_cleaned.select("Review").rdd
              # .sample(False, 0.5)  # optional: 50% subsample without replacement for quicker experiments
              .filter(lambda x: x['Review'] is not None)  # rows with a missing review cannot be tokenised
              .map(lambda x: x['Review'].split())
             )

In [14]:
# Word2Vec configuration: ignore words occurring fewer than 50 times, learn
# 300-dimensional vectors, and use a context window of 5 tokens.
# A fixed seed is set explicitly; without it Spark picks a random seed on every
# run, so the synonym lists below would differ between runs of the notebook.
word2Vec = (Word2Vec()
            .setMinCount(50)
            .setVectorSize(300)
            .setWindowSize(5)
            .setSeed(42))

In [15]:
# Train the Word2Vec model on the tokenised reviews (an RDD of token lists).
model = word2Vec.fit(rdd_tokens)

In [25]:
# Seed words for the macro-categories. Each one is later expanded with its
# nearest Word2Vec neighbours. The trailing notes keep the author's original
# alias hints for some of the categories.
categories = [
    'breakfast',
    'facilities',
    'staff',       # staff (service)
    'room',
    'internet',    # internet (wi-fi)
    'location',
    'bathroom',    # bath
    'food',        # food (breakfast)
]

In [26]:
# For every seed category, ask the model for its 20 closest words and store the
# returned (word, score) pairs as a dict keyed by the category name.
all_categories = {
    seed_word: dict(model.findSynonyms(seed_word, num=20))
    for seed_word in categories
}

In [27]:
# Pretty-print the category -> {candidate word: score} mapping for visual inspection.
pprint(all_categories)

{'bathroom': {'bathroom.': 0.522175820356142,
              'bathrooms': 0.5087266509978882,
              'bathtub': 0.5177911604143423,
              'cubicle': 0.5217659655658738,
              'ensuite': 0.5783865873193351,
              'fixture': 0.5384696203033031,
              'flooring': 0.5159286619522134,
              'grouting': 0.5377989416292231,
              'leaky': 0.4985537406377214,
              'restroom': 0.5182082354147269,
              'rusty': 0.5091987232634079,
              'shower': 0.5137119930417334,
              'showerhead': 0.546583679180073,
              'stall': 0.5118656945649197,
              'toilet': 0.5028246081334126,
              'toilette': 0.5177291923428092,
              'washbasin': 0.5583851326054067,
              'washroom': 0.6306056857556726,
              'wc': 0.6125714696608704,
              'wobbly': 0.5305512987867597},
 'breakfast': {'assortment': 0.3352828189113468,
               'beakfast': 0.4665915505150366,
     

In [28]:
# Start with one empty list per macro-category. Each key gets its own fresh
# list (never a shared one), to be filled with the words kept during review.
merged = dict((category_name, []) for category_name in categories)

### Cleaning the Word2Vec categories

In [30]:
# Manual curation of the Word2Vec candidates: for every macro-category, show
# each candidate word and ask the user whether to keep it. Kept words are
# collected in `merged[cat]`.
for cat, features in all_categories.items():
    print("macro categoria: ", cat)
    print(".........")
    # Make sure the target list exists even if `merged` was built from a
    # different category list than `all_categories`.
    kept = merged.setdefault(cat, [])
    for feature in features.keys():
        print(feature)
        answer = input('inserisci feature? y/n: ')
        # Accept "y" regardless of case or surrounding whitespace; any other
        # answer (including an empty one) means "do not keep".
        # Words already kept are skipped so that re-running this cell does not
        # append duplicates.
        if answer.strip().lower() == 'y' and feature not in kept:
            kept.append(feature)
            

    
    

macro categoria:  staff
.........
personnel
inserisci feature? y/n: y
doormen
inserisci feature? y/n: y
personel
inserisci feature? y/n: y
concierges
inserisci feature? y/n: y
approachable
inserisci feature? y/n: 
stafff
inserisci feature? y/n: y
respectful
inserisci feature? y/n: n
employees
inserisci feature? y/n: y
hosts
inserisci feature? y/n: y
team
inserisci feature? y/n: y
staffs
inserisci feature? y/n: y
receptionists
inserisci feature? y/n: y
stuff
inserisci feature? y/n: n
hospitable
inserisci feature? y/n: y
staf
inserisci feature? y/n: y
chatty
inserisci feature? y/n: n
personell
inserisci feature? y/n: y
porters
inserisci feature? y/n: y
conceirge
inserisci feature? y/n: y
crew
inserisci feature? y/n: y
macro categoria:  room
.........
washroom
inserisci feature? y/n: n
suite
inserisci feature? y/n: y
studio
inserisci feature? y/n: y
room.
inserisci feature? y/n: y
rooms
inserisci feature? y/n: y
awfull
inserisci feature? y/n: n
appartment
inserisci feature? y/n: y
apartme

In [31]:
# Display the manually curated mapping: category -> list of accepted words.
merged

{'bathroom': ['toilet',
  'washroom',
  'bathroom.',
  'washbasin',
  'fixture',
  'wc',
  'shower',
  'restroom',
  'showerhead',
  'toilette',
  'bathtub',
  'cubicle',
  'bathrooms',
  'ensuite'],
 'breakfast': ['bfast',
  'breakfest',
  'resteraunt',
  'assortment',
  'breakfeast',
  'breafast',
  'breakfast.',
  'food',
  'brekkie',
  'brekfast',
  'breakfasts',
  'breackfast',
  'beakfast',
  'brakfast'],
 'facilities': ['facilites',
  'facility',
  'supplies',
  'facilties',
  'equipment',
  'facility.',
  'equipments',
  'spa.',
  'ammenities',
  'facilities.',
  'amenities',
  'massages'],
 'food': ['wines',
  'grill',
  'food.',
  'meals',
  'menus',
  'sushi',
  'seafood',
  'menu',
  'foods',
  'breakfast',
  'resturant',
  'resteraunt',
  'steak',
  'dinner.',
  'bistro',
  'selections',
  'menu.',
  'presentation',
  'carvery',
  'brasserie'],
 'internet': ['signal',
  'speed',
  'wlan',
  'internet.',
  'network',
  '3g',
  'signal.',
  'wifi.',
  'connection.',
  '4g',


{'bathroom': ['toilet',
  'washroom',
  'bathroom.',
  'washbasin',
  'fixture',
  'wc',
  'shower',
  'restroom',
  'showerhead',
  'toilette',
  'bathtub',
  'cubicle',
  'bathrooms',
  'ensuite'],
 'breakfast': ['bfast',
  'breakfest',
  'resteraunt',
  'assortment',
  'breakfeast',
  'breafast',
  'breakfast.',
  'food',
  'brekkie',
  'brekfast',
  'breakfasts',
  'breackfast',
  'beakfast',
  'brakfast'],
 'facilities': ['facilites',
  'facility',
  'supplies',
  'facilties',
  'equipment',
  'facility.',
  'equipments',
  'spa.',
  'ammenities',
  'facilities.',
  'amenities',
  'massages'],
 'food': ['wines',
  'grill',
  'food.',
  'meals',
  'menus',
  'sushi',
  'seafood',
  'menu',
  'foods',
  'breakfast',
  'resturant',
  'resteraunt',
  'steak',
  'dinner.',
  'bistro',
  'selections',
  'menu.',
  'presentation',
  'carvery',
  'brasserie'],
 'internet': ['signal',
  'speed',
  'wlan',
  'internet.',
  'network',
  '3g',
  'signal.',
  'wifi.',
  'connection.',
  '4g',


### Word2Vec categories written to file