# Homework 4 - Group #24
## 1) Does basic house information reflect house's description?

Our goal is to implement two clusterings and compare their results. We create two datasets, each of them filled with the data that we scraped.

First of all, we import the following libraries.

In [1]:
import time # For time.sleep() method
import pandas as pd
import numpy as np
import requests 
from urllib.request import urlopen
from bs4 import BeautifulSoup

import nltk # To remove stopwords
from nltk.corpus import stopwords
import string # To remove punctuation
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

import io
import json

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robertapassarelli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Scraping
To create the dataset we have to scrape the website of [Immobiliare.it](https://www.immobiliare.it) using the **Beautiful Soup** library.

In [2]:
# Scrape the Rome sale listings: for every result page, follow each link to an
# announcement page and collect its price, basic features and description.
print("Start : %s" % time.ctime())
numb_pag = 1  # number of result pages to visit
columns = ['Price', 'Locali', 'Superficie', 'Bagni', 'Piano', 'Descrizione']
rows = []  # one list of raw strings per successfully parsed announcement

for i in range(1, numb_pag + 1):
    url = "https://www.immobiliare.it/vendita-case/roma/?criterio=rilevanza&pag=" + str(i)
    html = urlopen(url)
    time.sleep(3)  # to prevent the website block
    soup = BeautifulSoup(html, 'lxml')

    # Every non-empty href found on the result page.
    links = [a.get('href') for a in soup.find_all('a') if a.get('href') is not None]

    for item in links:
        # Only absolute https links ending in "html" are followed.
        if not (item.endswith('html') and item.startswith('https')):
            continue

        site = urlopen(item)
        soup = BeautifulSoup(site, 'lxml')
        data = soup.find_all("ul", {'class': "list-inline list-piped features__list"})
        price = soup.find_all("ul", {'class': "list-inline features__price-block"})
        description = soup.find_all("div", {'class': "col-xs-12 description-text text-compressed"})
        try:
            pr = price[0].contents[0].get_text().replace("€", "")
            loc = data[0].contents[0].get_text().replace("\xa0", "").replace("+", "").replace("locali", "").strip()
            mq = data[0].contents[1].get_text().replace("da ", "").replace("\xa0m2", "").replace("superficie", "").replace("m2", "").strip()
            bath = data[0].contents[2].get_text().replace("\xa0", "").replace("+", "").replace("bagni", "").strip()
            # we replace the floor `T` (piano Terra) with the number zero
            floor = data[0].contents[3].get_text().replace("\xa0", "").replace("\n", "").replace("piano", "").replace("T", "0").strip()
            descr = description[0].get_text()
        except (IndexError, AttributeError):
            # The page lacks one of the expected blocks or fields: skip it.
            # Any other kind of error is no longer silently swallowed.
            continue
        rows.append([pr, loc, mq, bath, floor, descr])

# Build the frame once instead of concatenating a one-row frame per page.
df1 = pd.DataFrame(rows, columns=columns)
print("End : %s" % time.ctime())

Start : Wed Dec  5 21:55:33 2018
End : Wed Dec  5 21:56:19 2018


We clean the data:
- replace `\n` character in the announcement's description with an empty space
- drop all the announcements that don't have an integer for the `floor`
- remove the thousands separator (`.`) from the price

In [3]:
# Keep only the announcements whose floor consists solely of digits.
# .copy() makes the result an independent frame, so the column assignments
# below do not operate on a slice of the scraped data.
df1 = df1[df1['Piano'].apply(lambda x: str(x).isdigit())].copy()
# Replace newlines in the description with a space.
df1['Descrizione'] = df1['Descrizione'].str.replace(r'\n', ' ', regex=True)

# Strip surrounding whitespace, then drop the "." separator from the price.
df1['Price'] = df1['Price'].astype(str).str.strip()
# regex=False: "." must be a literal dot; as a regular expression it would
# match (and delete) every character.
df1['Price'] = df1['Price'].str.replace('.', '', regex=False)

In [4]:
# Preview the first rows of the cleaned dataset.
df1.head()

Unnamed: 0,Price,Locali,Superficie,Bagni,Piano,Descrizione
0,225000,2,50,1,1,PAPILLO EUR ...
1,300000,2,46,1,4,Vendesi appa...
2,574000,4,89,2,5,Vendesi appa...
3,500000,3,89,2,3,Vendesi appa...
4,425000,3,72,2,4,Vendesi appa...


Let's save the description in a `.txt` file.

In [5]:
# Persist the announcement descriptions as a JSON array.
description = list(df1['Descrizione'])
with open('data/description.txt', 'w') as out_file:
    json.dump(description, out_file)

#### 1) Information
The first matrix `matrix1` is $m_{ij} = value$, where $i \in \{announcement_1, ..., announcement_n\}$ ($n$ is the number of announcements) and $j \in \{price, locali, superficie, bagni, piano \}$.

In [6]:
# Information matrix: one row per announcement, one column per basic feature.
# The values are still the strings produced by the scraping step.
feature_columns = ['Price', 'Locali', 'Superficie', 'Bagni', 'Piano']
matrix1 = df1[feature_columns]
matrix1.head()

Unnamed: 0,Price,Locali,Superficie,Bagni,Piano
0,225000,2,50,1,1
1,300000,2,46,1,4
2,574000,4,89,2,5
3,500000,3,89,2,3
4,425000,3,72,2,4


#### 2) Description
The second matrix `matrix2` is $m_{ij} = \text{tfIdf}_{ij}$, where $i \in \{announcement_1, ..., announcement_n\}$ and $j \in \{word_1, ..., word_m\}$, with $n$ the number of announcements and $m$ the cardinality of the vocabulary.

In [7]:
# Reload the descriptions saved in the JSON `.txt` file.
with open('data/description.txt') as saved:
    description = json.loads(saved.read())

We clean the descriptions:
- remove stopwords 
- remove punctuation
- stemming

Define a dictionary `clean_descr` with the structure:
- key: index of the announcement in `df1`
- value: list of the cleaned (stemmed) words of its description

In [8]:
stop_words = set(stopwords.words('italian'))  # set of stopwords
stemmer = nltk.stem.snowball.ItalianStemmer()  # italian stemmer
punctuation = set(string.punctuation)  # set of punctuation

# Translation table turning every punctuation character into a space, so that
# "casa," becomes "casa" and "l'appartamento" becomes "l appartamento".
punctuation_to_space = str.maketrans({ch: ' ' for ch in punctuation})

# clean_descr: announcement index -> list of cleaned, stemmed words
clean_descr = {}

for i, row in df1.iterrows():
    # Lowercase and strip punctuation BEFORE tokenising: testing isalpha() on
    # the raw tokens would discard every word that touches a punctuation mark.
    text = row.Descrizione.lower().translate(punctuation_to_space)
    # Keep alphabetic, non-stopword tokens and stem them.
    clean_descr[i] = [stemmer.stem(w) for w in text.split()
                      if w.isalpha() and w not in stop_words]

Create `vocabulary` as a dictionary that maps each word contained in the documents to an integer, with the structure:
- key: word
- value: number

In [9]:
# vocabulary: word -> integer id, assigned in order of first appearance.
vocabulary = {}

# Iterate the documents directly; indexing list(clean_descr.values()) for each
# document would rebuild that list on every iteration.
for words in clean_descr.values():
    for word in words:
        if word not in vocabulary:
            # The next free id equals the number of words seen so far.
            vocabulary[word] = len(vocabulary)

In [10]:
# Save the word -> id mapping as JSON.
with open('data/vocabulary.txt', 'w') as out_file:
    json.dump(vocabulary, out_file)

Now we can easily build the `inverted index`, a dictionary with:
- key: the integer that corresponds to a word in the vocabulary
- value: list of the documents that contain the word corresponding to the key.

In [11]:
# Inverted index: term id -> list of the announcements containing the term.
inverted_d = {}

for doc_id, words in clean_descr.items():
    # Each distinct word of a document registers that document exactly once.
    for term in sorted(set(words)):
        inverted_d.setdefault(vocabulary[term], []).append(doc_id)

In [12]:
# Save the inverted index as JSON.
with open('data/inverted_d.txt', 'w') as out_file:
    json.dump(inverted_d, out_file)

We need to calculate the **TF-IDF** of all the words, which is defined as the "term frequency" times the "inverse document frequency", where:
- "term frequency" is the ratio between the number of occurrences of the term in the document and the total number of words in the document;
- "inverse document frequency" is the logarithm of the ratio between the total number of documents and the number of documents containing the term (plus 1 to avoid division by zero).

IDF is independent of the specific document, thus we can calculate it once and use it whenever we need it. Each term has a single IDF.

In [None]:
# NOTE: untested from here on! It comes from the previous homework, some
# changes may still be needed.
# IDF of every term: log(n_doc / (1 + number of documents with the term)).
idf = {}
n_doc = len(clean_descr)

for term_id, doc in inverted_d.items():
    # Use NumPy directly: the `pd.np` alias is deprecated/removed in pandas.
    idf[term_id] = np.log(n_doc / (1 + len(doc)))

# Inverted index with scores: term id -> [(announcement, tf-idf), ...]
tfidf_inverted_d = {}  # empty dict

for key, value in clean_descr.items():
    n_words = len(value)  # total number of words in this document
    for i in sorted(set(value)):
        term_id = vocabulary[i]
        # term frequency (relative to the document length) times IDF
        score = (value.count(i) / n_words) * idf[term_id]
        tfidf_inverted_d.setdefault(term_id, []).append((key, score))



In [None]:
# THIS WAS YOUR FUNCTION

#functions defined in order to calculate the tfIdf coefficient
def N(df1):
    """Return the number of documents in the collection ``df1``."""
    n_documents = len(df1)
    return n_documents

def df_t(word, df1):
    """Return how many documents of the collection ``df1`` contain ``word``.

    Each element of ``df1`` is checked with ``word in document``.
    """
    return sum(1 for document in df1 if word in document)

def tf_t_d(word, lista):
    """Return the term frequency: occurrences of ``word`` in document ``lista``."""
    occurrences = lista.count(word)
    return occurrences

def id_f_t(word, df1):
    """Return the base-10 inverse document frequency of ``word``.

    Computed as ``log10(N / df)``, where ``N`` is the number of documents in
    ``df1`` and ``df`` is the number of those documents that contain ``word``
    (as counted by ``df_t``).

    Raises:
        ZeroDivisionError: if no document in ``df1`` contains ``word``.
    """
    # Imported locally because the file never brings a `log` name into scope.
    import math

    return math.log10(len(df1) / df_t(word, df1))

def tf_idf(word, lista, df1):
    """Return the TF-IDF weight of ``word`` for document ``lista`` in ``df1``.

    The weight is ``tf_t_d(word, lista)`` multiplied by ``id_f_t(word, df1)``.
    """
    term_frequency = tf_t_d(word, lista)
    inverse_document_frequency = id_f_t(word, df1)
    return term_frequency * inverse_document_frequency

def inverted_index_creation_tfIdf(vocabulary, df1, path='data/inverted_index_tfIdf.tsv'):
    """Build the TF-IDF inverted index of a collection and save it as JSON.

    Args:
        vocabulary: mapping from each word to its term id; the id is
            converted with ``int`` before being used as a key.
        df1: collection of documents, each one a list of words. It is
            iterated here and passed again to ``tf_idf`` for every word.
        path: file the index is written to (JSON content).

    Returns:
        dict mapping each term id to a list of
        ``("announcement_<n>", tf-idf)`` tuples, where ``n`` is the 1-based
        position of the document in ``df1``.
    """
    inverted_index = {}

    for n, lista in enumerate(df1, start=1):
        label = "announcement_" + str(n)
        # Each distinct word of the document adds one (document, score) entry
        # under its term id; the entry list is created on first use.
        for word in set(lista):
            # Normalise the id to int in every case so lookups and insertions
            # always use the same key.
            term_id = int(vocabulary[word])
            inverted_index.setdefault(term_id, []).append(
                (label, tf_idf(word, lista, df1)))

    # The `with` block closes the file; no explicit close() is needed.
    with open(path, 'w') as f1:
        json.dump(inverted_index, f1)
    return inverted_index 

