# Clean Comments

In [8]:
import pandas as pd
import spacy
import spacy_fastlang
from deep_translator import GoogleTranslator
import ast

In [9]:
# Load the combined Airbnb export (semicolon-separated); the CSV's
# "Unnamed: 0" column is used as the row index.
df = pd.read_csv(
    '../data/airbnb_rental_prices_combined.csv',
    sep=';',
    index_col="Unnamed: 0",
)

In [10]:
# Keep just the overall rating and the review texts of each listing.
df = df.loc[:, ['rating_overall', 'comments']]
df.head()

Unnamed: 0,rating_overall,comments
0,4.28,['Zum Übernachten optimal für eine Person.\nFü...
1,0.0,[]
6,4.67,['The house is perfectly connected to the cent...
8,0.0,[]
11,0.0,[]


In [11]:
# Show dtypes and non-null counts of the two retained columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 701 entries, 0 to 959
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rating_overall  701 non-null    float64
 1   comments        701 non-null    object 
dtypes: float64(1), object(1)
memory usage: 16.4+ KB


In [12]:
# (rows, columns) before listings without any review are filtered out.
df.shape

(701, 2)

In [13]:
# Drop listings that have no reviews. `comments` still holds the raw CSV text
# at this point, so an empty review list is the literal string '[]'.
# Only that one column is cast to str for the comparison, not the whole frame.
df = df[df["comments"].astype(str) != '[]']
df.shape

(563, 2)

In [14]:
# Make sure the small English spaCy pipeline is installed. Using spaCy's own
# Python API (instead of shelling out to whatever `python3` is first on PATH)
# targets the interpreter the kernel is running on, and the guard skips the
# download when the package is already present so re-running is cheap.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [15]:
# Each `comments` cell is the text form of a Python list: parse it back into a
# real list, then give every single review its own row (index label and rating
# are repeated for each review of a listing).
df = (
    df
    .assign(comments=df['comments'].apply(ast.literal_eval))
    .explode('comments')
)
df

Unnamed: 0,rating_overall,comments
0,4.28,Zum Übernachten optimal für eine Person.\nFür ...
0,4.28,Super Gastgeber!\nStudio für das Geld absolut ...
0,4.28,Gut gelegene und günstige Unterkunft mitten in...
0,4.28,"Für Kurzübernachtungen gut, sauber sowohl Zimm..."
0,4.28,naja
...,...,...
959,4.97,An exceptionally stylish and comfortable apart...
959,4.97,"Dear Nick and family, it was a pleasure to hos..."
959,4.97,"Lucas and Claudio are wonderful hosts, very at..."
959,4.97,"Dear Tina, it was a true pleasure to host you,..."


In [16]:
# Define the helper that translates a review to German.
# NOTE(review): removing the "Mehr anzeigen" marker was also announced for this
# step, but nothing in this cell does that yet.
nlp = spacy.load("en_core_web_sm")
# The "language_detector" component is presumably registered by the
# `spacy_fastlang` import at the top of the notebook, so that import is needed
# even though the name is never used directly.
nlp.add_pipe("language_detector")

# Source and target never change between calls, so one translator is enough.
_translator = GoogleTranslator(source='auto', target='de')


def translate_to_german(comment):
    """Return the review text in German.

    Parameters
    ----------
    comment : str
        A single review text.

    Returns
    -------
    str
        `comment` unchanged when the language detector labels it ``'de'``;
        otherwise whatever the Google translator returns for it. Values that
        are not a non-blank string (e.g. missing values) are handed back
        untouched instead of being sent to spaCy or the translator.
    """
    # Nothing to detect or translate for missing / blank values.
    if not isinstance(comment, str) or not comment.strip():
        return comment

    if nlp(comment)._.language == 'de':
        return comment
    return _translator.translate(comment)



In [17]:
# Run every review through the translator; reviews detected as German come
# back unchanged.
# NOTE(review): `df` is rebound to the translated `comments` Series here, so
# `rating_overall` is not carried forward from this point — confirm that this
# is intended.
df = df.loc[:, 'comments'].apply(translate_to_german)
df

0      Zum Übernachten optimal für eine Person.\nFür ...
0      Super Gastgeber!\nStudio für das Geld absolut ...
0      Gut gelegene und günstige Unterkunft mitten in...
0      Für Kurzübernachtungen gut, sauber sowohl Zimm...
0                                                   naja
                             ...                        
959    Eine außergewöhnlich stilvolle und komfortable...
959    Lieber Nick und Familie, es war mir eine Freud...
959    Lucas und Claudio sind wundervolle Gastgeber, ...
959    Liebe Tina, es war mir eine wahre Freude, Sie ...
959    Ich habe meinen Aufenthalt in Lucas‘ Anwesen s...
Name: comments, Length: 4590, dtype: object

In [19]:
# Write the translated comments as a semicolon-separated CSV.
# NOTE(review): the target is a hard-coded absolute path in the `jovyan` home
# directory — consider a configurable, repo-relative output location.
df.to_csv(
    '/home/jovyan/raw_comments.csv',
    sep=';',
)