In [114]:
import pandas as pd
import random 
from deep_translator import GoogleTranslator
from unidecode import unidecode

In [2]:
df = pd.read_csv('calls.csv')
df.head()

Unnamed: 0,id,occasion,city,act,genre,budget
0,17028,Bruiloft,Geel,Solo,Akoestisch,300
1,17029,Kroegavond,Voorst,Band,Rock,450
2,17030,kroeg,Voorst,Band,Rock,500
3,17031,Mariage,Lyon,Solo,Hardstyle,10
4,17031,Mariage,Lyon,Band,Hardstyle,10


In [89]:
single_act = df['act'][:200]
single_act = pd.DataFrame(single_act)
single_act

Unnamed: 0,act
0,Solo
1,Band
2,Band
3,Solo
4,Band
...,...
195,Ensemble (e.g. duo or trio)
196,Solo
197,Band
198,DJ


In [92]:
single_genre = df['genre'][:3000]
single_genre = pd.DataFrame(single_genre)
single_genre

Unnamed: 0,genre
0,Akoestisch
1,Rock
2,Rock
3,Hardstyle
4,Hardstyle
...,...
2995,Rock 'n Roll
2996,Rock 'n Roll
2997,Latin
2998,Latin


In [3]:
df.count()

id          35440
occasion    26203
city        35440
act         26396
genre       26061
budget      35440
dtype: int64

In [4]:
distinct_count = df['id'].nunique()
distinct_count

12004

In [5]:
df['act'] = df['act'].fillna('')
df['genre'] = df['genre'].fillna('')
df['occasion'] = df['occasion'].fillna('')

In [6]:
aggregated_df = df.groupby('id').agg({
    'occasion': 'first',
    'city': 'first',
    'act': lambda x: ', '.join(set(x)),
    'genre': lambda x: ', '.join(set(x)),
    'budget': 'first'
}).reset_index()

aggregated_df

Unnamed: 0,id,occasion,city,act,genre,budget
0,1,,Eisden (dichtbij Maastricht),,,800 Euro (max)
1,2,,Rotterdam,,,bespreekbaar
2,5,,rotterdam-zuid,,,reiskosten
3,6,,HARLINGEN,,,150-300
4,7,,Amsterdam,,,Ik kan (veel) exposure bieden. Het youtube cha...
...,...,...,...,...,...,...
11999,20436,Bruiloft,Heemstede,Solo,Easy Listening,300
12000,20437,Koningsnacht,Krimpenerwaard,DJ,"House, Nu-Disco, Allround, Deep house, Dance, ...",500
12001,20438,Geburtstag,Neuffen,"Ensemble (e.g. duo or trio), Band, Solo","Komedie, Rock, Rock 'n Roll, Rap, Grunge, Bras...",750
12002,20441,Borrel,Hillegom,Solo,60s,20


### English budget

In [94]:
eng_budget = aggregated_df[~aggregated_df['budget'].str.contains(r'\d')]
eng_budget = eng_budget[['budget']]
eng_budget

Unnamed: 0,budget
1,bespreekbaar
2,reiskosten
14,?
16,Te bespreken
24,zie omschrijving
...,...
8952,In overleg .
8959,?
8967,indefinido
8969,Depende de los días


In [95]:
def translate_to_english(text):
    return GoogleTranslator(source='auto', target='en').translate(text)
eng_budget['budget'].head(10).apply(translate_to_english)

1                                            negotiable
2                                       travel expenses
14                                                 None
16                                           To discuss
24                                      See Description
33                                   Small compensation
34                                  a bottle of wine!!!
35                                      in consultation
37                                     still unknown...
44    It is non-profit, so it would be nice if it co...
Name: budget, dtype: object

In [96]:
# Function to translate text from different languages to English
def translate_to_english(text):
    return GoogleTranslator(source='auto', target='en').translate(text)

# Apply translation to the 'budget' column
eng_budget['budget'] = eng_budget['budget'].apply(translate_to_english)
eng_budget

Unnamed: 0,budget
1,negotiable
2,travel expenses
14,
16,To discuss
24,See Description
...,...
8952,In consultation .
8959,
8967,Undefined
8969,It depends on the days


In [18]:
occasion = aggregated_df['occasion'][aggregated_df['occasion'] != '']
occasion = pd.DataFrame(occasion)
city = aggregated_df['city'][aggregated_df['city'] != '']
city = pd.DataFrame(city)
act = aggregated_df['act'][aggregated_df['act'] != '']
act = pd.DataFrame(act)
genre = aggregated_df['genre'][aggregated_df['genre'] != '']
genre = pd.DataFrame(genre)
budget = aggregated_df['budget'][aggregated_df['budget'] != '']
budget = pd.DataFrame(budget)
budget

Unnamed: 0,budget
0,800 Euro (max)
1,bespreekbaar
2,reiskosten
3,150-300
4,Ik kan (veel) exposure bieden. Het youtube cha...
...,...
11999,300
12000,500
12001,750
12002,20


### English occasion

In [97]:
eng_occasion = aggregated_df[['occasion']]
eng_occasion = eng_occasion['occasion'][eng_occasion['occasion'] != '']
eng_occasion = pd.DataFrame(eng_occasion)
eng_occasion

Unnamed: 0,occasion
588,Privé tuinfeest
7833,Fiestas de Santigo Apostol ( Santiaguiño)
9159,Bruiloft
9160,Kroegavond
9161,kroeg
...,...
11999,Bruiloft
12000,Koningsnacht
12001,Geburtstag
12002,Borrel


In [99]:
eng_occasion['occasion'] = eng_occasion['occasion'].apply(translate_to_english)
eng_occasion

Unnamed: 0,occasion
588,Private garden party
7833,Festivities of Santigo Apostol (Santiaguiño)
9159,Wedding
9160,Bar night
9161,bar
...,...
11999,Wedding
12000,King's night
12001,Birthday
12002,A drink


In [100]:
ans_occasion = ['Cool! Could you tell me where is it?', 
           'Sounds nice! Where is it?', 
           'Exciting! Could you tell me where will it be happening?', 
           'That sounds great! Can you share the location of the event?' ]

ans_city = ['I like this area. Are you looking for a DJ, band, ensemble, or solo artist?',
            'Exctiting! Are you looking for a DJ, band, ensemble or solo artist?',
            'Nice! Are you looking for a DJ, band, ensemble or solo artist?',
            'Sounds promising. Are you looking for a DJ, band, ensemble or solo artist?']

ans_act = ['Good choice. Which music genre are you interested in?',
           'Great. Which music genre are you interested in?',
           'Okay. Which music genre are you interested in?',
           'Sure. Do you have a preference for the music genre?']

ans_genre = ['Okay. And the last question is how much would you like to pay?',
             'Okay! And the final question is how much would you like to pay?',
             'Nice. How much would you like to pay?',
             'Sounds good! How much would you like to pay?']

ans_budget = ['Perfect! Based on your preferences, I\'ll suggest you some artists. Thank you for choosing Gigstarter!',
              'Noted. I\'ll show you some artists that fit your criteria',
              'Sure thing! I\'ll show you some artists that fit your criteria.',
              'Understood. I\'ll show you some artists that fit your criteria.']

In [101]:
eng_budget['Answer'] = [random.choice(ans_budget) for _ in range(len(eng_budget))]
eng_budget.rename(columns={'budget': 'Question'}, inplace=True)

eng_occasion['Answer'] = [random.choice(ans_occasion) for _ in range(len(eng_occasion))]
eng_occasion.rename(columns={'occasion': 'Question'}, inplace=True)

single_act['Answer'] = [random.choice(ans_act) for _ in range(len(single_act))]
single_act.rename(columns={'act': 'Question'}, inplace=True)

single_genre['Answer'] = [random.choice(ans_genre) for _ in range(len(single_genre))]
single_genre.rename(columns={'genre': 'Question'}, inplace=True)

In [102]:
occasion['Answer'] = [random.choice(ans_occasion) for _ in range(len(occasion))]
occasion.rename(columns={'occasion': 'Question'}, inplace=True)
city['Answer'] = [random.choice(ans_city) for _ in range(len(city))]
city.rename(columns={'city': 'Question'}, inplace=True)
act['Answer'] = [random.choice(ans_act) for _ in range(len(act))]
act.rename(columns={'act': 'Question'}, inplace=True)
genre['Answer'] = [random.choice(ans_genre) for _ in range(len(genre))]
genre.rename(columns={'genre': 'Question'}, inplace=True)
budget['Answer'] = [random.choice(ans_budget) for _ in range(len(budget))]
budget.rename(columns={'budget': 'Question'}, inplace=True)

In [103]:
budget

Unnamed: 0,Question,Answer
0,800 Euro (max),Understood. I'll show you some artists that fi...
1,bespreekbaar,Noted. I'll show you some artists that fit you...
2,reiskosten,"Perfect! Based on your preferences, I'll sugge..."
3,150-300,Noted. I'll show you some artists that fit you...
4,Ik kan (veel) exposure bieden. Het youtube cha...,"Perfect! Based on your preferences, I'll sugge..."
...,...,...
11999,300,Sure thing! I'll show you some artists that fi...
12000,500,Understood. I'll show you some artists that fi...
12001,750,"Perfect! Based on your preferences, I'll sugge..."
12002,20,Understood. I'll show you some artists that fi...


In [104]:
combined_df = pd.concat([occasion, city, act, genre, budget, eng_budget, eng_occasion, single_act, single_genre], ignore_index=True)
combined_df

Unnamed: 0,Question,Answer
0,Privé tuinfeest,Sounds nice! Where is it?
1,Fiestas de Santigo Apostol ( Santiaguiño),Exciting! Could you tell me where will it be h...
2,Bruiloft,Cool! Could you tell me where is it?
3,Kroegavond,Sounds nice! Where is it?
4,kroeg,Exciting! Could you tell me where will it be h...
...,...,...
40330,Rock 'n Roll,Nice. How much would you like to pay?
40331,Rock 'n Roll,Sounds good! How much would you like to pay?
40332,Latin,Sounds good! How much would you like to pay?
40333,Latin,Okay. And the last question is how much would ...


In [105]:
qa = pd.read_csv('QA.csv')
qa.head()

Unnamed: 0,Question,Answer
0,its a birthday,Sounds nice! Where is it?
1,It will be hosted at the local park in den Hague.,"I like this area. Are you looking for a DJ, ba..."
2,I'm looking for a solo artist.,Good choice. Which music genre are you interes...
3,I'm interested in classical music.,Nice. How much would you like to pay?
4,I have a budget of $800.,Noted. Check the website. Here are some artist...


In [106]:
qa_full = pd.concat([qa, combined_df], ignore_index=True)
qa_full.head()

Unnamed: 0,Question,Answer
0,its a birthday,Sounds nice! Where is it?
1,It will be hosted at the local park in den Hague.,"I like this area. Are you looking for a DJ, ba..."
2,I'm looking for a solo artist.,Good choice. Which music genre are you interes...
3,I'm interested in classical music.,Nice. How much would you like to pay?
4,I have a budget of $800.,Noted. Check the website. Here are some artist...


In [115]:
def normalize_column(column):
    return column.apply(lambda x: unidecode(str(x)))

# Apply the normalization function to all columns in the DataFrame
qa_full = qa_full.apply(normalize_column)

In [116]:
qa_full.to_csv('data/' + 'QA_full.csv', index=False, encoding='utf-8')