In [27]:
import pandas as pd

In [28]:
pd.set_option('display.max_colwidth', 500)

In [29]:
df = pd.read_csv('data/faq.csv', index_col=0)
df.head()

Unnamed: 0,question,answer,source,filename
0,Q. What is AWS AppSync?,"AWS AppSync is a new service that enables developers to manage and synchronize mobile app data in real time across devices and users, but still allows the data to be accessed and altered when the mobile device is in an offline state.",aws,appsync_faqs
1,Q. What application developer languages are supported in AWS AppSync?,"AWS AppSync SDKs support iOS, Android, and JavaScript. The JavaScript support spans web frameworks such as React and Angular as well as technologies such as React Native and Ionic. You can also use open source clients to connect to the AppSync GraphQL endpoint for using other platform such as generic HTTP libraries or even a simple CURL commands.",aws,appsync_faqs
2,Q. Can I make my data real-time with AWS AppSync?,"Yes. Subscriptions are supported with AWS AppSync against any of the data sources, so that when a mutation occurs, the results can be passed down to clients subscribing to the event stream immediately using either MQTT over WebSockets or pure WebSockets.",aws,appsync_faqs
3,Q. What AWS Regions are available for AWS AppSync?,"AWS AppSync is available in different regions around the globe, please refer to the AWS Regions table for more details.",aws,appsync_faqs
4,Q: What is AWS Cloud9?,"AWS Cloud9 is a cloud-based integrated development environment (IDE) that lets you write, run, and debug your code with just a browser. It combines the rich code editing features of an IDE such as code completion, hinting, and step-through debugging, with access to a full Linux server for running and storing code. For more information see our AWS Cloud9 User Guide.",aws,cloud9_faqs


In [30]:
df.source.value_counts()

aws             6793
ibm             1975
gcp              739
hetzner          271
azure            218
yandex_cloud     211
Name: source, dtype: int64

In [31]:
df['clean_question'] = (
    df.question
    .str.replace('Q.', '', regex=False)
    .str.replace('Q:', '', regex=False)
    .str.strip()
    .str.lower()
)

In [32]:
df.shape

(10207, 5)

In [33]:
df = df.drop_duplicates(subset=['clean_question'])
df.shape

(9106, 5)

In [34]:
df.source.value_counts()

aws             6167
ibm             1734
gcp              614
hetzner          270
azure            204
yandex_cloud     117
Name: source, dtype: int64

In [35]:
df['answer_lower'] = df.answer.str.lower().str.replace('a:', '').str.strip()
df = df.drop_duplicates(subset=['answer_lower'])
df.shape

(9017, 6)

In [36]:
df.source.value_counts()

aws             6116
ibm             1723
gcp              604
hetzner          262
azure            200
yandex_cloud     112
Name: source, dtype: int64

In [37]:
df.clean_question.apply(lambda x: len(x)).describe()

count    9017.000000
mean       60.910835
std        27.778333
min        11.000000
25%        42.000000
50%        56.000000
75%        74.000000
max       359.000000
Name: clean_question, dtype: float64

In [38]:
def splitter(x):
    split_str = x.split('.')
    split_str = [part for part in split_str if part]
    if not split_str:
        return ''
    
    if len(split_str) > 1:
        if (
            split_str[0].endswith('yandex') 
            or 
            split_str[0].endswith('aws')
            or
            split_str[0].endswith('azure')
            or
            split_str[0].endswith('hetzner')
            or
            split_str[0].endswith('gcp')
        ):
            return split_str[0] + split_str[1] 
        elif len(split_str[0]) < 20:
            return split_str[0] + ',' + split_str[1]
    
    return split_str[0]
    
    

In [39]:
df['answer_processed'] = df.answer_lower.apply(splitter)

In [40]:
# -> probably won't be understood by bert, as well as russian letters

In [41]:
df = df.drop_duplicates(subset=['answer_processed'])
df.shape

(8942, 7)

In [42]:
df.source.value_counts()

aws             6059
ibm             1706
gcp              604
hetzner          262
azure            200
yandex_cloud     111
Name: source, dtype: int64

In [43]:
df = df[df.answer_processed.apply(lambda x: ':' not in x)]

In [44]:
df.source.value_counts()

aws             5933
ibm             1565
gcp              589
hetzner          259
azure            192
yandex_cloud     109
Name: source, dtype: int64

In [45]:
df = df[(df.clean_question.str.len() + df.answer_processed.str.len()) <= 512]

In [46]:
df.source.value_counts()

aws             5923
ibm             1564
gcp              588
hetzner          259
azure            192
yandex_cloud     109
Name: source, dtype: int64

In [47]:
df.shape

(8635, 7)

In [48]:
df.source.value_counts()

aws             5923
ibm             1564
gcp              588
hetzner          259
azure            192
yandex_cloud     109
Name: source, dtype: int64

In [49]:
final_df = df[['source', 'filename', 'clean_question', 'answer_processed']].rename(columns={"clean_question": "question", "answer_processed": "answer"})
final_df.head()

Unnamed: 0,source,filename,question,answer
0,aws,appsync_faqs,what is aws appsync?,"aws appsync is a new service that enables developers to manage and synchronize mobile app data in real time across devices and users, but still allows the data to be accessed and altered when the mobile device is in an offline state"
1,aws,appsync_faqs,what application developer languages are supported in aws appsync?,"aws appsync sdks support ios, android, and javascript"
2,aws,appsync_faqs,can i make my data real-time with aws appsync?,"yes, subscriptions are supported with aws appsync against any of the data sources, so that when a mutation occurs, the results can be passed down to clients subscribing to the event stream immediately using either mqtt over websockets or pure websockets"
3,aws,appsync_faqs,what aws regions are available for aws appsync?,"aws appsync is available in different regions around the globe, please refer to the aws regions table for more details"
4,aws,cloud9_faqs,what is aws cloud9?,"aws cloud9 is a cloud-based integrated development environment (ide) that lets you write, run, and debug your code with just a browser"


In [50]:
final_df.to_csv('cloud_faq_dataset.csv')

In [51]:
final_df.to_json('cloud_faq_dataset.jsonl', index=True, orient="records", lines=True)