# Import Libraries and data

In [None]:
import pandas as pd
from pathlib import Path
import pickle
from wordcloud import WordCloud as wc
import matplotlib.pyplot as plt

In [None]:
# Directory holding the competition input files.
path = Path('/kaggle/input/google-quest-challenge')
# List the directory entries so the available files are shown as cell output.
list(path.iterdir())

# View data

In [None]:
# Load the training set; index_col=[0] uses the first CSV column as the row index.
data = pd.read_csv(path/'train.csv', index_col=[0])
# Preview the first few rows.
data.head()

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
data.info()

# Categorical columns

In [None]:
# Bar chart of how many rows belong to each category.
data['category'].value_counts().plot(kind='bar', title='Category')

In [None]:
# Bar chart of how many rows come from each host.
data['host'].value_counts().plot(kind='bar', title='Host')

# Columns with text data

In [None]:
# Names of the free-text columns.
text_cols = ['question_title', 'question_body', 'answer']
text_cols

# Columns with data for tabular model

In [None]:
# Names of the categorical columns intended for the tabular model.
tab_cols = ['category', 'host']
tab_cols

# Columns with dependent values (targets)

In [None]:
# Every column from position 10 onwards is treated as a target.
# NOTE(review): this positional slice assumes the first 10 columns (after the
# index column is removed) are all non-target columns - confirm against the CSV.
targets = list(data.columns[10:])
targets

# Unneeded data (Maybe?)

In [None]:
# Columns that are used as text input, tabular input or targets.
needed_cols = {*text_cols, *tab_cols, *targets}
all_cols = set(data.columns)
# Whatever remains is not referenced by any of the three groups above.
unneeded_cols = all_cols.difference(needed_cols)
unneeded_cols

# Visualizing correlation between targets

In [None]:
# Pairwise Spearman (rank) correlation between all target columns.
corr = data[targets].corr(method='spearman')

In [None]:
# Render the correlation matrix as a heat map with two decimal places.
# `Styler.set_precision` was deprecated in pandas 1.3 and removed in 2.0, so
# the number format is applied through `Styler.format`, which works on both
# old and new pandas versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

# Save variables for later use

In [None]:
# Persist the column-name lists so later notebooks can reload them.
for pkl_name, col_list in (
    ('target.pkl', targets),
    ('tab_cols.pkl', tab_cols),
    ('text_cols.pkl', text_cols),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(col_list, f)

# Visualize text

## Question text

In [None]:
# Build one long string containing every question for the word cloud.
# Title and body are joined with a space so the last word of a title is not
# fused with the first word of its body. Missing values are replaced by empty
# strings so the final join only ever receives strings.
q_text = ' '.join(
    (data['question_title'].fillna('') + ' ' + data['question_body'].fillna('')).tolist()
)

In [None]:
# Generate a word cloud from the question text using WordCloud's default settings.
wcloud = wc().generate(q_text)

In [None]:
# Draw the question word cloud as an image on a new figure.
plt.figure()
plt.imshow(wcloud)
# Hide the axes; they carry no meaning for an image.
plt.axis("off")
plt.show()

## Answer text

In [None]:
# Concatenate every answer into one space-separated string for the word cloud.
a_text = ' '.join(data['answer'])

In [None]:
# Generate a word cloud from the answer text using WordCloud's default settings.
wcloud = wc().generate(a_text)

In [None]:
# Draw the answer word cloud as an image on a new figure.
plt.figure()
plt.imshow(wcloud)
# Hide the axes; they carry no meaning for an image.
plt.axis("off")
plt.show()