In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from wordcloud import WordCloud 
import gc 
from typing import Dict, List, Tuple, Any

import re 
import nltk 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE 
from sklearn.cluster import KMeans 

# Remove the column limit so displayed DataFrames show every column.
pd.set_option("display.max_columns", None)

In [None]:
# Read only the first data row of the survey CSV and discard its first column.
# NOTE(review): that row appears to hold the question text for each column,
# and the dropped column is presumably not a question — confirm against the CSV.
survey_csv = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
df = pd.read_csv(survey_csv, nrows=1).iloc[:, 1:]
df.head()

Several columns belong to the same question, so combine them into one row per question and create a data frame that counts how many columns each question has.

In [None]:
def create_dateframe(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the survey columns into one row per question.

    Columns are grouped by the part of their name that lies between the first
    character and the first underscore (``"Q7_Part_1"`` -> ``"7"``).

    Args:
        df: Frame whose first row holds the text associated with each column.

    Returns:
        A DataFrame indexed by ``"Q<number>"`` with two columns:
        ``question`` (first-row value of the first column in each group) and
        ``count`` (number of columns in the group). Rows appear in the order
        in which each group is first seen in ``df.columns``.

    Raises:
        IndexError: If ``df`` has columns but no rows.
    """
    question2sentence: Dict[str, Any] = {}
    question2count: Dict[str, int] = {}

    # A single pass collects both the representative text and the column count.
    for col in df.columns:
        number = col.split("_")[0][1:]
        if number not in question2sentence:
            # Only the first column of a group supplies the question text.
            question2sentence[number] = df[col].values[0]
        question2count[number] = question2count.get(number, 0) + 1

    numbers = list(question2sentence)
    # Derive the index from the numbers actually found instead of assuming
    # exactly 42 questions in ascending order; for that layout the labels are
    # the same as before.
    return pd.DataFrame(
        {
            "question": [question2sentence[n] for n in numbers],
            "count": [question2count[n] for n in numbers],
        },
        index=["Q" + n for n in numbers],
    )
            

In [None]:
# Build the per-question summary (question text and number of columns) and preview it.
q = create_dateframe(df)
q.head()

Check if there is a correlation between the number of choices and the length of the question.

In [None]:
# 2x2 overview: histogram and box plot of the per-question column count,
# histogram of the question-text length, and a scatter of count vs. length.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
ax = axes.ravel()

# Length in characters of each question text. It is kept in a local Series
# (named "len" so the axis labels stay the same) instead of being added to and
# then dropped from q, so q is never modified by this cell.
question_len = q["question"].str.len().rename("len")

sns.histplot(q["count"], ax=ax[0])
# Pass the values explicitly as x=: handing them over positionally is
# deprecated in seaborn 0.11 and interpreted differently in later releases.
sns.boxplot(x=q["count"], ax=ax[1])
sns.histplot(question_len, ax=ax[2])
sns.scatterplot(x=q["count"], y=question_len, ax=ax[3])

plt.show()

### Observation...
* As the raw data shows, most of the questions are single-choice questions.  
* Overall, the number of choices tends to increase with the length of the question text.

In [None]:
%%time 

# Download the NLTK "stopwords" corpus and load its English word list.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Matches every run of characters that are not ASCII letters or digits.
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def cln_txt(doc):
    """Normalise a text for word counting.

    The text is lower-cased and split on whitespace; every character that is
    not an ASCII letter or digit is removed from each token, and tokens that
    are stop words (per the module-level ``stop_words``) or end up empty are
    dropped.

    Args:
        doc: Raw text as a string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in doc.lower().split():
        # Check the raw token first so that a stop-word entry containing
        # punctuation (e.g. an apostrophe), if the list has any, still matches
        # before the stripping below would alter it.
        if token in stop_words:
            continue
        token = non_alphanums.sub("", token)
        # Check again after stripping: tokens such as "the," or "(or" only
        # become recognisable stop words once the punctuation is gone.
        if token == "" or token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

# Overwrite each question text with the output of cln_txt.
q["question"] = q["question"].apply(cln_txt)


In [None]:
def create_word(
    df: pd.Series,
    ignore: Tuple[str, ...] = ("select", "selected", "use", "apply", "choice"),
) -> Dict[str, int]:
    """Count how often each word occurs across a collection of documents.

    Args:
        df: Iterable of whitespace-separated documents, e.g. a Series of
            strings. (Iterating a DataFrame yields its column labels, so pass
            the text column rather than the whole frame.)
        ignore: Words excluded from the result. Defaults to the boilerplate
            words this function has always skipped; pass ``()`` to count
            every word.

    Returns:
        Mapping from word to its number of occurrences, in order of first
        appearance.
    """
    # Build the lookup once instead of re-creating a list for every word.
    skipped = frozenset(ignore)
    word2count: Dict[str, int] = {}
    for doc in df:
        for word in doc.split():
            if word in skipped:
                continue
            word2count[word] = word2count.get(word, 0) + 1
    return word2count

# Word cloud (limited to 20 words) built from the word counts of all questions.
word_freq = create_word(q["question"])
cloud_maker = WordCloud(background_color="white", max_words=20)
word = cloud_maker.generate_from_frequencies(word_freq)

cloud_fig, cloud_ax = plt.subplots(figsize=(15, 6))
cloud_ax.imshow(word)
cloud_ax.axis("off")
plt.show()

In [None]:
# Bucket the questions into 4 equal-width bins of their column count;
# labels=False stores the integer bin number instead of an interval.
q["count_rank"] = pd.cut(q["count"], bins=4, labels=False)

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
ax = axes.ravel()

# unique() returns ranks in order of first appearance; sort them so the panels
# run from the lowest bin to the highest.
ranks = sorted(q["count_rank"].unique())
for i, rank in enumerate(ranks):
    docs = q.loc[q["count_rank"] == rank, "question"]
    word = WordCloud(background_color="white", max_words=20).generate_from_frequencies(create_word(docs))
    ax[i].imshow(word)
    ax[i].set_xticks([])
    ax[i].set_yticks([])
    ax[i].set_title(f"{rank}", c="g")

# Equal-width bins can be empty; hide any panel that received no rank instead
# of leaving a blank framed axes in the figure.
for unused in ax[len(ranks):]:
    unused.axis("off")

plt.tight_layout()
plt.show()
    

### Observation...
* You can see words related to data analysis, especially in compact sentences. 
* You can see the words "cloud" and "service" in sentences with many choices.

In [None]:
# Vectorise: one TF-IDF row per cleaned question text.
tfidf = TfidfVectorizer()
v = tfidf.fit_transform(q["question"]).toarray()
# PCA: keep up to 40 components. PCA cannot extract more components than
# min(n_samples, n_features), so clamp the request for smaller inputs.
decompose = PCA(n_components=min(40, *v.shape), random_state=42)
v = decompose.fit_transform(v)
# Cluster the reduced vectors into 3 groups. n_init is pinned to 10 (the
# historical default) so the result does not change with the scikit-learn
# version and no default-change warning is emitted.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
y_km = km.fit_predict(v)
# Visualise the first two principal components. The labels are passed as
# strings so plotly assigns one discrete colour per cluster instead of mapping
# the integer labels onto a continuous colour scale.
fig = px.scatter(x=v[:, 0], y=v[:, 1], title="decompose question by PCA", color=y_km.astype(str))
fig.show()

In [None]:
# Store each question's k-means label, then draw one word cloud per cluster,
# titled with the cluster id and the mean column count of its questions.
q["cluster"] = y_km

fig, axes = plt.subplots(1, 3, figsize=(22, 12))
ax = axes.ravel()

for cluster, panel in zip(range(3), ax):
    members = q.loc[q["cluster"] == cluster]
    cloud = WordCloud(background_color="white", max_words=20)
    word = cloud.generate_from_frequencies(create_word(members["question"]))
    count = members["count"].mean()
    panel.imshow(word)
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_title(f"cluster={cluster} | count={count}", c="g")

plt.tight_layout()

### Observation...
* The average number of choices differs considerably between clusters, so it seems to be correlated with the words that appear.
* You can distinguish between data analysis terms, cloud terms, and other terms.