# Introduction to the work environment

Let's start by obtaining our badge, loading some Python packages, putting on our uniform, loading the data, and plotting some graphs.

<img src="https://i.imgur.com/RMprgFG.jpeg"></img>


Load those Python packages before someone asks our manager to fire us.

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import warnings
warnings.simplefilter("ignore")
data_df = pd.read_csv("/kaggle/input/i-dont-work-here-lady/reddit_i_don_t_work_here_lady.csv")

Let's take a quick look at the data before any entitled customer enters.

In [None]:
data_df.head()

In [None]:
data_df.info()

In [None]:
def missing_data(data):
    """Summarize missing values and dtypes for every column of *data*.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A DataFrame with one column per column of *data* and three rows:
        ``Total`` (number of null entries), ``Percent`` (null entries as a
        percentage of all rows) and ``Types`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100

    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(data[name].dtype) for name in data.columns]

    # Transpose so that each inspected column becomes a column of the report.
    return np.transpose(summary)

In [None]:
missing_data(data_df)

Well, we have to look a bit deeper. Take care while doing this, because one suspicious customer might dislike it.

# Take a closer look at the data (but don't stare at it)

In [None]:
def show_wordcloud(data, title=""):
    """Draw a word cloud from the non-null text entries of *data*.

    Args:
        data: pandas Series of strings; null entries are dropped before the
            remaining texts are joined with spaces.
        title: Text shown as the figure title.
    """
    corpus = " ".join(data.dropna())

    # Default stop words plus a few extra tokens to exclude.
    ignored_words = set(STOPWORDS) | {
        "t", "co", "https", "amp", "U", "reddit", "comment",
    }

    cloud = WordCloud(
        stopwords=ignored_words,
        scale=4,
        max_font_size=50,
        max_words=500,
        background_color="black",
    )
    image = cloud.generate(corpus)

    figure = plt.figure(1, figsize=(16, 16))
    plt.axis('off')
    figure.suptitle(title, fontsize=20)
    figure.subplots_adjust(top=2.3)
    plt.imshow(image, interpolation='bilinear')
    plt.show()

In [None]:
show_wordcloud(data_df['title'], title = 'Prevalent words in titles')

In [None]:
show_wordcloud(data_df['body'], title = 'Prevalent words in bodies')

# Make no assumptions about what a customer's (or redditor's) sentiment is

Therefore, let's use a tool to find out. In this case, it will be an over-the-counter (OTC) package.

In [None]:
# borrowed from https://www.kaggle.com/pashupatigupta/sentiments-transformer-vader-embedding-bert
sia = SentimentIntensityAnalyzer()
def find_sentiment(post):
    """Classify *post* as "Positive", "Negative" or "Neutral".

    The label follows the sign of the ``compound`` score returned by the
    module-level analyzer ``sia``: above zero is "Positive", below zero is
    "Negative", exactly zero is "Neutral".

    Args:
        post: Text to score. Anything that is not a ``str`` (for example a
            missing value) is labelled "Neutral" without being scored.

    Returns:
        One of the strings "Positive", "Negative" or "Neutral". Scoring is
        best-effort: if the analyzer raises, the result is "Neutral".
    """
    # Missing values and other non-text entries have no sentiment to score.
    if not isinstance(post, str):
        return "Neutral"

    try:
        # Score once and reuse the value for both comparisons.
        compound = sia.polarity_scores(post)["compound"]
    except Exception:
        # Keep the best-effort fallback, but unlike a bare ``except`` this
        # lets KeyboardInterrupt / SystemExit propagate.
        return "Neutral"

    if compound > 0:
        return "Positive"
    if compound < 0:
        return "Negative"
    return "Neutral"

In [None]:
def plot_sentiment(df, feature, title):
    """Plot the distribution of the labels in ``df[feature]``.

    Draws two bar charts side by side: the number of rows per label on the
    left, and the share of each label in percent on the right.

    Args:
        df: DataFrame holding the sentiment column.
        feature: Name of the column whose values are counted.
        title: Short label inserted into the axis labels and figure title.
    """
    counts = df[feature].value_counts()
    # Scale to 0-100 so the values match the "Percentage" axis label; the
    # plain ratio would be a fraction between 0 and 1.
    percent = counts / counts.sum() * 100

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))

    counts.plot(kind='bar', ax=ax1, color='green')
    percent.plot(kind='bar', ax=ax2, color='blue')
    ax1.set_ylabel(f'Counts : {title} sentiments', size=12)
    ax2.set_ylabel(f'Percentage : {title} sentiments', size=12)
    plt.suptitle(f"Sentiment analysis: {title}")
    plt.tight_layout()
    plt.show()

## Sentiment in title

In [None]:
# Label every title with its sentiment, then chart the label distribution.
title_labels = data_df['title'].apply(find_sentiment)
data_df['title_sentiment'] = title_labels
plot_sentiment(data_df, 'title_sentiment', 'Title')

In [None]:
show_wordcloud(data_df.loc[data_df['title_sentiment']=='Positive', 'title'], title = 'Prevalent words in titles (Positive sentiment)')

In [None]:
show_wordcloud(data_df.loc[data_df['title_sentiment']=='Negative', 'title'], title = 'Prevalent words in titles (Negative sentiment)')

In [None]:
show_wordcloud(data_df.loc[data_df['title_sentiment']=='Neutral', 'title'], title = 'Prevalent words in titles (Neutral sentiment)')

## Sentiment in body

In [None]:
# Label every post body with its sentiment, then chart the label distribution.
body_labels = data_df['body'].apply(find_sentiment)
data_df['body_sentiment'] = body_labels
plot_sentiment(data_df, 'body_sentiment', 'Body')

In [None]:
# Plot the body text (not the title) of posts whose body scored positive,
# so the data matches the "words in body" figure title.
show_wordcloud(data_df.loc[data_df['body_sentiment']=='Positive', 'body'], title = 'Prevalent words in body (Positive sentiment)')

In [None]:
# Plot the body text (not the title) of posts whose body scored negative,
# so the data matches the "words in body" figure title.
show_wordcloud(data_df.loc[data_df['body_sentiment']=='Negative', 'body'], title = 'Prevalent words in body (Negative sentiment)')

In [None]:
# Plot the body text (not the title) of posts whose body scored neutral,
# so the data matches the "words in body" figure title.
show_wordcloud(data_df.loc[data_df['body_sentiment']=='Neutral', 'body'], title = 'Prevalent words in body (Neutral sentiment)')

# Let's not jump to conclusions

We will not draw any conclusions here; we are trying to avoid any misinterpretation. Seeing that the sentiment is predominantly neutral is great. It is probably not just by chance that Karen nowadays appears associated with all those negative sentiments. It will take some time.

<img src="https://i.pinimg.com/originals/ed/fc/8f/edfc8f66aa46e37c4c02d94a41e16e91.jpg"></img>