In [1]:
import os
import json
from datetime import datetime
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj=SentimentIntensityAnalyzer()
DATA_DIR = 'data/'
files=os.listdir(DATA_DIR)
files

['Amazon.txt',
 'Berkshire-Hathaway.txt',
 'Blackrock.txt',
 'Cisco-Systems.txt',
 'Facebook.txt',
 'Johnson-and-Johnson.txt',
 'Microsoft.txt',
 'Sony.txt']

In [2]:
all_data=[]
with open(DATA_DIR+'Amazon.txt',encoding='utf-8') as f:
    for line in f:
        data=json.loads(line)
        data['rating']=float(data['rating'])
        data['company']='Amazon'
        data['datePosted']=datetime.strptime(data['authorInfo'].split('-')[0].strip(),'%d %b %Y')
        data['authorProfession']=data['authorInfo'].split('-')[1].strip()
        all_data.append(data)
df=pd.DataFrame(all_data)
print(df.shape)
df.head()

(72210, 9)


Unnamed: 0,rating,pros,cons,adviceManagement,authorInfo,authorLocation,company,datePosted,authorProfession
0,5.0,Love the perks of the company,No Cons as such in mind,,20 Jun 2021 - Content Analyst,Bangalore,Amazon,2021-06-20,Content Analyst
1,3.0,"15/hr plus extra shifts, which equates to a hi...","Physically demanding, meaning you can get inju...",,20 Jun 2021 - Associate,"Langhorne, PA",Amazon,2021-06-20,Associate
2,5.0,Good work and life style,Nothing much to mention here,,20 Jun 2021 - Transaction Risk Investigator (T...,Bangalore,Amazon,2021-06-20,Transaction Risk Investigator (TRMS)
3,4.0,Good for developing a strong CV and learning e...,Low salary for the workload and life expenses,,20 Jun 2021 - Account Manager,,Amazon,2021-06-20,Account Manager
4,5.0,Freedom to pick and choose battles,"Huge teams, sometimes it can become ambiguous",,20 Jun 2021 - Designer,New Delhi,Amazon,2021-06-20,Designer


In [3]:
def apply_sentiment(text):
    sentiment_dict=sid_obj.polarity_scores(text)
    return sentiment_dict['compound']

def search_reviews(sWords, matchPhrase=True, ignoreCase=True):
    if matchPhrase==False:
        sWords='|'.join([word for word in sWords.split()])
    df_subset=df[(df['pros'].str.contains(sWords,case=ignoreCase)==True)|(df['cons'].str.contains(sWords,case=ignoreCase)==True)]
    return df_subset

In [4]:
df['proSentiment']=df['pros'].apply(lambda x:apply_sentiment(x))
df['conSentiment']=df['cons'].apply(lambda x:apply_sentiment(x))
df.head()

Unnamed: 0,rating,pros,cons,adviceManagement,authorInfo,authorLocation,company,datePosted,authorProfession,proSentiment,conSentiment
0,5.0,Love the perks of the company,No Cons as such in mind,,20 Jun 2021 - Content Analyst,Bangalore,Amazon,2021-06-20,Content Analyst,0.6369,-0.296
1,3.0,"15/hr plus extra shifts, which equates to a hi...","Physically demanding, meaning you can get inju...",,20 Jun 2021 - Associate,"Langhorne, PA",Amazon,2021-06-20,Associate,-0.1027,-0.1779
2,5.0,Good work and life style,Nothing much to mention here,,20 Jun 2021 - Transaction Risk Investigator (T...,Bangalore,Amazon,2021-06-20,Transaction Risk Investigator (TRMS),0.4404,0.0
3,4.0,Good for developing a strong CV and learning e...,Low salary for the workload and life expenses,,20 Jun 2021 - Account Manager,,Amazon,2021-06-20,Account Manager,0.7351,-0.2732
4,5.0,Freedom to pick and choose battles,"Huge teams, sometimes it can become ambiguous",,20 Jun 2021 - Designer,New Delhi,Amazon,2021-06-20,Designer,0.3818,0.3182


In [5]:
df_temp=search_reviews('diversity')
print(df_temp.shape)
df_temp.head()

(320, 11)


Unnamed: 0,rating,pros,cons,adviceManagement,authorInfo,authorLocation,company,datePosted,authorProfession,proSentiment,conSentiment
308,4.0,"Steady Demand, potential for advancement, stoc...",Relatively low pay due to stock options. Lots ...,,15 Jun 2021 - Engineer Operations Technician,,Amazon,2021-06-15,Engineer Operations Technician,-0.128,-0.3612
823,4.0,Lots of ownership; ability to learn to be a ma...,long hours; stressful projects and not too muc...,,9 Jun 2021 - Area Manager,"Elizabeth, NJ",Amazon,2021-06-09,Area Manager,0.7506,-0.5106
840,4.0,Lots of ownership; ability to learn to be a ma...,long hours; stressful projects and not too muc...,,9 Jun 2021 - Area Manager,"Elizabeth, NJ",Amazon,2021-06-09,Area Manager,0.7506,-0.5106
1263,5.0,"Great working experience, huge diversity",You have to work hard,,5 Jun 2021 - Anonymous Employee,,Amazon,2021-06-05,Anonymous Employee,0.7506,-0.1027
1273,5.0,"Great working experience, huge diversity",You have to work hard,,5 Jun 2021 - Anonymous Employee,,Amazon,2021-06-05,Anonymous Employee,0.7506,-0.1027


In [27]:
sample_size=5
temp=df.sample(sample_size)
for i in range(sample_size):
    print(temp.iloc[i]['rating'])
    print(temp.iloc[i]['pros'])
    print(temp.iloc[i]['proSentiment'])
    print(temp.iloc[i]['cons'])
    print(temp.iloc[i]['conSentiment'])
    print("________________________________")

5.0
Amazon principles and ethics are well-suited to finding jobs within the company or without. The high expectations placed on you as an associate will push you to excel, should you wish to do more than just "have a job", they can help you "have a career"
0.8442
None I can think of
0.0
________________________________
4.0
Work culture,  lots of  benefits for employees
0.3818
-There aren't any cons
0.0
________________________________
5.0
1. Transparency and communication at every level
2. Very talented senior engineers to work with
3. Internal mobility in case you want to change teams
4. Competitive environment to up-skill
0.6801
1. Communication and strategic alignment between teams can get better since we all serve customers
0.4404
________________________________
4.0
Good management, culture and rewards hard work.
0.6808
Hard to have a social life if you work nights
-0.1027
________________________________
5.0
1) Impactful projects.
2) Learning new things on a regular basis.
3

In [7]:
from gensim.models import Word2Vec
model=Word2Vec.load('word2vec.model_50_3_1')

In [36]:
word_dict={}
nidhi_words=['Gender','Asian','Indian','White','Male','Female',
             'Latin','Diversity','Equity',
             'Inclusion','Stereotype','Equality']
for word in nidhi_words:
    items=model.wv.most_similar(word.lower(),topn=5)
    for item in items:
        word_dict[item[0]]=1
len(word_dict)

54

In [37]:
model.wv.most_similar('female',topn=5)

[('male', 0.795523464679718),
 ('seasoned', 0.7916832566261292),
 ('veteran', 0.7370079755783081),
 ('incapable', 0.7367067337036133),
 ('minorities', 0.7150292992591858)]

In [38]:
word_dict

{'racial': 1,
 'fairness': 1,
 'discrimination': 1,
 'privacy': 1,
 'compliance': 1,
 'stuffed': 1,
 'predominantly': 1,
 'spanish': 1,
 'submissive': 1,
 'pan': 1,
 'male': 1,
 'chinese': 1,
 'elite': 1,
 'australian': 1,
 'pms': 1,
 'badges': 1,
 'men': 1,
 'golden': 1,
 'males': 1,
 'female': 1,
 'veteran': 1,
 'white': 1,
 'seasoned': 1,
 'incapable': 1,
 'minorities': 1,
 'emea': 1,
 'blr': 1,
 'de': 1,
 'fortress': 1,
 'otc': 1,
 'inclusion': 1,
 'teamwork': 1,
 'empowerment': 1,
 'transparency': 1,
 'professionalism': 1,
 'awards': 1,
 'valuation': 1,
 'grants': 1,
 'backloaded': 1,
 'offerings': 1,
 'openness': 1,
 'diversity': 1,
 'cultural': 1,
 'inexcusable': 1,
 'investigators': 1,
 'slowest': 1,
 'hightech': 1,
 'beauty': 1,
 'brilliance': 1,
 'honesty': 1,
 'sexism': 1,
 'payroll': 1,
 'messaging': 1,
 'piss-poor': 1}