In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import datetime
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Put plotly's offline module into notebook mode before any figure is drawn.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
py.init_notebook_mode(connected = True)
import plotly.express as px


In [None]:
# Load the raw Glassdoor export and report how large it is.
df = pd.read_csv('../input/glassdoor-prepandemic-dataset-for-usa/Glassdoor USA Dataset.csv')
n_records, n_columns = df.shape
print('Dataset has ', n_records, ' records and ', n_columns, ' columns')
print(' ')
# Preview the first five rows (the default for head()).
df.head()

In [None]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' plus re-assignment (instead of inplace=True) keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) to check how each column was parsed.
df.info()

In [None]:
# List the distinct values found in the Rating column; some of them are not sensible ratings.
Rting_values = df['Rating'].unique()
Rting_values

In [None]:
# Collect every distinct Rating value that cannot be converted to a float.
# Only conversion failures (TypeError / ValueError) are caught, so unrelated
# errors and KeyboardInterrupt are no longer silently swallowed.
problematic_entries = []
for rating_value in Rting_values:
    try:
        float(rating_value)
    except (TypeError, ValueError):
        problematic_entries.append(rating_value)

problematic_entries

### Let's inspect the full rows for these cases
## Evidently, the column values are shifted in these rows

In [None]:
# Select the rows whose Rating holds one of the unparseable values.
shifted_mask = df['Rating'].isin(problematic_entries)
df_shifted = df.loc[shifted_mask]

In [None]:
# Move every value two columns to the right so these rows make better sense.
# NOTE: this cell re-assigns df_shifted from itself, so running it twice shifts
# the rows a second time.
df_shifted = df_shifted.shift(2, axis=1)
df_shifted

## This is the corrected DataFrame now:

In [None]:
# Rows with a parseable Rating are kept as they are; the realigned rows are
# appended after them to rebuild the full frame.
has_bad_rating = df['Rating'].isin(problematic_entries)
df_clean = df.loc[~has_bad_rating]
df = pd.concat([df_clean, df_shifted], axis=0)
df

### Now we can successfully convert the Rating column to float:

In [None]:
# Cast Rating to float and Founded to int in a single astype call.
df = df.astype({'Rating': 'float', 'Founded': 'int'})

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Replace missing values with a -1 sentinel that matches each column's type:
# numeric columns receive the number -1 (so they stay numeric), every other
# column receives the string '-1' as before.
numeric_columns = set(df.select_dtypes(include='number').columns)
df = df.fillna({
    column: -1 if column in numeric_columns else '-1'
    for column in df.columns
})

In [None]:
# Count the missing values in each column again.
df.isna().sum()

In [None]:
# Per-column overview: non-null count, number of distinct values and dtype name.
output = pd.DataFrame(
    [
        [name, df[name].notna().sum(), df[name].nunique(), str(df[name].dtype)]
        for name in df.columns
    ],
    columns=['Column', 'Non-Null', 'unique', 'dtype'],
)
output

In [None]:
# Store the descriptive columns listed below as pandas categoricals.
categorical_columns = ['Type of ownership', 'Industry', 'Sector', 'Revenue', 'Size']
df = df.astype(dict.fromkeys(categorical_columns, 'category'))


### Let's check the best rated positions

In [None]:
# Order the rows by Rating, best first (sort_values is ascending by default).
# drop=True discards the old row labels instead of inserting them as an extra
# 'index' column; without it every re-run adds another column and eventually
# fails because the inserted column name already exists.
df = df.sort_values('Rating', ascending=False).reset_index(drop=True)
df

In [None]:
# Keep only the text before the first line break of each company name.
df['Company Name'] = df['Company Name'].str.split('\n', n=1).str.get(0)

In [None]:
# Show the distinct company-size labels.
df['Size'].unique()

In [None]:
# Distinct size labels, with the open-ended bucket rewritten so that its first
# token is a plain integer (the sort in the next cell parses that token).
# The label is matched by value rather than by a hard-coded position, so the
# result no longer depends on the order in which unique() returns the labels.
size_values = [
    '10000 or more employees' if label == '10000+ employees' else label
    for label in df.Size.unique()
]

In [None]:
def _size_sort_key(label):
    """Return the leading integer of a size label; 'Unknown' sorts last."""
    if label == 'Unknown':
        return 99999999
    # rstrip('+') lets a raw '10000+ employees' label be parsed as well.
    return int(label.split(' ')[0].rstrip('+'))


# Order the size buckets by their lower bound, then restore the original
# spelling of the open-ended bucket so the labels match the values in df.
# The label is restored by value instead of by a fixed position, which was
# only correct when the list happened to contain a '-1' bucket.
size_values_sorted = [
    '10000+ employees' if label == '10000 or more employees' else label
    for label in sorted(size_values, key=_size_sort_key)
]

### Smaller companies seem to have better ratings:

In [None]:
# Histogram of Rating, coloured by company size in the order of size_values_sorted.
# NOTE(review): log_scale=10 appears to put the Rating axis on a base-10 log
# scale — confirm this is intended, especially if Rating contains values <= 0.
sns.displot(
    df,
    kind='hist',
    x='Rating',
    hue='Size',
    hue_order=size_values_sorted,
    palette='hot',
    log_scale=10,
    aspect=1.5,
)

In [None]:
# Box plot of Rating for every company-size bucket, with two values written
# onto each box.
plt.figure(figsize=(10, 8), dpi=80)
box_plot = sns.boxplot(data=df, x='Size', y='Rating', order=size_values_sorted)
box_plot.set_xticklabels(box_plot.get_xticklabels(), rotation=30)
plt.xlabel('Size')
plt.ylabel('Rating')

ax = box_plot.axes
lines = ax.get_lines()
categories = ax.get_xticks()

# Appearance shared by all value labels.
label_style = dict(
    ha='center',
    va='center',
    fontweight='bold',
    size=10,
    color='white',
    bbox=dict(facecolor='#445A64'),
)

# This relies on each box contributing six lines whose first y value is, in order:
# 0 -> p25, 1 -> p75, 2 -> lower whisker, 3 -> upper whisker, 4 -> p50 (median),
# 5 -> upper extreme value. Offsets 2 and 4 therefore label the lower whisker
# and the median of every box.
for cat in categories:
    for line_offset in (2, 4):
        value = round(lines[cat * 6 + line_offset].get_ydata()[0], 1)
        ax.text(cat, value, f'{value}', **label_style)

box_plot.figure.tight_layout()

fig = box_plot.get_figure()

In [None]:
# Box plot of Rating for every revenue bucket, with two values written onto
# each box.
plt.figure(figsize=(10, 8), dpi=80)
box_plot = sns.boxplot(data=df, x='Revenue', y='Rating')
box_plot.set_xticklabels(box_plot.get_xticklabels(), rotation=30)
plt.xlabel('Revenue')
plt.ylabel('Rating')

ax = box_plot.axes
lines = ax.get_lines()
categories = ax.get_xticks()

# Appearance shared by all value labels.
label_style = dict(
    ha='center',
    va='center',
    fontweight='bold',
    size=10,
    color='white',
    bbox=dict(facecolor='#445A64'),
)

# This relies on each box contributing six lines whose first y value is, in order:
# 0 -> p25, 1 -> p75, 2 -> lower whisker, 3 -> upper whisker, 4 -> p50 (median),
# 5 -> upper extreme value. Offsets 2 and 4 therefore label the lower whisker
# and the median of every box.
for cat in categories:
    for line_offset in (2, 4):
        value = round(lines[cat * 6 + line_offset].get_ydata()[0], 1)
        ax.text(cat, value, f'{value}', **label_style)

box_plot.figure.tight_layout()

fig = box_plot.get_figure()