# Employee Review Analysis
## DATA200 Project by Harshita Nagasubramanian and Vishal Srikanth

### Introduction
This project is an exploratory analysis of Walmart employee reviews and aims to find the general consensus on Walmart's work environment. When analyzing employee reviews, we take into consideration the employment status of the Walmart employee (former/current), the location of the store (specificity can vary), a numerical rating (1-5), the written review, and the date on which the review was posted. 


All the information is sourced from publicly available data. 


### Research Question


### Additional Inference


### Importing the Data

In [None]:
#%pip install nltk matplotlib wordcloud
#%pip install PIL
%pip install pandas seaborn numpy scikit-learn
#from PIL import Image
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression

# Load the cleaned review data and preview the first 30 rows
reviews = pd.read_csv('walmart_cleaned.csv')
reviews.head(n=30)

The following cell removes the redundant `index` column. It also drops the original `date_posted` column, since the date is already available as separate Year, Month and Day columns in the cleaned data.

In [None]:
# Discard the columns that are not needed for the analysis, then preview the result
reviews = reviews.drop(columns=['index', 'date_posted'])
reviews.head(n=30)

The following function is created to categorize the reviews into Pre-Covid, Covid and Post-Covid eras.

In [None]:
def covid_categories(year):
    """Map a review year to its Covid era label.

    Years before 2020 are 'pre-covid era', 2020 through 2021 are 'covid era',
    and anything after 2021 is 'post-covid era'. A value that satisfies none
    of the comparisons (such as NaN) yields None.
    """
    if year < 2020:
        return 'pre-covid era'
    # From here on the year is not below 2020, so only the upper bound matters
    if year <= 2021:
        return 'covid era'
    if year > 2021:
        return 'post-covid era'
    return None

# Label every review with the Covid era of the year it was posted
reviews['covid category'] = reviews['year'].map(covid_categories)
reviews

In [None]:
# Count how many reviews fall into each (rating, status, covid category) group
reviews_count = (
    reviews
    .groupby(by=['rating', 'status', 'covid category'])
    .size()
    .rename('count')
    .reset_index()
)

# One row per group, with the group size in the 'count' column
reviews_count

In [None]:
# Define the new colors for each Covid category and status
covid_status_colors = {
    'pre-covid era': {'Current Employee': 'darkgoldenrod', 'Former Employee': 'palegoldenrod'},
    'covid era': {'Current Employee': 'midnightblue', 'Former Employee': 'cornflowerblue'},
    'post-covid era': {'Current Employee': 'darkred', 'Former Employee': 'lightcoral'}
}

# Plot the eras chronologically; groupby would order them alphabetically
era_order = ['pre-covid era', 'covid era', 'post-covid era']
status_order = ['Current Employee', 'Former Employee']

# Every rating present in the data, sorted so bar positions and tick labels agree
ratings = np.sort(reviews_count['rating'].unique())

# Create a stacked bar chart
fig, ax = plt.subplots(figsize=(14, 8))

# Plot the bars
width = 0.25  # the width of the bars
ind = np.arange(len(ratings))  # the x locations for the groups

for i, covid_category in enumerate(era_order):
    sub_df = reviews_count[reviews_count['covid category'] == covid_category]
    # Stack by 'status'
    bottom = np.zeros(len(ratings))  # reset bottom for each covid category
    for status in status_order:
        # Align the counts to the full rating axis: a rating with no reviews for
        # this era/status becomes 0 instead of shortening the array, which would
        # make ax.bar fail with a shape mismatch.
        values = (
            sub_df[sub_df['status'] == status]
            .set_index('rating')['count']
            .reindex(ratings, fill_value=0)
            .to_numpy(dtype=float)
        )
        # Label every segment so both employment statuses appear in the legend
        ax.bar(
            ind + i * width,
            values,
            width,
            bottom=bottom,
            color=covid_status_colors[covid_category][status],
            label=f"{covid_category} ({status})",
        )
        bottom += values

# Set the chart title and labels
ax.set_title('Employee Ratings by Covid Era and Employment Status', fontsize=16)
ax.set_xlabel('Rating', fontsize=14)
ax.set_ylabel('Count', fontsize=14)

# Set the x-ticks (centered on the middle bar of each group) and x-tick labels
ax.set_xticks(ind + width)
ax.set_xticklabels([f"{rating:g}" for rating in ratings])

# Add a legend
ax.legend(title='Covid Category (Status)')

# Show the plot
plt.tight_layout()
plt.show()

To make the RegEx analysis easier, all locations are normalized to state abbreviations. In the dataset, some locations use state abbreviations while others use full state names, and their capitalization is inconsistent. Additionally, reviews with no state specified are removed. 

In [None]:
states = [
    ['Alabama', 'AL'], ['Alaska', 'AK'], ['Arizona', 'AZ'], ['Arkansas', 'AR'], ['California', 'CA'],
    ['Colorado', 'CO'], ['Connecticut', 'CT'], ['Delaware', 'DE'], ['Florida', 'FL'], ['Georgia', 'GA'],
    ['Hawaii', 'HI'], ['Idaho', 'ID'], ['Illinois', 'IL'], ['Indiana', 'IN'], ['Iowa', 'IA'],
    ['Kansas', 'KS'], ['Kentucky', 'KY'], ['Louisiana', 'LA'], ['Maine', 'ME'], ['Maryland', 'MD'],
    ['Massachusetts', 'MA'], ['Michigan', 'MI'], ['Minnesota', 'MN'], ['Mississippi', 'MS'], ['Missouri', 'MO'],
    ['Montana', 'MT'], ['Nebraska', 'NE'], ['Nevada', 'NV'], ['New Hampshire', 'NH'], ['New Jersey', 'NJ'],
    ['New Mexico', 'NM'], ['New York', 'NY'], ['North Carolina', 'NC'], ['North Dakota', 'ND'], ['Ohio', 'OH'],
    ['Oklahoma', 'OK'], ['Oregon', 'OR'], ['Pennsylvania', 'PA'], ['Rhode Island', 'RI'], ['South Carolina', 'SC'],
    ['South Dakota', 'SD'], ['Tennessee', 'TN'], ['Texas', 'TX'], ['Utah', 'UT'], ['Vermont', 'VT'],
    ['Virginia', 'VA'], ['Washington', 'WA'], ['West Virginia', 'WV'], ['Wisconsin', 'WI'], ['Wyoming', 'WY']
]

# Lookup helpers derived once from ``states`` so state_column does not rebuild them per call
_STATE_ABBREVIATIONS = {abbreviation for _, abbreviation in states}
_STATE_NAME_TO_ABBR = {name.lower(): abbreviation for name, abbreviation in states}

# Full state names as whole words, longest first so 'West Virginia' wins over 'Virginia'
_STATE_NAME_RE = re.compile(
    r'\b('
    + '|'.join(re.escape(name) for name in sorted(_STATE_NAME_TO_ABBR, key=len, reverse=True))
    + r')\b',
    re.IGNORECASE,
)

# A two-letter token standing on its own between separators, e.g. the 'TX' in
# 'Dallas, TX', 'dallas tx' or 'Dallas, TX, USA'
_DELIMITED_ABBR_RE = re.compile(r'(?:^|[,\s])([A-Za-z]{2})(?=\s*(?:,|$))')

# Any upper-case two-letter word, e.g. the 'TX' in 'Houston TX 77001'
_UPPER_ABBR_RE = re.compile(r'\b[A-Z]{2}\b')


def state_column(location):
    """Return the two-letter state abbreviation found in ``location``.

    Matching is done on whole tokens rather than raw substrings, so that
    'California' is not reported as 'AL' just because it contains "al".
    The checks run in this order:

    1. a stand-alone two-letter token in any capitalization, delimited by
       the start of the string, a comma or whitespace on the left and by a
       comma or the end of the string on the right;
    2. a full state name as a whole word, in any capitalization;
    3. a stand-alone upper-case two-letter word anywhere in the string.

    The abbreviation is checked before the full name so that
    'Kansas City, MO' resolves to 'MO' rather than 'KS'.

    Parameters:
        location: free-text location; anything that is not a string
            (for example NaN) is treated as having no state.

    Returns:
        The upper-case abbreviation from ``states``, or the string 'none'
        when no state can be identified.
    """
    if not isinstance(location, str):
        return 'none'

    for token in _DELIMITED_ABBR_RE.findall(location):
        if token.upper() in _STATE_ABBREVIATIONS:
            return token.upper()

    name_match = _STATE_NAME_RE.search(location)
    if name_match:
        return _STATE_NAME_TO_ABBR[name_match.group(1).lower()]

    for token in _UPPER_ABBR_RE.findall(location):
        if token in _STATE_ABBREVIATIONS:
            return token

    return 'none'

# Derive a normalized state abbreviation for every review from its location text
reviews['state'] = reviews['location'].map(state_column)
reviews

In [None]:
# Reviews whose location could not be matched to any state
none_states = reviews.loc[reviews['state'].eq('none')]
none_states

In [None]:
# Keep only reviews with a recognized state, drop the raw location text,
# and renumber the rows from zero
reviews = (
    reviews
    .loc[reviews['state'].ne('none')]
    .drop(columns=['location'])
    .reset_index(drop=True)
)
reviews

Next, we take the average rating per state.

In [None]:
# Mean rating per state, stored in a two-column frame: 'state' and 'average rating'
average_rating_by_state = (
    reviews
    .groupby('state')['rating']
    .mean()
    .rename('average rating')
    .reset_index()
)

# Display the per-state averages
average_rating_by_state

In [None]:
%pip install geopandas
import geopandas as gpd
# Load the boundary file with geopandas; it supplies the NAME and geometry
# columns used in the cells below.
# NOTE(review): the file name suggests 2010 Census state outlines — confirm the source
US = gpd.read_file("gz_2010_us_040_00_5m.json")
# Display the loaded boundaries
US

In [None]:
# Names that should be excluded from the map
states_to_remove = ['District of Columbia', 'Puerto Rico', 'Alaska', 'Hawaii']  # Add other states as needed


# Function to remove states from the GeoDataFrame
def remove_states(row):
    """Return ``row`` unchanged, or None when its 'NAME' is listed in ``states_to_remove``."""
    return None if row['NAME'] in states_to_remove else row

# Filter out the specified states with a boolean mask. Unlike a row-wise
# apply(...) followed by dropna(), this keeps the GeoDataFrame type and does
# not discard a state merely because one of its other columns is missing.
# The copy makes the column assignment below act on an independent frame.
US_filtered = US[~US['NAME'].isin(states_to_remove)].copy()
US_filtered['STATE ABB'] = US_filtered['NAME'].apply(state_column)
US_filtered.reset_index(drop=True, inplace=True)

# Print the filtered GeoDataFrame
US_filtered

In [None]:
# Merge the shapefile data with the average rating data; the left join keeps
# every state outline, leaving 'average rating' as NaN where there are no reviews
merged_data = US_filtered.merge(average_rating_by_state, how='left', left_on='STATE ABB', right_on='state')

# Increase the figure size here
fig, ax = plt.subplots(1, figsize=(20, 15))  # You can increase these numbers

# Plot the choropleth map. States without an average rating are drawn in grey;
# without missing_kwds they would not be drawn at all and only their text
# label would remain on the map.
merged_data.plot(
    column='average rating',
    cmap='viridis',
    linewidth=0.8,
    ax=ax,
    edgecolor='0.8',
    legend=True,
    missing_kwds={'color': 'lightgrey'},
)

# Set plot title
ax.set_title('Average Rating by State')

for idx, row in US_filtered.iterrows():
    # Get the centroid of the polygon
    centroid = row['geometry'].centroid
    # Add the state abbreviation as text at the centroid
    ax.text(centroid.x, centroid.y, row['STATE ABB'], fontsize=8, ha='center', color='black')

# Remove the axis
ax.axis('off')

# Optional: Adjust the aspect ratio
ax.set_aspect('equal')

# Optional: Adjust the limits if the map is not filling the figure
ax.set_xlim(-130, -65)  # Adjust these values as needed to fit your desired area
ax.set_ylim(20, 50)     # Adjust these values as needed to fit your desired area

# Optional: Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Calculate average rating for each state and year combination
average_rating_by_state_year = reviews.groupby(['state', 'year'])['rating'].mean().reset_index()

# Input state. The 'state' column holds upper-case two-letter abbreviations,
# so normalize the typed value to tolerate stray spaces and lower case.
state = input('Enter state: ').strip().upper()

# Filter data for the chosen state
data = average_rating_by_state_year[average_rating_by_state_year['state'] == state]

# Fail early with a readable message; fitting on an empty frame would
# otherwise surface as an opaque scikit-learn error
if data.empty:
    raise ValueError(
        f"No reviews found for state {state!r}; enter a two-letter abbreviation such as 'TX'."
    )

# Prepare the data for linear regression
X = data[['year']]
y = data['rating']

# Initialize the linear regression model
model = LinearRegression()

# Fit the model
model.fit(X, y)

# Make predictions
y_pred = model.predict(X)

# Plot actual and predicted average rating against year for the chosen state
plt.figure(figsize=(12, 8))

# Plot actual ratings
plt.scatter(data['year'], data['rating'], label=f'Actual {state} Ratings')

# Plot predicted ratings
plt.plot(data['year'], y_pred, label=f'Predicted {state} Ratings', linestyle='--')

# Labels and title
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.title('Actual vs Predicted Average Rating by State Over Time')
plt.legend()
plt.grid(True)
plt.show()

The inference task for the described bootstrapping analysis involves understanding the variability and uncertainty associated with certain parameters or model predictions based on a given dataset.

In the provided code, we perform two main inference tasks using bootstrapping:

Parameter Estimation: We estimate the mean rating of the dataset using bootstrapping. By repeatedly resampling the dataset with replacement and calculating the mean rating for each resample, we obtain a distribution of mean ratings. From this distribution, we can calculate confidence intervals to understand the range of plausible values for the mean rating. This allows us to infer the uncertainty associated with our estimate of the mean rating.

Model Uncertainty: We assess the uncertainty of a linear regression model's predictions using bootstrapping. By resampling the dataset with replacement, fitting a linear regression model to each resample, and calculating the Mean Squared Error (MSE) for each model, we obtain a distribution of MSE scores. From this distribution, we can calculate confidence intervals to understand the variability in model performance. This allows us to infer the uncertainty associated with the model's predictions.

In summary, the inference task is to quantify and understand the uncertainty and variability associated with certain parameters (mean rating) and model predictions (MSE) based on the given dataset through bootstrapping.

In [None]:

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Function to perform bootstrapping and estimate parameter
def bootstrap_parameter_estimation(data, parameter='rating', n_bootstraps=1000, random_state=None):
    """Bootstrap the mean of one column.

    Each iteration resamples ``data`` with replacement to its original
    length and records the mean of the ``parameter`` column.

    Parameters:
        data: DataFrame containing the column named by ``parameter``.
        parameter: name of the column whose mean is estimated.
        n_bootstraps: number of resamples to draw.
        random_state: optional seed. When given, the resamples (and so the
            returned list) are reproducible; when None, sampling falls back
            to pandas' default random source, as before.

    Returns:
        A list with ``n_bootstraps`` resampled means.
    """
    # A single generator shared by all iterations: seeding each draw
    # separately with the same seed would repeat the same resample.
    rng = None if random_state is None else np.random.RandomState(random_state)
    return [
        data.sample(n=len(data), replace=True, random_state=rng)[parameter].mean()
        for _ in range(n_bootstraps)
    ]

# Bootstrap the mean rating over all reviews
bootstrapped_ratings = bootstrap_parameter_estimation(reviews)

# The 2.5th and 97.5th percentiles bound the central 95% of the bootstrap means
confidence_interval = np.percentile(bootstrapped_ratings, q=[2.5, 97.5])
print("95% Confidence Interval for Mean Rating:", confidence_interval)

# Function to perform bootstrapping for model uncertainty
def bootstrap_model_uncertainty(data, n_bootstraps=100, random_state=None):
    """Bootstrap the out-of-bag MSE of a year -> rating linear regression.

    Each iteration draws ``len(data)`` rows with replacement, fits a
    LinearRegression of 'rating' on 'year' to them, and scores the model on
    the rows that were NOT drawn (the out-of-bag rows). Scoring on the full
    dataset would include the training rows themselves and understate the
    error.

    Parameters:
        data: DataFrame with 'year' and 'rating' columns.
        n_bootstraps: number of resamples to draw.
        random_state: optional seed for reproducible resamples; when None the
            global NumPy random state is used.

    Returns:
        A list of MSE values, one per resample. A resample that happens to
        contain every row has no out-of-bag rows to score and is skipped, so
        for very small datasets the list can be shorter than ``n_bootstraps``.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    n_rows = len(data)
    if n_rows == 0:
        raise ValueError("Cannot bootstrap an empty dataset.")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    mse_scores = []
    for _ in range(n_bootstraps):
        # Resample row positions with replacement
        sampled_positions = rng.randint(0, n_rows, size=n_rows)

        # Rows never drawn form the held-out evaluation set
        held_out = np.ones(n_rows, dtype=bool)
        held_out[sampled_positions] = False
        if not held_out.any():
            continue

        train = data.iloc[sampled_positions]
        test = data.iloc[held_out]

        # Fit a linear regression model
        model = LinearRegression()
        model.fit(train[['year']], train['rating'])

        # Make predictions on the held-out rows and record the Mean Squared Error
        y_pred = model.predict(test[['year']])
        mse_scores.append(mean_squared_error(test['rating'], y_pred))
    return mse_scores

# Bootstrap the regression error over all reviews
bootstrapped_mse = bootstrap_model_uncertainty(reviews)

# The 2.5th and 97.5th percentiles bound the central 95% of the bootstrap MSE values
confidence_interval_mse = np.percentile(bootstrapped_mse, q=[2.5, 97.5])
print("95% Confidence Interval for Model Uncertainty (MSE):", confidence_interval_mse)


In [None]:
# Draw one histogram per bootstrap distribution: first the mean ratings,
# then the model uncertainties (MSE). Both share the same layout.
for samples, fill_color, x_label, chart_title in (
    (bootstrapped_ratings, 'skyblue', 'Mean Rating',
     'Bootstrapped Mean Ratings Distribution'),
    (bootstrapped_mse, 'salmon', 'Model Uncertainty (MSE)',
     'Bootstrapped Model Uncertainty (MSE) Distribution'),
):
    plt.figure(figsize=(10, 6))
    plt.hist(samples, bins=30, color=fill_color, edgecolor='black', alpha=0.7)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.title(chart_title)
    plt.grid(True)
    plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Assuming 'reviews' dataset is defined somewhere in your code

# Prepare features and target variable
X = reviews[['year']]  # Feature
y = reviews['rating']  # Target variable

# Initialize models; the forest is seeded so repeated runs give the same scores
linear_regression_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)

# Shuffled folds shared by both models. Passing cv=5 directly would split the
# rows into contiguous, unshuffled blocks, so the scores would depend on the
# order of the rows in the file.
# NOTE(review): assumes the rows may be ordered (e.g. by date) — confirm
cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)

# Model evaluation using cross-validation (no hyperparameters are tuned here)
# Linear Regression
linear_regression_scores = cross_val_score(linear_regression_model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')
linear_regression_rmse = np.sqrt(-linear_regression_scores.mean())

# Random Forest
random_forest_scores = cross_val_score(random_forest_model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')
random_forest_rmse = np.sqrt(-random_forest_scores.mean())

# Model comparison
print("Linear Regression RMSE:", linear_regression_rmse)
print("Random Forest RMSE:", random_forest_rmse)


In [None]:
import matplotlib.pyplot as plt

# Take the number of folds from the scores themselves, so the x-axis always
# matches the cross-validation that actually ran
n_folds = len(linear_regression_scores)
fold_numbers = range(1, n_folds + 1)

# Plotting
plt.figure(figsize=(10, 6))

# Plot RMSE for each fold and model (Linear Regression and Random Forest);
# the scores are negated MSE values, hence the sign flip before the root
plt.plot(fold_numbers, np.sqrt(-linear_regression_scores), marker='o', label='Linear Regression')
plt.plot(fold_numbers, np.sqrt(-random_forest_scores), marker='o', label='Random Forest')

# Labels and title
plt.xlabel('Fold')
plt.ylabel('RMSE')
plt.title('Cross-Validation Results')
plt.xticks(fold_numbers)
plt.legend()
plt.grid(True)
plt.show()
