In [None]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import ks_2samp

from UTILS import utils

# Housekeeping

In [None]:
# Project folders, all resolved relative to the current working directory.
data_dir = Path.cwd() / 'OUTPUT'
image_dir = data_dir / 'IMAGES'
config_dir = Path.cwd() / 'CONFIG'
column_dir = data_dir / 'COLUMNS'
report_dir = data_dir / 'REPORTING'

# Load the Data

This notebook uses the `df_merged_with_features` dataframe, which was the output of the `preprocessing` notebook.

In [None]:
filename = 'df_features'

# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project's own pipeline.
with data_dir.joinpath(filename).open('rb') as infile:
    df = pickle.load(infile)

# Keep only the first row for every repeated index label.
repeated_index = df.index.duplicated(keep='first')
df = df.loc[~repeated_index]

In [None]:
# Read the column -> dtype mapping (indexed by the 'columns' field) and
# run every column of df through utils.set_column_type2 with that mapping.
col_dtype_df = pd.read_csv(
    config_dir / 'mapping_column_types_extended.csv',
    index_col='columns',
)
df = df.apply(utils.set_column_type2, args=(col_dtype_df,))

In [None]:
# Inspect the resulting dtype of every column after the type mapping.
df.dtypes

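Add a float-typed column for `student_rating`; this is required for aggregation. **TODO:** no code cell follows this note, yet later cells rely on `student_rating_numeric`, `student_rating_float` and `student_rating_fixed_float` already existing in `df`; confirm where these columns are created.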

# Ratings vs Blanks

In [None]:
title = 'Rated vs Not Rated'
xlabel, ylabel = '', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Count rows with and without a student rating.
rating_missing = df.student_rating.isnull()
data = pd.DataFrame({
    'count': [rating_missing.sum(), (~rating_missing).sum()],
    'type': ['Not rated', 'Rated'],
})

ax = sns.barplot(data=data, x='type', y='count')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

Just less than half of the sessions were rated by the students.

# Comments vs Blanks

In [None]:
title = 'Comment vs No Comment'
xlabel, ylabel = '', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Count rows whose comment word length is missing versus present.
comment_missing = df.student_comment_word_length.isnull()
data = pd.DataFrame({
    'count': [comment_missing.sum(), (~comment_missing).sum()],
    'type': ['No Comment', 'Comment'],
})

ax = sns.barplot(data=data, x='type', y='count')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

There are far fewer sessions with comments than without. This suggests that commenting takes a lot more effort than rating.

# Rating vs Comments

## Rating Distributions With Comments

In [None]:
title = 'Rating Distributions (Commented)'
xlabel, ylabel = 'Student Ratings', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Rows whose comment contains at least one word.
has_comment_words = df.student_comment_word_length > 0
data = df[has_comment_words]

ax = sns.countplot(data=data, x='student_rating')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

In [None]:
title = 'Rating Distributions (Not Commented)'
xlabel, ylabel = 'Student Ratings', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Rows with no recorded comment word length.
no_comment = df.student_comment_word_length.isnull()
data = df[no_comment]

ax = sns.countplot(data=data, x='student_rating')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

## Kolmogorov Smirnov Test

In [None]:
column = 'student_rating'

# Split the non-null ratings into two samples by whether the row has a
# comment word length. The "without comments" sample must use the
# complementary mask; using `notnull()` for both would compare a sample
# with itself and always give a KS p-value of 1.
has_comment = df.student_comment_word_length.notnull()
ratings_w_comments = df.loc[has_comment, column].dropna()
ratings_wo_comments = df.loc[~has_comment, column].dropna()

In [None]:
# Distinct rating values present in the `ratings_wo_comments` sample.
ratings_wo_comments.unique()

In [None]:
# Two-sample Kolmogorov-Smirnov test on the two rating samples.
ks_2samp(ratings_w_comments, ratings_wo_comments)

A high p-value means the test finds no evidence that the two rating distributions differ. In that case, whether or not a student leaves a comment does not appear to be related to the rating they give.

## Relationship Between Rating and Commenting

In [None]:
title = 'Ratings vs Comments'
xlabel, ylabel = '', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Row-level flags combined below into the three categories charted.
is_rated = df.student_rating_numeric > 0
is_unrated = df.student_rating_numeric.isna()
blank_comment = df.student_comment == ""

data = pd.DataFrame({
    'type': ['Rated, No Comment',
             'Not Rated, Commented',
             'Rated, Commented'],
    'count': [(is_rated & blank_comment).sum(),
              (is_unrated & ~blank_comment).sum(),
              (is_rated & ~blank_comment).sum()],
})

ax = sns.barplot(data=data, x='type', y='count')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

plt.show()

# Ratings vs Service by Sex

In [None]:
# Mean numeric rating for every (service, sex_guess) pair.
group_keys = ['service', 'sex_guess']
data = df.groupby(group_keys)[['student_rating_numeric']].mean()

data

In [None]:
title = 'Ratings vs Service by Sex'
xlabel, ylabel = 'Service', 'Average Rating'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Mean numeric rating per (service, sex_guess), flattened for plotting.
group_keys = ['service', 'sex_guess']
data = (
    df.groupby(group_keys)[['student_rating_numeric']]
    .mean()
    .reset_index()
)

ax = sns.barplot(
    data=data,
    x='service',
    y='student_rating_numeric',
    hue='sex_guess',
)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

# Anchor the legend's upper-left corner just outside the right edge of the axes.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

plt.tight_layout()
plt.savefig(image_path)

plt.show()

# Student Rating Distribution

In [None]:
# Number of rows for each distinct `student_rating` value.
df.student_rating.value_counts()

In [None]:
title = 'Distribution of Student Rating'
xlabel, ylabel = 'Student Rating', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

# Compute the counts once and reuse them for both axes.
rating_counts = df.student_rating.value_counts()
ax = sns.barplot(x=rating_counts.index, y=rating_counts)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

# By `client_id`

See the sensitivity of the `client_id` to the `wait_seconds` for the 5 largest clients by number of sessions.

In [None]:
# Per-column aggregation applied to every (service, client_id) group.
# NOTE(review): 'student_comment_char_word' is not referenced anywhere else
# in the visible notebook; confirm the column name.
session_aggregations = {
    'session_id': 'count',
    'student_id': pd.Series.nunique,
    'student_rating_float': 'mean',
    'student_comment_char_word': 'mean',
    'student_sessions_total': 'mean',
    'sentiment_aggregated': 'mean',
    'tutor_id': pd.Series.nunique,
    'tutor_age': 'mean',
    'tutor_sessions_total': 'mean',
    'tutor_experience_days': 'mean',
}

# Friendlier names for the aggregated columns.
summary_column_names = {
    'session_id': 'num_sessions',
    'student_rating_float': 'average_student_rating',
    'sentiment_aggregated': 'average_sentiment',
}

clients_by_num_sessions = (
    df.groupby(['service', 'client_id'])
    .agg(session_aggregations)
    .sort_values(by='session_id', ascending=False)  # largest clients first
    .rename(columns=summary_column_names)
    .reset_index()
)

In [None]:
# Preview the first rows (sorted by session count, descending).
clients_by_num_sessions.head()

Calculate, for each client, the correlation between the student rating and the wait time.

In [None]:
grouping = ['service', 'client_id']
cols = ['student_rating_fixed_float', 'wait_seconds']

# Per-group 2x2 correlation matrix of rating vs wait time. After
# reset_index the unnamed matrix-row level becomes 'level_2'; keeping only
# the rating row leaves the rating/wait correlation in 'wait_seconds'.
corr_rating_wait = (df
                    .groupby(grouping)[cols]
                    .corr()
                    .reset_index()
                    .query('level_2 == "student_rating_fixed_float"')
                    .drop(labels=['student_rating_fixed_float', 'level_2'], axis='columns')
                    # `columns=` is required: a bare mapping renames index
                    # labels, which left the column named 'wait_seconds'.
                    .rename(columns={'wait_seconds': 'corr'})
                   )

corr_rating_wait.head()
corr_rating_wait.shape

Merge with `clients_by_num_sessions` to get the `num_sessions` column.

In [None]:
# Left-join the per-client summary onto the correlation table.
corr_rating_wait = pd.merge(
    corr_rating_wait,
    clients_by_num_sessions,
    on=['service', 'client_id'],
    how='left',
)

corr_rating_wait.head()
corr_rating_wait.shape

Merge with `df` to get the `client_type_desc`.

In [None]:
# Distinct (client_id, client_type_desc) pairs, left-joined on client_id.
client_types = df[['client_id', 'client_type_desc']].drop_duplicates()
corr_rating_wait = pd.merge(
    corr_rating_wait,
    client_types,
    on='client_id',
    how='left',
)

corr_rating_wait.head()
corr_rating_wait.shape

## CL

Client IDs with the largest number of sessions over the whole period.

In [None]:
# Ten "cl" clients with the most sessions.
(corr_rating_wait
 .loc[corr_rating_wait['service'] == "cl"]
 .sort_values(by='num_sessions', ascending=False)
 .head(10))

## WF

In [None]:
# Ten "wf" clients with the most sessions.
(corr_rating_wait
 .loc[corr_rating_wait['service'] == "wf"]
 .sort_values(by='num_sessions', ascending=False)
 .head(10))

In [None]:
service = 'cl'

# Client with the most sessions for this service.
clients_for_service = corr_rating_wait.query('service == @service')
ranked_clients = clients_for_service.sort_values(by='num_sessions', ascending=False)
top_client_id = ranked_clients.client_id.values[0]

# All rows of df belonging to that client within the service.
data = df.query('service == @service and client_id == @top_client_id')



# By `client_type_desc`

## Rating vs Waiting Time by `client_type_desc`

Calculate the average `student_rating` and `sentiment_aggregated`.

In [None]:
grouping = ['service', 'client_type_desc']
cols = ['student_rating_fixed_float', 'sentiment_aggregated']

# Average rating and sentiment for every (service, client_type_desc) group.
# This replaces an unfinished `df.g` expression that raised AttributeError.
df.groupby(grouping)[cols].mean()

### CL

In [None]:
service = 'cl'
# Rows of df belonging to the selected service.
df_subset = df.loc[df['service'] == service]

In [None]:
# Distinct client types present in the selected service's rows.
df_subset.client_type_desc.unique()

In [None]:
# One wide scatter panel (rating vs wait time) per client type.
grid = sns.FacetGrid(data=df_subset, row='client_type_desc', aspect=4)
grid = grid.map(sns.scatterplot, 'wait_seconds', 'student_rating_fixed_float')

# Intents and Topics

In [None]:
# Intent labels (excluding "None") ordered by descending frequency; reused
# as the bar order of the intent count plots.
intents_excl_none = df.loc[df['intent_luis'] != "None", 'intent_luis']
order_intent_full = intents_excl_none.value_counts().index

In [None]:
title = 'Count of Intents (excl NONE)'
x_label, y_label = 'Count', 'Intent'

rows_with_intent = df.query('intent_luis != "None"')

plt.figure(figsize=(13, 5))

# Horizontal bars: intents on the y axis, counts on the x axis.
ax = sns.countplot(data=rows_with_intent, y='intent_luis', order=order_intent_full)
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.set_title(title)

# Saving
filename = title.replace(' ', '_').replace(':', '').lower() + '.png'
image_path = image_dir / filename
plt.tight_layout()

plt.savefig(image_path)

In [None]:
# Intent counts (excluding "None") for each student rating 1..5.
# A single loop replaces five copy-pasted cells that differed only in `value`.
second_dimension = 'student_rating'
x_label = 'Count'
y_label = 'Intent'

for value in range(1, 6):
    title = f'Count of Intents (excl NONE): {second_dimension} = {value}'

    data = df.query(f'intent_luis != "None" and {second_dimension} == @value')['intent_luis']

    # New figure per rating so the charts do not draw over each other.
    plt.figure(figsize=(13, 5))

    # Keep the overall intent order so the charts are directly comparable.
    ax = sns.countplot(y=data, order=order_intent_full)
    ax.set(xlabel=x_label, ylabel=y_label, title=title)

    # Saving
    filename = title.replace(' ', '_').replace(':', '').lower() + '.png'
    image_path = image_dir.joinpath(filename)
    plt.tight_layout()
    plt.savefig(image_path)
    plt.show()

## Mapping to the [SERVQUAL](https://en.wikipedia.org/wiki/SERVQUAL) Categories

In [None]:
# Mapping table used below to merge on `intent_luis`.
intent_mapping_path = config_dir / 'mapping_intents.csv'
intent_mapping = pd.read_csv(intent_mapping_path)

In [None]:
# Preview the first rows of the intent mapping table.
intent_mapping.head()

In [None]:
# NOTE(review): presumably attaches the saved `intent_luis` column found in
# `column_dir` to df; confirm against utils.add_column.
df = utils.add_column(df, column_dir, 'intent_luis')

Merge the `intent_luis` column with the SERVQUAL topics from the intent mapping.

In [None]:
# Left-join the mapping so every row of df is kept.
# NOTE(review): re-running this cell merges the mapping columns a second
# time (suffixed duplicates); run it once per kernel session.
df = pd.merge(df, intent_mapping, on='intent_luis', how='left')

In [None]:
# Preview rows where both the intent and its SERVQUAL category are present.
df[['intent_luis', 'intent_servqual']].dropna().head()

In [None]:
# NOTE(review): presumably persists the column under the name
# 'intent_servqual' inside `column_dir`; confirm against utils.save_object.
utils.save_object(df.intent_servqual, 'intent_servqual', column_dir)

In [None]:
# Spot-check one row by index label.
# NOTE(review): 174 is a hard-coded label; confirm it still points at the
# intended row after the merge in the earlier cell.
df.loc[174, ['intent_luis', 'intent_servqual', 'student_comment']]

In [None]:
# Set the order for the overall data set
order = data.intent_client.value_counts().index

In [None]:
title = 'Comment Category Distribution'
xlabel, ylabel = 'Categories', 'Count'
filename = f"{title.replace(' ', '_').lower()}.png"
image_path = image_dir / filename

data = df.query('intent_client != "none"')[['intent_client']]

ax = sns.countplot(data=data, y='intent_client', order=order)

# Horizontal bars: the count label goes on the x axis and the category
# label on the y axis, hence the swapped variables.
ax.set_xlabel(ylabel)
ax.set_ylabel(xlabel)
ax.set_title(title)

plt.tight_layout()
plt.savefig(image_path)

### Comment Category Distribution by Service

In [None]:
# Comment category distribution, one chart per service.
# A single loop replaces two copy-pasted cells that differed only in `service`.
xlabel = 'Categories'
ylabel = 'Count'

for service in ('cl', 'wf'):
    title = f'Comment Category Distribution ({service.upper()})'
    filename = title.replace(' ', '_').lower() + '.png'
    image_path = image_dir.joinpath(filename)

    data = df.query('service == @service and intent_client != "none"')[['intent_client']]

    # New figure per service so the charts do not draw over each other.
    plt.figure()
    ax = sns.countplot(y='intent_client', data=data, order=order)

    # Horizontal bars: counts on the x axis, categories on the y axis.
    ax.set(xlabel=ylabel, ylabel=xlabel, title=title)

    plt.tight_layout()
    plt.savefig(image_path)
    plt.show()

### Comment Category Distribution by Rating

In [None]:
# Comment category distribution, one chart per student rating 1..5.
# A single loop replaces five copy-pasted cells that differed only in
# `filter_val`.
filter_var = 'student_rating'
xlabel = 'Categories'
ylabel = 'Count'

for filter_val in range(1, 6):
    title = f'Comment Category Distribution (Rating: {filter_val})'
    filename = title.replace(' ', '_').lower() + '.png'
    image_path = image_dir.joinpath(filename)

    data = df.query(f'{filter_var} == {filter_val} and intent_client != "none"')[['intent_client']]

    # New figure per rating so the charts do not draw over each other.
    plt.figure()
    ax = sns.countplot(y='intent_client', data=data, order=order)

    # Horizontal bars: counts on the x axis, categories on the y axis.
    ax.set(xlabel=ylabel, ylabel=xlabel, title=title)

    plt.tight_layout()
    plt.savefig(image_path)
    plt.show()

### Comment Category Distribution by Service and Rating

In [None]:
# Comment category distribution for every (service, rating) combination.
# A nested loop replaces ten copy-pasted cells that differed only in
# `filter_val1` and `filter_val2`.
filter_var1 = 'service'
filter_var2 = 'student_rating'
xlabel = 'Categories'
ylabel = 'Count'

for filter_val1 in ('cl', 'wf'):
    for filter_val2 in range(1, 6):
        title = f'Comment Category Distribution ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
        filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
        image_path = image_dir.joinpath(filename)

        data = df.query(f'{filter_var1} == "{filter_val1}"'
                        f' and {filter_var2} == {filter_val2} and'
                        f' intent_client != "none"')[['intent_client']]

        # New figure per combination so the charts do not draw over each other.
        plt.figure()
        ax = sns.countplot(y='intent_client', data=data, order=order)

        # Horizontal bars: counts on the x axis, categories on the y axis.
        ax.set(xlabel=ylabel, ylabel=xlabel, title=title)

        plt.tight_layout()
        plt.savefig(image_path)
        plt.show()

## Average Sentiment Scores by Categories

In [None]:
# Average sentiment per SERVQUAL category, one chart per service.
# A single loop replaces two copy-pasted cells that differed only in
# `filter_val1`; the group means are computed once and filtered per service.
groupby_vars = ['service', 'intent_servqual']
filter_var1 = 'service'

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'

mean_sentiment = (df
                  .groupby(groupby_vars)['sentiment_aggregated']
                  .mean()
                  .reset_index()
                 )

for filter_val1 in ('cl', 'wf'):
    title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()})'
    filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
    image_path = image_dir.joinpath(filename)

    data = mean_sentiment.query(f'{filter_var1} == @filter_val1')

    # New figure per service so the charts do not draw over each other.
    plt.figure()
    ax = sns.barplot(y='sentiment_aggregated',
                     x='intent_servqual',
                     data=data,
                     )

    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)

    # Slant the category labels so long names do not overlap.
    ax.set_xticklabels(
        labels=ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right',
        )

    plt.tight_layout()
    plt.savefig(image_path)
    plt.show()

## Average Sentiment Scores by Categories (Service: CL, Rating: 1)

In [None]:
filter_var1, filter_val1 = 'service', 'cl'
filter_var2, filter_val2 = 'student_rating', 1
groupby_vars = [filter_var1, 'intent_servqual', filter_var2]

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir / filename

# Mean sentiment for every (service, category, rating) group, then keep
# only the rows for the service and rating charted here.
mean_sentiment = df.groupby(groupby_vars)['sentiment_aggregated'].mean().reset_index()
data = mean_sentiment.query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')

ax = sns.barplot(data=data, x='intent_servqual', y='sentiment_aggregated')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

# Slant the category labels so long names do not overlap.
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: CL, Rating: 2)

In [None]:
filter_var1, filter_val1 = 'service', 'cl'
filter_var2, filter_val2 = 'student_rating', 2
groupby_vars = [filter_var1, 'intent_servqual', filter_var2]

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir / filename

# Mean sentiment for every (service, category, rating) group, then keep
# only the rows for the service and rating charted here.
mean_sentiment = df.groupby(groupby_vars)['sentiment_aggregated'].mean().reset_index()
data = mean_sentiment.query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')

ax = sns.barplot(data=data, x='intent_servqual', y='sentiment_aggregated')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

# Slant the category labels so long names do not overlap.
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: CL, Rating: 3)

In [None]:
filter_var1, filter_val1 = 'service', 'cl'
filter_var2, filter_val2 = 'student_rating', 3
groupby_vars = [filter_var1, 'intent_servqual', filter_var2]

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir / filename

# Mean sentiment for every (service, category, rating) group, then keep
# only the rows for the service and rating charted here.
mean_sentiment = df.groupby(groupby_vars)['sentiment_aggregated'].mean().reset_index()
data = mean_sentiment.query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')

ax = sns.barplot(data=data, x='intent_servqual', y='sentiment_aggregated')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

# Slant the category labels so long names do not overlap.
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: CL, Rating: 4)

In [None]:
filter_var1, filter_val1 = 'service', 'cl'
filter_var2, filter_val2 = 'student_rating', 4
groupby_vars = [filter_var1, 'intent_servqual', filter_var2]

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir / filename

# Mean sentiment for every (service, category, rating) group, then keep
# only the rows for the service and rating charted here.
mean_sentiment = df.groupby(groupby_vars)['sentiment_aggregated'].mean().reset_index()
data = mean_sentiment.query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')

ax = sns.barplot(data=data, x='intent_servqual', y='sentiment_aggregated')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_title(title)

# Slant the category labels so long names do not overlap.
ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45, horizontalalignment='right')

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: CL, Rating: 5)

In [None]:
groupby_vars = ['service', 'intent_servqual', 'student_rating']

filter_var1 = 'service'
filter_val1 = 'cl'
filter_var2 = 'student_rating'
filter_val2 = 5

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir.joinpath(filename)

data = (df
        .groupby(groupby_vars)['sentiment_aggregated']
        .mean()
        .reset_index()
        .query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')
       )

ax = sns.barplot(y='sentiment_aggregated',
                 x='intent_servqual',
                 data=data,
                 )

ax.set(xlabel=xlabel,
       ylabel=ylabel,
       title=title,
      )

ax.set_xticklabels(
    labels=ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    )

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: WF, Rating: 1)

In [None]:
groupby_vars = ['service', 'intent_servqual', 'student_rating']

filter_var1 = 'service'
filter_val1 = 'wf'
filter_var2 = 'student_rating'
filter_val2 = 1

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir.joinpath(filename)

data = (df
        .groupby(groupby_vars)['sentiment_aggregated']
        .mean()
        .reset_index()
        .query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')
       )

ax = sns.barplot(y='sentiment_aggregated',
                 x='intent_servqual',
                 data=data,
                 )

ax.set(xlabel=xlabel,
       ylabel=ylabel,
       title=title,
      )

ax.set_xticklabels(
    labels=ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    )

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: WF, Rating: 2)

In [None]:
groupby_vars = ['service', 'intent_servqual', 'student_rating']

filter_var1 = 'service'
filter_val1 = 'wf'
filter_var2 = 'student_rating'
filter_val2 = 2

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir.joinpath(filename)

data = (df
        .groupby(groupby_vars)['sentiment_aggregated']
        .mean()
        .reset_index()
        .query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')
       )

ax = sns.barplot(y='sentiment_aggregated',
                 x='intent_servqual',
                 data=data,
                 )

ax.set(xlabel=xlabel,
       ylabel=ylabel,
       title=title,
      )

ax.set_xticklabels(
    labels=ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    )

plt.tight_layout()
plt.savefig(image_path)

In [None]:
groupby_vars = ['service', 'intent_servqual', 'student_rating']

filter_var1 = 'service'
filter_val1 = 'wf'
filter_var2 = 'student_rating'
filter_val2 = 3

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir.joinpath(filename)

data = (df
        .groupby(groupby_vars)['sentiment_aggregated']
        .mean()
        .reset_index()
        .query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')
       )

ax = sns.barplot(y='sentiment_aggregated',
                 x='intent_servqual',
                 data=data,
                 )

ax.set(xlabel=xlabel,
       ylabel=ylabel,
       title=title,
      )

ax.set_xticklabels(
    labels=ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    )

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: WF, Rating: 4)

In [None]:
groupby_vars = ['service', 'intent_servqual', 'student_rating']

filter_var1 = 'service'
filter_val1 = 'wf'
filter_var2 = 'student_rating'
filter_val2 = 4

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir.joinpath(filename)

data = (df
        .groupby(groupby_vars)['sentiment_aggregated']
        .mean()
        .reset_index()
        .query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')
       )

ax = sns.barplot(y='sentiment_aggregated',
                 x='intent_servqual',
                 data=data,
                 )

ax.set(xlabel=xlabel,
       ylabel=ylabel,
       title=title,
      )

ax.set_xticklabels(
    labels=ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    )

plt.tight_layout()
plt.savefig(image_path)

## Average Sentiment Scores by Categories (Service: WF, Rating: 5)

In [None]:
groupby_vars = ['service', 'intent_servqual', 'student_rating']

filter_var1 = 'service'
filter_val1 = 'wf'
filter_var2 = 'student_rating'
filter_val2 = 5

xlabel = 'SERVQUAL Categories'
ylabel = 'Average Sentiment Score'
title = f'Average Sentiment ({filter_var1.title()}: {filter_val1.upper()}, Rating: {filter_val2})'
filename = title.replace(' ', '_').replace(':', '_').lower() + '.png'
image_path = image_dir.joinpath(filename)

data = (df
        .groupby(groupby_vars)['sentiment_aggregated']
        .mean()
        .reset_index()
        .query(f'{filter_var1} == @filter_val1 and {filter_var2} == @filter_val2')
       )

ax = sns.barplot(y='sentiment_aggregated',
                 x='intent_servqual',
                 data=data,
                 )

ax.set(xlabel=xlabel,
       ylabel=ylabel,
       title=title,
      )

ax.set_xticklabels(
    labels=ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
    )

plt.tight_layout()
plt.savefig(image_path)

# Word Cloud

In [None]:
wordcloud_string = ' '.join(list(data_df_comments.student_comment_no_stopwords.values))

In [None]:
wordcloud = WordCloud(background_color="white", 
                      max_words=20, 
                      contour_width=3, 
                      contour_color='steelblue',
                      collocations=False)

In [None]:
wordcloud.generate(wordcloud_string)

In [None]:
wordcloud.to_image()

# Matching Phrases Using `spaCy`

In [None]:
matcher = Matcher(nlp.vocab)

In [None]:
# Create a pattern for something like "did something wrong"
pattern_name = 'DID_SOMETHING_WRONG'
pattern = [{'POS': 'VERB'}, {'POS': 'DET', 'OP': '?'}, {'LOWER': 'wrong'}, {'POS': 'NOUN'}]
matcher.add(pattern_name, None, pattern)

In [None]:
# Create a pattern for something like "pressed the wrong button"
pattern_name = 'PRESSED_WRONG_BUTTON'
pattern = [{'POS': 'VERB'}, {'POS': 'DET', 'OP': '?'}, {'LOWER': 'wrong'}, {'LOWER': 'button'}]
matcher.add(pattern_name, None, pattern)

In [None]:
def get_match_list(doc):
    """Collect every hit of the module-level `matcher` in a document.

    Each hit becomes a single-entry dict mapping the pattern name (called
    `string_id` in the official documentation) to the matched span `doc[start:end]`.

    Returns the list of those dicts, or False when nothing matched.
    """
    hits = [
        {nlp.vocab.strings[pattern_id]: doc[first:last]}
        for pattern_id, first, last in matcher(doc)
    ]

    # An empty list is reported as False, so callers can rely on truthiness only.
    return hits or False

In [None]:
# Boolean mask: True for every processed comment with at least one pattern match.
mask_press_wrong_button = data_df_comments.student_comment_processed.apply(
    lambda processed_doc: bool(get_match_list(processed_doc))
)
print(mask_press_wrong_button.sum())

In [None]:
[*zip(data_df_comments.student_comment_processed[mask_press_wrong_button].apply(get_match_list), data_df_comments.student_comment_processed[mask_press_wrong_button])]

In [None]:
data_df_comments[mask_press_wrong_button][['student_comment', 'student_rating', 'start_at']]

In [None]:
sns.countplot(x='service', data=data_df_comments[mask_press_wrong_button])

In [None]:
sns.countplot(x='student_rating', data=data_df_comments[mask_press_wrong_button])

# Sentiment

In [None]:
data_df_comments.groupby('student_rating')['sentiment_textblob'].mean().plot(kind='bar')

### Distribution of Ratings vs Sentiment (TextBlob)

In this section we want to see the distribution of the ratings and the distribution of the sentiment. Note that the plot of the ratings doesn't include the rows without ratings, so the sentiment data is subsetted in the same way.

In [None]:
title = 'Distribution of Ratings'
sns.distplot(data_df_comments[data_df_comments.student_rating.notna()]['student_rating'],
             kde=False,
             rug=False).set_title(title)

In [None]:
title = 'Distribution of Sentiments (TextBlob)'
sns.distplot(data_df_comments[data_df_comments.student_rating.notna()]['sentiment_textblob'],
             kde=False,
             rug=False).set_title(title)

There are 153 rows which don't have a rating. Let's see the distribution of the sentiments for these rows.

In [None]:
sns.distplot(data_df_comments[data_df_comments.student_rating.isna()]['sentiment_textblob'],
             kde=False,
             rug=True).set_title("Blank Rating: Distribution of TextBlob Sentiment")

The distribution is quite wide from -0.5 to a max of 1.0.

### Rating/Sentiment Inconsistencies `TextBlob`

In [None]:
data_df_comments.query('sentiment_textblob < 0 and student_rating > 3')[['student_rating', 'student_comment_apostrophe', 'sentiment_textblob']]

### `TextBlob` Caveats<a id='textblob-caveats'></a>

In [None]:
test_sentences = ["It's anything but good.",
                  "It's good.",
                  "Extremely helpful.",
                  "Very helpful."]

In [None]:
for sent in test_sentences:
    print(f"Sentence: {sent} \nScore: {TextBlob(sent).sentiment.polarity}")

In [None]:
print(TextBlob("It's anything but good.").sentiment)
print(TextBlob("It's good.").sentiment)
print(TextBlob("Extremely helpful").sentiment)
print(TextBlob("Very helpful").sentiment)

# Aggregated Sentiment Scores by SERVQUAL Categories

In [None]:
cols = [
    'sentiment_textblob', 
    'sentiment_vader',
    'sentiment_luis',
    'sentiment_aggregated',
]

group_cols = [
    'intent_servqual'
]

aggregated_sentiment_total_df = df.groupby(group_cols)[cols].mean()
aggregated_sentiment_total_df

In [None]:
filepath = report_dir.joinpath('aggregated_sentiment_total.csv')
aggregated_sentiment_total_df.to_csv(filepath)

In [None]:
cols = [
    'sentiment_textblob', 
    'sentiment_vader',
    'sentiment_luis',
    'sentiment_aggregated',
]

group_cols = [
    'student_rating',
    'intent_servqual'
]

aggregated_sentiment_df = df.groupby(group_cols)[cols].mean()
aggregated_sentiment_df

# By Student

In [None]:
df.columns

## Number of Unique Students

There are 113411 unique students. This averages to about 4.5 sessions per student over the analysis period. Naturally there is variation: some students used the service only once and others multiple times.

In [None]:
# Only the last expression of a cell is displayed, so the unique-student count
# on the first line was silently discarded. Print both figures explicitly.
n_unique_students = df.student_id.nunique()
print(f"Number of unique students:    {n_unique_students}")
print(f"Average sessions per student: {df.shape[0] / n_unique_students:.2f}")

### Number of Unique Students by `service`

In [None]:
df_unique = pd.DataFrame({'num_sessions': df.groupby('service')['student_id'].count(),
                          'num_unique_students': df.groupby('service')['student_id'].nunique(),
                          'num_unique_tutors': df.groupby('service')['tutor_id'].nunique()})

df_unique['perc_unique_students'] = df_unique.num_unique_students / df_unique.num_sessions
df_unique['perc_unique_tutors'] = df_unique.num_unique_tutors / df_unique.num_sessions

print(df_unique.transpose())

In [None]:
df_unique

There is a slightly higher percentage of unique students in the WF service than in the CL service. In other words, there are more repeat students in CL, though not by much.

For the tutors, however, there are far more repeats, with only 0.3% and 0.2% uniqueness for CL and WF respectively.

In [None]:
df_unique=df_unique.reset_index().melt(id_vars=['service'])
df_unique

In [None]:
# Derive the party from the suffix of the melted variable name instead of a
# hard-coded 10-element list, which only fits exactly two services and one
# particular row order. Variables without a `_students`/`_tutors` suffix
# (i.e. `num_sessions`) are labelled 'total'.
df_unique['party'] = (
    df_unique['variable']
    .str.extract(r'_(students|tutors)$', expand=False)
    .fillna('total')
)
df_unique

In [None]:
df_unique['variable'] = df_unique.variable.str.replace('_students', '')
df_unique['variable'] = df_unique.variable.str.replace('_tutors', '')

In [None]:
df_unique

In [None]:
df_unique.query('variable == "perc_unique" and party != "total"')

In [None]:
plot_df = df_unique.query('variable == "perc_unique" and party == "students"')

ax = sns.barplot(x='service', y='value', data=plot_df)
ax.set(title = '% of Unique Students',
       xlabel = 'service',
       ylabel = '')

In [None]:
plot_df = df_unique.query('variable == "perc_unique" and party == "tutors"')

ax = sns.barplot(x='service', y='value', data=plot_df)
ax.set(title = '% of Unique Tutors',
       xlabel = 'service',
       ylabel = '')

## Rating Distribution Per Student

First add a column that is 1 if there is a comment and 0 otherwise.

In [None]:
comment_ind = df.student_comment.apply(lambda x: 1 if len(x) > 0 else 0)

In [None]:
utils.save_object('comment_ind', comment_ind, column_dir)

In [None]:
df = utils.add_column(df, 'comment_ind')

In [None]:
df_unique_students = pd.DataFrame({'num_comments': df.groupby(['student_id'])['comment_ind'].sum(),
                                   'average_num_comments': df.groupby(['student_id'])['comment_ind'].mean(),
                                   'average_comments_word_length': df.groupby(['student_id'])['length_word_comment'].mean(),
                                   'std_comments_word_length': df.groupby(['student_id'])['length_word_comment'].std()})

In [None]:
df_unique_students.head()

Percentage of students who comment:

In [None]:
num_unique_students_commented = df_unique_students.query('num_comments > 0').shape[0]
num_unique_students = df_unique_students.shape[0]
average_students_commented = num_unique_students_commented/num_unique_students

In [None]:
print(f"Number of students who commented:          {num_unique_students_commented}")
print(f"Total number of unique students:           {num_unique_students}")
print(f"Average number of students who commented:  {average_students_commented: .2f}")

In [None]:
sns.distplot(a=df_unique_students.reset_index().query('num_comments > 0')['average_num_comments'],
             kde=False)

# Correlation: Waiting Time vs `student_rating_fixed`

Waiting time has different meanings in CL and WF. In CL it's the time that the student waited to be matched with a tutor; the scale is in seconds. In WF it's the time between submission and the student receiving the feedback on their document, which can be up to several days.

There are {{len(df_merged.client_type_desc.unique())}} different values of `client_type_desc`.

In [None]:
len(df_merged.client_type_desc.unique())

In [None]:
filter_var = 'service'
filter_val = 'CL'
op = '=='
var1 = 'student_rating'
var2 = 'wait_seconds'
subset_list = [var1, var2]

# cl_df_formatted[subset_list].dropna(subset=['student_rating']).corr()

sns.swarmplot(x=var1, y=var2, data=cl_df_formatted[subset_list].dropna(subset=['student_rating']))

## Writing Feedback Waiting Time vs `student_rating_fixed`

In [None]:
waiting_time_groups = ['service', 'client_type', ]

In [None]:
wf_df_formatted.columns

In [None]:
wf_waiting_time = wf_df_formatted.completed_at - wf_df_formatted.start_at
wf_waiting_time.head()

In [None]:
wf_waiting_time.describe()

Convert the `Timedelta` objects to seconds so it can be joined with the waiting time column of Connect Live.

In [None]:
wf_df_formatted['wait_seconds'] = wf_waiting_time.apply(utils.get_seconds_from_timedelta)

In [None]:
def calc_td_stats(data, func = np.mean):
    """Reduce timedelta-like data with `func` and return the result as a timedelta.

    Parameters
    ----------
    data
        Object exposing `.values` whose array can be cast to int64
        (presumably a pandas Series of timedeltas — TODO confirm against callers).
    func : callable, default np.mean
        Reducer applied to the int64 view of the values.

    Returns
    -------
    The reducer's output converted with `pd.to_timedelta`.
    """
    as_int64 = data.values.astype(np.int64)
    statistic = func(as_int64)
    return pd.to_timedelta(statistic)

In [None]:
wf_df_formatted.groupby('student_rating')['wait_seconds']

In [None]:
data = pd.DataFrame({'mean_wait_time': wf_df_formatted.groupby('student_rating')['wait_seconds'].mean()
                     ,'std_wait_time': wf_df_formatted.groupby('student_rating')['wait_seconds'].std()})

In [None]:
filter_var = 'service'
filter_val = 'WF'
op = '=='
var1 = data.index
var2 = 'mean_wait_time'
subset_list = [var1, var2]

title = f'Average Wait Time vs Student Rating: service = {filter_val}'
x_label = 'Student Rating'
y_label = 'Average Time (Seconds)'

ax = sns.barplot(x=var1
                 ,y=var2
                 , data=data
                )

ax.set(title=title
       ,xlabel=x_label
       ,ylabel=y_label)

In [None]:
filter_var = 'service'
filter_val = 'WF'
op = '=='
var1 = data.index
var2 = 'std_wait_time'
subset_list = [var1, var2]

title = f'Standard Deviation Wait Time vs Student Rating: service = {filter_val}'
x_label = 'Student Rating'
# This chart plots `std_wait_time`, so the axis must not be labelled as an average.
y_label = 'Standard Deviation of Time (Seconds)'

ax = sns.barplot(x=var1
                 ,y=var2
                 , data=data
                )

ax.set(title=title
       ,xlabel=x_label
       ,ylabel=y_label)

## Connect Live Waiting Time vs `student_rating_fixed`

In [None]:
data = pd.DataFrame({'mean_wait_time': cl_df_formatted.groupby('student_rating')['wait_seconds'].mean()
                     ,'std_wait_time': cl_df_formatted.groupby('student_rating')['wait_seconds'].std()})

In [None]:
filter_var = 'service'
filter_val = 'CL'
op = '=='
var1 = data.index
var2 = 'mean_wait_time'
subset_list = [var1, var2]

title = f'Average Wait Time vs Student Rating: service = {filter_val}'
x_label = 'Student Rating'
y_label = 'Average Time (Seconds)'

ax = sns.barplot(x=var1
                 ,y=var2
                 , data=data
                )

ax.set(title=title
       ,xlabel=x_label
       ,ylabel=y_label)



In [None]:
filter_var = 'service'
filter_val = 'CL'
op = '=='
var1 = data.index
var2 = 'std_wait_time'
subset_list = [var1, var2]

title = f'Standard Deviation Wait Time vs Student Rating: service = {filter_val}'
# Define the axis labels in this cell rather than relying on values left over
# from the previous cell; the y label describes the plotted `std_wait_time`.
x_label = 'Student Rating'
y_label = 'Standard Deviation of Time (Seconds)'

ax = sns.barplot(x=var1
                 ,y=var2
                 , data=data
                )

ax.set(title=title
       ,xlabel=x_label
       ,ylabel=y_label)

# Intents

In [None]:
df.query('luis_intent_pickle != "None"').luis_intent_pickle.value_counts().index

In [None]:
order_intent_full = df.query('luis_intent_pickle != "None"').luis_intent_pickle.value_counts().index

In [None]:
title = 'Count of Intents (excl NONE)'
x_label = 'Count'
y_label = 'Intent'

ax = sns.countplot(y='luis_intent_pickle'
                   ,data = df.query('luis_intent_pickle != "None"')
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

# Saving
filename = title.replace(' ', '_').replace(':', '').lower() + '.png'
image_path = image_dir.joinpath(filename)
plt.tight_layout()
plt.savefig(image_path)

## Intents by Sex

In [None]:
sex = 'male'

title = f'Count of Intents (excl NONE): {sex}'
x_label = 'Count'
y_label = 'Intent'

data = df.query('luis_intent_pickle != "None" and gender_guess_mfu == @sex')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
sex = 'female'

title = f'Count of Intents (excl NONE): {sex}'
x_label = 'Count'
y_label = 'Intent'

data = df.query('luis_intent_pickle != "None" and gender_guess_mfu == @sex')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
sex = 'unknown'

title = f'Count of Intents (excl NONE): {sex}'
x_label = 'Count'
y_label = 'Intent'

data = df.query('luis_intent_pickle != "None" and gender_guess_mfu == @sex')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

## Intents by Rating

In [None]:
second_dimension = 'student_rating'
value = 1

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
second_dimension = 'student_rating'
value = 2

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
second_dimension = 'student_rating'
value = 3

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
second_dimension = 'student_rating'
value = 4

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
second_dimension = 'student_rating'
value = 5

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

## Intents by Service

In [None]:
second_dimension = 'service'
value = 'CL'

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
second_dimension = 'service'
value = 'WF'

title = f'Count of Intents (excl NONE): {second_dimension} = {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} == @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

# Word Cloud

In [None]:
wordcloud_string = ' '.join(list(data_df_comments.student_comment_no_stopwords.values))

In [None]:
wordcloud = WordCloud(background_color="white", 
                      max_words=20, 
                      contour_width=3, 
                      contour_color='steelblue',
                      collocations=False)

In [None]:
wordcloud.generate(wordcloud_string)

In [None]:
wordcloud.to_image()

### Wordcloud by Rating 

In [None]:
def generate_wordcloud(data: pd.DataFrame, rating: int = None) -> WordCloud:
    """Build a word cloud from the `student_comment_no_stopwords` column.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain `student_comment_no_stopwords`, and `student_rating`
        whenever `rating` is given.
    rating : int, optional
        Keep only rows whose `student_rating` equals this value.
        None (the default) keeps every row.

    Returns
    -------
    WordCloud
        Generated from the space-joined comments, limited to 20 words.
    """
    rows = data if rating is None else data.query('student_rating == @rating')
    text = ' '.join(list(rows.student_comment_no_stopwords.values))

    cloud = WordCloud(
        background_color="white",
        max_words=20,
        contour_width=3,
        contour_color='steelblue',
        collocations=False,
    )
    return cloud.generate(text)

In [None]:
generate_wordcloud(data = data_df_comments, rating = 1).to_image()

In [None]:
generate_wordcloud(data = data_df_comments, rating = 2).to_image()

In [None]:
generate_wordcloud(data = data_df_comments, rating = 3).to_image()

In [None]:
generate_wordcloud(data = data_df_comments, rating = 4).to_image()

In [None]:
generate_wordcloud(data = data_df_comments, rating = 5).to_image()

There seems to be a lot of "feedback". Let's see what the actual context is.

In [None]:
data_df_comments[data_df_comments.student_comment.str.contains('feedback')][['student_rating', 'student_comment']]

# ngrams (Combined CL and WF)

In [None]:
wordcloud = WordCloud(max_words = 8, background_color='white')

### Remove Punctuation and Stopwords

In [None]:
data_df_comments['student_comment_nopunct'] = data_df_comments.student_comment_processed.apply(lambda x: ' '.join([token.orth_.lower() for token in x if not token.is_punct]))

In [None]:
data_df_comments['student_comment_nopunct_nostopwords'] = data_df_comments.student_comment_processed.apply(lambda x: ' '.join([token.orth_.lower() for token in x if not token.is_stop and not token.is_punct]))

In [None]:
def create_ngram_dict(text_col: pd.Series, n: int) -> defaultdict:
    """Count how often each n-gram occurs across a column of texts.

    Every text is split with `word_tokenize`; each n-gram is stored under
    its tokens joined by single spaces.

    Returns a defaultdict(int) mapping the n-gram string to its count.
    """
    counts = defaultdict(int)
    for document in text_col:
        for gram in ngrams(word_tokenize(document), n):
            counts[' '.join(gram)] += 1

    return counts

In [None]:
def ddict_to_df(ddict):
    """Turn a {name: frequency} mapping into a DataFrame sorted by frequency.

    Returns a DataFrame with the columns `word` and `frequency`, ordered from
    the most to the least frequent entry. The index keeps each entry's
    position in the mapping's iteration order.
    """
    words = list(ddict.keys())
    counts = list(ddict.values())

    frame = pd.DataFrame({'word': words, 'frequency': counts})
    return frame.sort_values(by='frequency', ascending=False)

Create a function to produce the ngram frequencies and charts.

In [None]:
def create_ngram(df, ngram, rating, service):
    """Plot the ten most frequent n-grams for a rating/service subset of comments.

    Parameters
    ----------
    df : DataFrame with `student_comment_nopunct` and
        `student_comment_nopunct_nostopwords`, plus `student_rating` / `service`
        when the corresponding filter is used.
    ngram : n-gram size; 1-4 have a named label in the chart title.
    rating : value to match against `student_rating`; a falsy value disables the filter.
    service : value to match against `service`; a falsy value disables the filter.

    Side effects: regenerates the module-level `wordcloud` from the n-gram
    frequencies and shows a seaborn bar chart. Returns None.
    """
    # Build the row filter from whichever of rating/service is truthy.
    conditions = []
    if rating:
        conditions.append('student_rating == @rating')
    if service:
        conditions.append('service == @service')
    subset = df.query(' and '.join(conditions)) if conditions else df

    # Unigrams are counted on the stop-word-free text; longer n-grams keep stop words.
    if ngram == 1:
        comments = subset.student_comment_nopunct_nostopwords
    else:
        comments = subset.student_comment_nopunct

    ngram_freq = create_ngram_dict(comments, ngram)
    wordcloud.generate_from_frequencies(ngram_freq)
    # NOTE(review): the image is built but neither returned nor displayed here.
    wordcloud.to_image()

    top_ngrams = ddict_to_df(ngram_freq).head(10)

    # Sizes without an entry fall back to None, which is rendered as 'None' in the title.
    labels = {1: 'Unigram', 2: 'Bigram', 3: 'Trigram', 4: 'Four-gram'}
    title = f'{labels.get(ngram)} Rating: {rating} {service}'

    ax = sns.barplot(x='frequency', y='word', data=top_ngrams)
    ax.set_title(title)
    plt.show()

The following section loops through:
- ngrams 1-4
- rating 1-5
- service CL and WF

## Unigrams

In [None]:
ngram = 1

for rating, service in product(range(1, 6), ('CL', 'WF')):
    create_ngram(df = data_df_comments, ngram = ngram, rating = rating, service = service)

## Bigrams

In [None]:
ngram = 2

for rating, service in product(range(1, 6), ('CL', 'WF')):
    create_ngram(df = data_df_comments, ngram = ngram, rating = rating, service = service)

## Trigrams

In [None]:
ngram = 3

for rating, service in product(range(1, 6), ('CL', 'WF')):
    create_ngram(df = data_df_comments, ngram = ngram, rating = rating, service = service)

## Four-grams

In [None]:
ngram = 4

for rating, service in product(range(1, 6), ('CL', 'WF')):
    create_ngram(df = data_df_comments, ngram = ngram, rating = rating, service = service)

## Intents by Sentiment

In [None]:
second_dimension = 'sentiment_aggregated'
value = 0
operator = '<='
op_dict = {'==': 'is'
           ,'<': 'is less than'
           ,'>': 'is greater than'
           ,'<=': 'is less than or equal to'
           ,'>=': 'is greater than or equal to'
          }

title = f'Count of Intents (excl NONE): {second_dimension.title()} {op_dict[operator]} {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} {operator} @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
second_dimension = 'sentiment_aggregated'
value = 0
operator = '>'
op_dict = {'==': 'is'
           ,'<': 'is less than'
           ,'>': 'is greater than'
           ,'<=': 'is less than or equal to'
           ,'>=': 'is greater than or equal to'
          }

title = f'Count of Intents (excl NONE): {second_dimension.title()} {op_dict[operator]} {value}'
x_label = 'Count'
y_label = 'Intent'

data = df.query(f'luis_intent_pickle != "None" and {second_dimension} {operator} @value')['luis_intent_pickle']

ax = sns.countplot(y=data
                   ,order = order_intent_full
                  )

ax.set(xlabel=x_label
       ,ylabel=y_label
       ,title=title)

In [None]:
filepath = report_dir.joinpath('aggregated_sentiment_rating_vs_servqual.csv')
aggregated_sentiment_df.to_csv(filepath)

# Correlations

In [None]:
# Reorder the columns so that 'student_rating_numeric' is the first.
columns = (
    ['student_rating_numeric'] 
    + [col for col in df.columns if col != 'student_rating_numeric']
)

In [None]:
corr_df = df.loc[:, columns].corr()

In [None]:
f = plt.figure(figsize=(19, 15))
sns.heatmap(corr_df)

title = "Correlations"
filename = title + '.png'

plt.title(title)
plt.savefig(image_dir.joinpath(filename))

In [None]:
# Enumerate the column names
f = plt.figure(figsize=(19, 15))

enumerated_columns = range(len(corr_df.index))

sns.heatmap(
    corr_df,
    xticklabels=enumerated_columns,
    yticklabels=enumerated_columns,
)

title = "Correlations"
filename = title + '_unlabeled.png'

plt.title(title)
plt.savefig(image_dir.joinpath(filename))

## Categorical

In [None]:
def cramers_v(x, y):
    """Cramér's V association between two categorical variables.

    The chi-squared statistic of the `pd.crosstab(x, y)` contingency table is
    turned into phi², and both phi² and the table dimensions are adjusted
    with (n - 1)-based correction terms before the ratio is taken.

    Parameters
    ----------
    x, y : array-likes accepted by `pd.crosstab`.

    Returns
    -------
    Square root of the corrected phi² divided by the smaller corrected
    dimension minus one.
    """
    table = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(table)[0]
    total = table.sum().sum()
    n_rows, n_cols = table.shape

    # phi² is clipped at zero because the correction term can exceed it.
    phi2_adj = max(0, chi2_stat / total - ((n_cols - 1) * (n_rows - 1)) / (total - 1))
    rows_adj = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_adj = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    return np.sqrt(phi2_adj / min((cols_adj - 1), (rows_adj - 1)))

In [None]:
import importlib
importlib.reload(utils)