In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud

In [None]:
# Directory holding the CSV exports, relative to the notebook's working directory.
# Change this single constant if the data lives somewhere else.
DATA_DIR = '../data'

df_reviews = pd.read_csv(f'{DATA_DIR}/reviews.csv')
# NOTE(review): 'Submissions.csv' is capitalised unlike the other file names —
# confirm it matches the file on disk (this matters on case-sensitive filesystems).
df_submissions = pd.read_csv(f'{DATA_DIR}/Submissions.csv')
df_dec = pd.read_csv(f'{DATA_DIR}/decision.csv')
df_keyword = pd.read_csv(f'{DATA_DIR}/submission_keyword.csv')

In [None]:
# Summarise column names, dtypes and non-null counts instead of rendering the
# whole frame; the first rows are previewed in the following cell.
df_submissions.info()

In [None]:
# Preview the first rows of the submissions table.
df_submissions.head()

In [None]:
# Preview the first rows of the reviews table.
df_reviews.head()

In [None]:
# Preview the first rows of the decisions table.
df_dec.head()

In [None]:
# Mean review rating for each forum: group the ratings by forum first,
# then average and turn the group key back into a regular column.
rating_by_forum = df_reviews.groupby('forum')['rating_int']
df_average_review_score = rating_by_forum.mean().reset_index()
df_average_review_score

# Review Length vs. Review Score and Decision

In [None]:
# Step 1 - attach the decision of each forum to every one of its reviews.
# Both frames are joined on 'forum'; the inner join keeps only reviews whose
# forum also appears in the decision table.
# 'conf_name_y' is the decision-table copy of a column present in both frames
# (pandas appends '_y' to the right-hand side of a name clash).
kept_columns = ['review', 'decision', 'conf_name_y', 'rating_int', 'forum']

# Merge and select in one expression so no reference to the wide merged frame
# is kept around.
df_rev_dec = df_reviews.merge(df_dec, on='forum', how='inner')[kept_columns]

df_rev_dec.head()

## Text Cleaning or Word Normalization

In [None]:
# Case folding: lower-case every review so later word matching is case-insensitive.
# NOTE(review): the sentence counts further down are computed from this
# lower-cased text; if the sentence tokenizer relies on capitalisation cues the
# counts may differ from those on the raw text — worth confirming.
lowercased_reviews = df_rev_dec['review'].str.lower()
df_rev_dec['review'] = lowercased_reviews

In [None]:
# List the distinct raw decision labels before normalising them.
pd.unique(df_rev_dec['decision'])

In [None]:
#clean up decision text

def clean_up_decision(text):
    """Collapse the presentation-specific accept labels into plain 'Accept'.

    Parameters
    ----------
    text : object
        A raw decision label.

    Returns
    -------
    object
        'Accept' when ``text`` is exactly one of the four known accept
        variants (Poster, Spotlight, Oral, Talk); otherwise ``text`` unchanged.
    """
    accept_variants = (
        'Accept (Poster)',
        'Accept (Spotlight)',
        'Accept (Oral)',
        'Accept (Talk)',
    )
    return 'Accept' if text in accept_variants else text

    
# Normalised decision label for every review row.
df_rev_dec['decision_clean'] = df_rev_dec['decision'].map(clean_up_decision)

## Number of Sentences Per Review

In [None]:
def sentence_count(text):
    """Return the number of sentences ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)


# One sentence count per review row.
df_rev_dec['sent_count'] = df_rev_dec['review'].map(sentence_count)

In [None]:
# Preview the first rows, now including the new 'sent_count' column.
df_rev_dec.head()

In [None]:
# 2-D histogram of review length (in sentences) against review rating,
# coloured by the cleaned decision label.
fig, ax = plt.subplots()
sns.histplot(
    data=df_rev_dec,
    x='sent_count',
    y='rating_int',
    hue='decision_clean',
    # `kde=True` was dropped: seaborn only applies it to univariate histograms,
    # so it had no effect on this bivariate plot.
    log_scale=(True, False),  # log-scale the length axis only
    legend=True,
    ax=ax,
)

# The x variable counts sentences per review, not per submission.
ax.set(
    xlabel='Review Length (# Sentences)',
    ylabel='Review Rating',
    title='Review Rating vs. Review Length (Sentences)',
)

# Move the legend outside the plot
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

fig.tight_layout()
plt.show()

## Number of Tokens Per Review

- Text Preprocessing: Tokenization converts raw text data into a structured format that can be easily processed by NLP algorithms and models.


- Feature Extraction: Tokens serve as features or inputs for various NLP tasks, such as text classification, sentiment analysis, and machine translation.


- Handling Out-of-Vocabulary (OOV) Words: Certain tokenization techniques, like subword tokenization, can handle previously unseen or rare words by breaking them down into smaller, meaningful units.


- Language Understanding: Tokenization helps NLP models understand the structure and meaning of text by separating it into meaningful units.

In [None]:

# Strip punctuation: every character in string.punctuation is deleted outright
# (not replaced by a space).
punctuation_table = str.maketrans('', '', string.punctuation)
df_rev_dec['clean_review_word'] = df_rev_dec['review'].str.translate(punctuation_table)

# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))


def _content_tokens(text):
    """Tokenize ``text`` and keep only the tokens that are not stop words."""
    return [token for token in word_tokenize(text) if token not in stop_words]


# Tokenize and filter in one pass, then record how many tokens remain per review.
df_rev_dec['tokens'] = df_rev_dec['clean_review_word'].apply(_content_tokens)
df_rev_dec['tokens_counts'] = df_rev_dec['tokens'].map(len)

In [None]:
# Summary statistics (count, mean, spread, quartiles) are more informative than
# a truncated dump of the raw per-review counts.
df_rev_dec['tokens_counts'].describe()

In [None]:
# 2-D histogram of review length (in non-stop-word tokens) against review
# rating, coloured by the cleaned decision label.
fig, ax = plt.subplots()
sns.histplot(
    data=df_rev_dec,
    x='tokens_counts',
    y='rating_int',
    hue='decision_clean',
    # `kde=True` was dropped: seaborn only applies it to univariate histograms,
    # so it had no effect on this bivariate plot.
    log_scale=(True, False),  # log-scale the length axis only
    legend=True,
    ax=ax,
)

# The x variable counts tokens per review, not per submission.
ax.set(
    xlabel='Review Length (# Tokens)',
    ylabel='Review Rating',
    title='Review Rating vs. Review Length (Tokens)',
)

# Move the legend outside the plot
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

fig.tight_layout()
plt.show()

## Average Token and Sentence Count

In [None]:
# Average the rating and both length measures over all reviews that share a
# forum and cleaned decision label.
review_metric_cols = ['rating_int', 'tokens_counts', 'sent_count']
df_rev_dec_ave = (
    df_rev_dec
    .groupby(['forum', 'decision_clean'])[review_metric_cols]
    .mean()
    .reset_index()
)

In [None]:
# 2-D histogram of the per-forum mean review length (tokens) against the
# per-forum mean rating, coloured by the cleaned decision label.
fig, ax = plt.subplots()
sns.histplot(
    data=df_rev_dec_ave,
    x='tokens_counts',
    y='rating_int',
    hue='decision_clean',
    # `kde=True` was dropped: seaborn only applies it to univariate histograms,
    # so it had no effect on this bivariate plot.
    log_scale=(True, False),  # log-scale the length axis only
    legend=True,
    ax=ax,
)

# Both axes show group means of review-level values, so label them as such.
ax.set(
    xlabel='Mean Review Length (# Tokens)',
    ylabel='Mean Review Rating',
    title='Per-Forum Mean Rating vs. Mean Review Length (Tokens)',
)

# Move the legend outside the plot
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

fig.tight_layout()
plt.show()

In [None]:
# 2-D histogram of the per-forum mean review length (sentences) against the
# per-forum mean rating, coloured by the cleaned decision label.
fig, ax = plt.subplots()
sns.histplot(
    data=df_rev_dec_ave,
    x='sent_count',
    y='rating_int',
    hue='decision_clean',
    # `kde=True` was dropped: seaborn only applies it to univariate histograms,
    # so it had no effect on this bivariate plot.
    log_scale=(True, False),  # log-scale the length axis only
    legend=True,
    ax=ax,
)

# Both axes show group means of review-level values, so label them as such.
ax.set(
    xlabel='Mean Review Length (# Sentences)',
    ylabel='Mean Review Rating',
    title='Per-Forum Mean Rating vs. Mean Review Length (Sentences)',
)

# Move the legend outside the plot
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

fig.tight_layout()
plt.show()

In [None]:
# Concatenate the punctuation-stripped reviews into one corpus string.
# Missing values are dropped so the join never receives a non-string.
text = ' '.join(df_rev_dec['clean_review_word'].dropna().astype(str))

# Generate word cloud; a fixed seed keeps the layout identical across re-runs.
wordcloud = WordCloud(random_state=42).generate(text)

# Display word cloud
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Review Text')
plt.show()

In [None]:
# Concatenate the primary keywords into one corpus string.
# This column comes straight from the CSV, so drop missing values first:
# ' '.join raises TypeError on a NaN (float) entry.
text = ' '.join(df_keyword['primary_keyword'].dropna().astype(str))

# Generate word cloud; a fixed seed keeps the layout identical across re-runs.
wordcloud = WordCloud(random_state=42).generate(text)

# Display word cloud
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Primary Keywords')
plt.show()

In [None]:
# Concatenate the abstracts into one corpus string.
# This column comes straight from the CSV, so drop missing values first:
# ' '.join raises TypeError on a NaN (float) entry.
text = ' '.join(df_submissions['abstract'].dropna().astype(str))

# Generate word cloud; a fixed seed keeps the layout identical across re-runs.
wordcloud = WordCloud(random_state=42).generate(text)

# Display word cloud
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Submission Abstracts')
plt.show()