In [1]:
import pandas as pd
from collections import defaultdict
import nltk
import re
import random
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize
import string
from nltk import bigrams, FreqDist, ConditionalFreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
# Load the raw reviews CSV (relative path, so it is resolved against the
# notebook's working directory), print its (rows, columns) shape and preview
# the first rows.
df = pd.read_csv('Coursera_reviews.csv')
print(df.shape)
df.head()

(1048309, 5)


Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4.0,google-cbrs-cpi-training
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4.0,google-cbrs-cpi-training
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4.0,google-cbrs-cpi-training
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4.0,google-cbrs-cpi-training
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4.0,google-cbrs-cpi-training


In [3]:
# Drop rows whose review text is missing and work on an independent copy,
# so that columns added later do not touch the raw frame `df`.
df1 = df.dropna(subset=['reviews']).copy()
# For quick experiments, swap in this line to keep only the first 100 rows:
# df1 = df.dropna(subset=['reviews']).iloc[0:100].copy()
df1.head()

Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4.0,google-cbrs-cpi-training
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4.0,google-cbrs-cpi-training
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4.0,google-cbrs-cpi-training
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4.0,google-cbrs-cpi-training
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4.0,google-cbrs-cpi-training


In [None]:
# Matches every character that is neither an ASCII letter nor whitespace.
# This covers punctuation, digits and any other symbol in a single pass.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Filled on first use so the stopword corpus is read once instead of once
# per review.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Clean one review and return it as a space-separated string of tokens.

    Steps:
      1. Delete every character that is not an ASCII letter or whitespace.
         Characters are removed, not replaced by a space, so ``'good.bad'``
         becomes ``'goodbad'``.
      2. Lower-case the text and tokenise it with ``word_tokenize``.
      3. Drop English stopwords.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. If ``text`` is not a
        string, the sentinel ``'majorissue'`` is returned so that bad rows
        can be found afterwards.
    """
    # Only strings can be cleaned; anything else is flagged with the sentinel
    # explicitly instead of relying on a bare ``except``.
    if not isinstance(text, str):
        return 'majorissue'

    letters_only = _NON_LETTER_RE.sub('', text)
    words = word_tokenize(letters_only.lower())
    stop_words = _english_stop_words()
    return ' '.join(word for word in words if word not in stop_words)

# Clean every review and keep the result next to the raw text in a new column.
df1['clean_reviews'] = df1['reviews'].apply(preprocess_text)
df1.head()


In [None]:
# 'majorissue' is the sentinel preprocess_text produces when a review could
# not be cleaned, so any rows listed here need a closer look.
df1[df1['clean_reviews'] == 'majorissue'].head()  # check if any review failed cleaning

In [None]:
# Show the first review before and after cleaning, separated by blank lines.
print('Sample change before and after cleaning')
print(df1['reviews'].iloc[0:1])
print('\n')
print(df1['clean_reviews'].iloc[0:1])

In [None]:
# Stemmer and lemmatiser instances, created once here and reused for every
# review by apply_stemmer_lemmetizer.
ps = PorterStemmer()
lm = WordNetLemmatizer()

def apply_stemmer_lemmetizer(text):
    """Stem, then lemmatise, every token of ``text``.

    The string is tokenised with ``word_tokenize``; each token is passed
    through the stemmer ``ps`` and the resulting stem through the
    lemmatiser ``lm``. The processed tokens are returned joined by single
    spaces.
    """
    return ' '.join(
        lm.lemmatize(ps.stem(token)) for token in word_tokenize(text)
    )

# Stem and lemmatise the cleaned reviews; the result goes into its own column
# so the cleaned text stays available for comparison.
df1['lemmatised_reviews'] = df1['clean_reviews'].apply(apply_stemmer_lemmetizer)
df1.head()

In [None]:
# Show the first review before and after stemming/lemmatisation.
print('Sample change before and after lemmitisation')
print(df1['clean_reviews'].iloc[0:1])
print('\n')
print(df1['lemmatised_reviews'].iloc[0:1])

In [None]:
# uncomment to improve efficiency in large datasets
# df1.drop('clean_reviews', axis=1, inplace=True)

In [None]:
def pos_tagging(text):
    """Tokenise ``text`` and return the part-of-speech tags for its tokens.

    The tokens produced by ``word_tokenize`` are handed to ``nltk.pos_tag``
    and its result is returned unchanged.
    """
    return nltk.pos_tag(word_tokenize(text))

# Tag every stemmed/lemmatised review; each cell of the new column holds the
# tagger output for that review.
df1['pos_tags'] = df1['lemmatised_reviews'].apply(pos_tagging)
df1.head()

In [None]:
# Lookup table from part-of-speech tag (Penn Treebank-style abbreviations) to
# a human-readable description. It is joined onto the tag counts so the
# summary table is readable without memorising the abbreviations.
hardcoded_pos_def = {'CC': 'conjunction, coordinating',
                     'CD': 'numeral, cardinal',
                     'DT': 'determiner',
                     'EX': 'existential there',
                     'FW': 'foreign word',
                     'IN': 'preposition or conjunction, subordinating',
                     'JJ': 'adjective or numeral, ordinal',
                     'JJR': 'adjective, comparative',
                     'JJS': 'adjective, superlative',
                     'LS': 'list item marker',
                     'MD': 'modal auxiliary',
                     'NN': 'noun, common, singular or mass',
                     'NNP': 'noun, proper, singular',
                     'NNPS': 'noun, proper, plural',
                     'NNS': 'noun, common, plural',
                     'PDT': 'pre-determiner',
                     'POS': 'genitive marker',
                     'PRP': 'pronoun, personal',
                     'PRP$': 'pronoun, possessive',
                     'RB': 'adverb',
                     'RBR': 'adverb, comparative',
                     'RBS': 'adverb, superlative',
                     'RP': 'particle',
                     'SYM': 'symbol',
                     'TO': '"to" as preposition or infinitive marker',
                     'UH': 'interjection',
                     'VB': 'verb, base form',
                     'VBD': 'verb, past tense',
                     'VBG': 'verb, present participle or gerund',
                     'VBN': 'verb, past participle',
                     'VBP': 'verb, present tense, not 3rd person singular',
                     'VBZ': 'verb, present tense, 3rd person singular',
                     'WDT': 'WH-determiner',
                     'WP': 'WH-pronoun',
                     'WP$': 'WH-pronoun, possessive',
                     'WRB': 'Wh-adverb'}

In [None]:
# Tally how many times each POS tag appears across all tagged reviews.
count_dict = defaultdict(int)
for row in df1['pos_tags']:
    for _, key in row:
        count_dict[key] += 1

counts_dict_final = dict(count_dict)

# One row per observed tag with its number of occurrences.
counts_dict_df_ = pd.DataFrame(
    {
        'pos_tags': list(counts_dict_final.keys()),
        'occurrence_counts': list(counts_dict_final.values()),
    }
)
# One row per described tag with its human-readable definition.
pos_def_df = pd.DataFrame(
    {
        'pos_tags': list(hardcoded_pos_def.keys()),
        'tag_definition': list(hardcoded_pos_def.values()),
    }
)

# A left join keeps every observed tag, even one that has no definition.
counts_dict_df = counts_dict_df_.merge(pos_def_df, how='left', on='pos_tags')

N_max = len(counts_dict_df)
print('Total pos tags in data : {}'.format(N_max))

In [None]:
# Ask how many of the most frequent tags to display.
N = int(input("Enter N for looking at top n pos tags, max N {}: ".format(N_max)))
# Reject negative values as well as values above N_max: DataFrame.head treats
# a negative n as "every row except the last |n|", which would silently show
# the wrong table instead of the top N tags.
assert 0 <= N <= N_max, 'Enter valid N, 0 <= N <= N_max'

# Put the columns in display order and rank tags from most to least frequent.
counts_dict_df = counts_dict_df[['pos_tags', 'tag_definition', 'occurrence_counts']]
counts_dict_df = counts_dict_df.sort_values(by='occurrence_counts', ascending=False)
counts_dict_df.head(N)
