# **Project Imports**





In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from tqdm import tqdm
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Fetch the NLTK resources up front: the 'punkt' tokenizer models (presumably
# what word_tokenize relies on below) and the stopwords corpus.
# NOTE(review): the imported nltk `stopwords` name is later rebound to a
# hand-written set, so the downloaded stopwords corpus looks unused — confirm.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Coursera Reviews**

Dataset found on kaggle: https://www.kaggle.com/septa97/100k-courseras-course-reviews-dataset

Reviews from students on courses offered by Coursera. The CSV file contains the text of each review along with the corresponding 1-to-5 rating of the course. There are a little over 100,000 reviews in the file.


In [None]:
# Show the full review text in DataFrame output. None means "do not truncate";
# the legacy -1 sentinel was deprecated in pandas 1.0 and is rejected by
# pandas 2.x.
pd.set_option('display.max_colwidth', None)

# The /content/drive/MyDrive/reviews.csv is if you are using google colab and
# mounted your google drive. I will provide the csv file along with the code
# so you may not need this.
# review_data = pd.read_csv('/content/drive/MyDrive/reviews.csv')

# Load the reviews; the preview below shows the columns Id, Review and Label.
review_data = pd.read_csv('reviews.csv')
review_data.shape, display(review_data.head(10))

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I'm still learning this class which makes up a lot of basic music knowledge.",5
2,2,like!Prof and TAs are helpful and the discussion among students are quite active. Very rewarding learning experience!,5
3,3,Easy to follow and includes a lot basic and important techniques to use sketchup.,5
4,4,Really nice teacher!I could got the point eazliy but the v,4
5,5,"Great course - I recommend it for all, especially IT and Business Managers!",5
6,6,One of the most useful course on IT Management!,5
7,7,"I was disappointed because the name is misleading. The course provides a good introduction & overview of the responsibilities of the CTO, but has very little specifically digital content. It deals with two-speed IT in a single short lecture, so of course the treatment is superficial. It is easy to find more in-depth material freely available, on the McKinsey website for example.",3
8,8,Super content. I'll definitely re-do the course,5
9,9,One of the excellent courses at Coursera for information technology bosses and managers.,5


((107018, 3), None)

In [None]:
# Count missing values per column to confirm there are no empty cells.
review_data.isna().sum()

Id        0
Review    0
Label     0
dtype: int64

In [None]:
# Drop rows that are exact duplicates across every column, keeping the first
# occurrence, then show the resulting shape.
# NOTE(review): the comparison includes the 'Id' column. If Id is unique per
# row (the previews suggest it mirrors the row index), repeated review texts
# can never be detected here — consider subset=['Review', 'Label']. Confirm
# before changing, since it would shift every downstream number.
review_data = review_data.drop_duplicates(keep="first")
review_data.shape

(107018, 3)

In [None]:
# Number of reviews per star rating (shows how skewed the classes are).
review_data['Label'].value_counts()

5    79173
4    18054
3    5071 
1    2469 
2    2251 
Name: Label, dtype: int64

# **Text Preprocessing**

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The rules are applied in order and are case-sensitive. The two irregular
    forms ("won't", "can't") come first so that the generic "n't" rule does
    not split them into "wo not" / "ca not".
    """
    expansion_rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    for contraction, expansion in expansion_rules:
        phrase = re.sub(contraction, expansion, phrase)

    return phrase

def clean_text(df):
    """Return a list with the normalised text of every row in ``df['Review']``.

    Each review is processed in this order:
      1. contractions are expanded (``decontracted``),
      2. HTML markup is stripped with BeautifulSoup,
      3. URLs and e-mail addresses are removed,
      4. every remaining non-letter character becomes a space,
      5. the text is lower-cased.

    URL and e-mail removal has to happen before step 4: once ':', '/', '.'
    and '@' have been replaced by spaces those patterns can no longer match
    and the fragments of the address would leak into the vocabulary.

    Parameters
    ----------
    df : DataFrame-like object with a 'Review' column of strings.

    Returns
    -------
    list of str, one cleaned review per input row, in the same order.
    """
    # Compile once instead of re-parsing the patterns for every review.
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # Not anchored with ^...$ so addresses embedded in a sentence are removed
    # too, not only reviews that consist of nothing but an address.
    email_pattern = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")
    non_letter_pattern = re.compile("[^a-zA-Z]")

    cleaned_review = []

    for review_text in tqdm(df['Review']):
        review_text = decontracted(review_text)
        review_text = BeautifulSoup(review_text, 'lxml').get_text().strip()
        review_text = url_pattern.sub(' ', review_text)
        review_text = email_pattern.sub(' ', review_text)
        # Also removes all punctuation and digits, so no separate
        # punctuation-stripping pass is needed.
        review_text = non_letter_pattern.sub(' ', review_text)
        cleaned_review.append(review_text.lower())

    return cleaned_review

# Store the normalised text in a new column next to the raw 'Review' text.
review_data['cleaned_review'] = clean_text(review_data)

100%|██████████| 107018/107018 [00:27<00:00, 3831.39it/s]


In [None]:
# Preview the first five rows: raw review next to its cleaned counterpart.
review_data.head(n=5)

Unnamed: 0,Id,Review,Label,cleaned_review
0,0,good and interesting,5,good and interesting
1,1,"This class is very helpful to me. Currently, I'm still learning this class which makes up a lot of basic music knowledge.",5,this class is very helpful to me currently i am still learning this class which makes up a lot of basic music knowledge
2,2,like!Prof and TAs are helpful and the discussion among students are quite active. Very rewarding learning experience!,5,like prof and tas are helpful and the discussion among students are quite active very rewarding learning experience
3,3,Easy to follow and includes a lot basic and important techniques to use sketchup.,5,easy to follow and includes a lot basic and important techniques to use sketchup
4,4,Really nice teacher!I could got the point eazliy but the v,4,really nice teacher i could got the point eazliy but the v


In [None]:
# Split every cleaned review into word tokens with NLTK.
# Takes around 20 seconds
tokenized_reviews = list(map(word_tokenize, review_data['cleaned_review']))

In [None]:
# Flatten the per-review token lists into one lower-cased stream of tokens.
tokenized_words = [
    token.lower()
    for review_tokens in tokenized_reviews
    for token in review_tokens
]

In [None]:
# Vocabulary = the distinct word types over all reviews, in order of first
# appearance.
# The elements of tokenized_reviews are whole reviews (lists of tokens), so the
# de-duplication must walk the individual tokens inside each review; comparing
# the lists themselves would count distinct reviews instead of distinct words.
# dict.fromkeys de-duplicates in a single pass while preserving order, which
# avoids the quadratic cost of repeated `x not in list` checks.
vocab = list(dict.fromkeys(
    token.lower()
    for review_tokens in tokenized_reviews
    for token in review_tokens
))

# **Basic Analysis**

In [None]:
# Corpus-level summary: token count, vocabulary size and three derived ratios,
# shown as a one-row table.
text = review_data['cleaned_review']
n_tokens = len(tokenized_words)
n_reviews = len(tokenized_reviews)
n_types = len(vocab)

summary_row = {
    'Tokens': n_tokens,
    'Vocab': n_types,
    # Ratios are rendered as strings with three decimals for display.
    'Lexical Diversity': f"{n_types / n_tokens:.3f}",
    'Average Review Length': f"{n_tokens / n_reviews:.3f}",
    'Average Word Frequency': f"{n_tokens / n_types:.3f}",
}
review_data_info = [summary_row]

summary_columns = ['Tokens', 'Vocab', 'Lexical Diversity', 'Average Review Length', 'Average Word Frequency']
display(pd.DataFrame(review_data_info)[summary_columns])

Unnamed: 0,Tokens,Vocab,Lexical Diversity,Average Review Length,Average Word Frequency
0,2698633,96968,0.036,25.217,27.83


# **Converting Ratings to Binary Labels**

In [None]:
# Collapse the 1-5 star rating into a binary target:
# 4 or 5 stars -> 1, 1 to 3 stars -> 0.
rating_to_binary = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1}
review_data['tf'] = review_data['Label'].map(rating_to_binary)
review_data[['cleaned_review', 'Label', 'tf']].head(10)

Unnamed: 0,cleaned_review,Label,tf
0,good and interesting,5,1
1,this class is very helpful to me currently i am still learning this class which makes up a lot of basic music knowledge,5,1
2,like prof and tas are helpful and the discussion among students are quite active very rewarding learning experience,5,1
3,easy to follow and includes a lot basic and important techniques to use sketchup,5,1
4,really nice teacher i could got the point eazliy but the v,4,1
5,great course i recommend it for all especially it and business managers,5,1
6,one of the most useful course on it management,5,1
7,i was disappointed because the name is misleading the course provides a good introduction overview of the responsibilities of the cto but has very little specifically digital content it deals with two speed it in a single short lecture so of course the treatment is superficial it is easy to find more in depth material freely available on the mckinsey website for example,3,0
8,super content i will definitely re do the course,5,1
9,one of the excellent courses at coursera for information technology bosses and managers,5,1


# **Count Vectorizer**

In [None]:
# Hand-picked stop-word set used by the vectorizers below. Negations such as
# 'not' and 'nor' are absent from the set, so bigrams like "not recommend" can
# survive as features.
# NOTE: this rebinds the name `stopwords`, hiding the nltk.corpus.stopwords
# module imported at the top of the notebook.
stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these',
    'those', 'am', 'is', 'are', 'was', 'were', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
    'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'such', 'only', 'own', 'no', 'same',
    'so', 'than', 'too', 'very', 's', 'can', 'will', 'just', 'don', 'should', 'now', 'be', 'some', 'much', 'would', 'could',
}

In [None]:
# max_df -> ignore terms that occur in more than 70% of the reviews (documents)
# min_df -> ignore terms that occur in fewer than 0.1% of the reviews
# ngram_range -> use both unigrams and bigrams
# stop_words is passed as a list: recent scikit-learn releases validate the
# parameter type and reject a set.
vectorizer = CountVectorizer(lowercase=True, stop_words=sorted(stopwords), max_df=0.7, min_df=0.001, ngram_range=(1,2))
X = vectorizer.fit_transform(review_data.cleaned_review)
y = review_data.tf.values

# X is a sparse matrix. Only densify a few rows for inspection — converting the
# whole matrix (100k+ rows x thousands of columns) would allocate gigabytes
# just to print a truncated preview.
print(X[:5].toarray())

print(X.shape)
print(y.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(107018, 2139)
(107018,)


# **Train Test Split**

In [None]:
# Renumber the rows 0..n-1 (discarding the old index) so positional indices
# from the split can be used against this frame; show the last five rows.
review_data_redix = review_data.reset_index(drop=True)
review_data_redix.tail(n=5)

Unnamed: 0,Id,Review,Label,cleaned_review,tf
107013,107013,Trendy topic with talks from expertises in the field. Covered all areas of interest. Congrats!,4,trendy topic with talks from expertises in the field covered all areas of interest congrats,1
107014,107014,"Wonderful! Simple and clear language, good instructors, great stuff! Highly recommend!",5,wonderful simple and clear language good instructors great stuff highly recommend,1
107015,107015,an interesting and fun course. thanks. dr quincy,5,an interesting and fun course thanks dr quincy,1
107016,107016,"very broad perspective, up to date information, useful links and videos and good lecturers in general. Thank you for the insights and knowledge.",4,very broad perspective up to date information useful links and videos and good lecturers in general thank you for the insights and knowledge,1
107017,107017,"An informative course on the social and financial implications due to Zika as well as the factors leading to an epidemic of Zika virus,,",4,an informative course on the social and financial implications due to zika as well as the factors leading to an epidemic of zika virus,1


In [None]:
# Split row positions (rather than the data itself) 80/20 with a fixed seed, so
# the same indices can select rows from X, y and the DataFrame.
all_row_positions = np.arange(len(review_data_redix))
train_idx, test_idx = train_test_split(
    all_row_positions, test_size=0.2, shuffle=True, random_state=42
)

print(f"Number of training examples:{len(train_idx)}")
print(f"Number of testing examples:{len(test_idx)}")

Number of training examples:85614
Number of testing examples:21404


In [None]:
# Select the train / test rows of the feature matrix and the label vector.
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

print(f"Training data: X_train : {X_train.shape}, y_train : {y_train.shape}")
print(f"Testing data: X_test : {X_test.shape}, y_test : {y_test.shape}")

Training data: X_train : (85614, 2139), y_train : (85614,)
Testing data: X_test : (21404, 2139), y_test : (21404,)


In [None]:
# The positive and negative classes are imbalanced, so randomly undersample the
# majority (positive) class of the training data until both classes have the
# same number of samples. Only the training split is resampled.
# random_state is fixed so the resampled training set — and every score that
# depends on it — is reproducible between runs.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# **Logistic Regression**

In [None]:
# Baseline: logistic regression fitted on the original (imbalanced) training
# split. Keep both hard predictions and class probabilities for the test set.
# Without undersampling
lr_clf = LogisticRegression(max_iter=7600).fit(X_train, y_train)
y_pred_test = lr_clf.predict(X_test)
y_predprob_test = lr_clf.predict_proba(X_test)

In [None]:
# Per-class precision / recall / F1 of the baseline model on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.75      0.48      0.58      1910
           1       0.95      0.98      0.97     19494

    accuracy                           0.94     21404
   macro avg       0.85      0.73      0.78     21404
weighted avg       0.93      0.94      0.93     21404



In [None]:
# Same model, but fitted on the undersampled (balanced) training data and
# evaluated on the untouched test split.
# With undersampling
lr_clf_under = LogisticRegression(max_iter=7600).fit(X_train_under, y_train_under)
y_pred_test_under = lr_clf_under.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the undersampled model on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred_test_under))

              precision    recall  f1-score   support

           0       0.38      0.85      0.53      1910
           1       0.98      0.86      0.92     19494

    accuracy                           0.86     21404
   macro avg       0.68      0.86      0.72     21404
weighted avg       0.93      0.86      0.89     21404



# **Cross Validation Score**

In [None]:
# 5-fold cross-validated precision of the baseline classifier over the full
# feature matrix (train and test rows together).
scores = cross_val_score(estimator=lr_clf, X=X, y=y, cv=5, scoring='precision')
print(scores)

[0.95014295 0.94757823 0.94633907 0.94238643 0.95254815]


# **Parameter Tuning**

In [None]:
# Jointly tune the vectorizer's document-frequency cut-offs and the
# classifier's regularisation with a 5-fold grid search scored by F1. The
# vectorizer sits inside the pipeline, so it is refitted on each training fold.
# Takes around 7 minutes and 26 seconds
import sklearn

# "No regularisation" is spelled None from scikit-learn 1.2 onwards; older
# releases only accept the string 'none' (which newer releases have removed).
_sk_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
no_penalty = None if _sk_major_minor >= (1, 2) else 'none'

# stop_words is passed as a list: recent scikit-learn releases reject a set.
vectorizer_grid = CountVectorizer(lowercase=True, stop_words=sorted(stopwords))
logistic_grid = LogisticRegression(max_iter=7600)

pipe = Pipeline(steps = [
       ('vectorizer', vectorizer_grid),
       ('classifier', logistic_grid)])

param_grid = {
    'vectorizer__min_df': [0.01, 0.02, 0.05, 0.1, 0.3, 0.5],
    'vectorizer__max_df': [0.7, 0.8, 0.9],
    'classifier__penalty': ['l2', no_penalty]}

search_result = GridSearchCV(pipe, param_grid, cv=5, scoring='f1').fit(review_data.cleaned_review.values, review_data.tf.values)

print("Best parameter (CV score=%0.3f):" % search_result.best_score_)
print(search_result.best_params_)

Best parameter (CV score=0.958):
{'classifier__penalty': 'l2', 'vectorizer__max_df': 0.7, 'vectorizer__min_df': 0.01}


# **Model Performance**

In [None]:
# Build a frame of the test rows with the baseline model's prediction attached.
# .copy() makes review_data_test an independent DataFrame; adding a column to
# the bare .iloc selection triggers pandas' SettingWithCopyWarning.
review_data_test = review_data_redix.iloc[test_idx].copy()
# y_pred_test was produced from X[test_idx], so it is in the same row order.
review_data_test['pred_tf'] = y_pred_test
review_data_test.head()[['cleaned_review', 'Label', 'tf', 'pred_tf']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,cleaned_review,Label,tf,pred_tf
41096,good enough for beginners and the puppet is funny,4,1,1
32365,realy good,4,1,1
38781,a very comprehensive and thrilling narration one of my best mooc on the around twenty i did,5,1,1
91981,i like the step by step learning although it might be found slow for more experienced students but it is very clear and well structured for beginners in programming and python,5,1,1
9679,great explanations by the course guide,5,1,1


In [None]:
# Last five test reviews where the prediction matches the true binary label.
is_correct = review_data_test['tf'] == review_data_test['pred_tf']
review_data_test.loc[is_correct, ['cleaned_review', 'Label', 'tf', 'pred_tf']].tail()

Unnamed: 0,cleaned_review,Label,tf,pred_tf
87718,this covers techniques to harvest data from online sources a common first steps in many projects,5,1,1
69096,the instructors are excellent and the material is good the only drawback is the need to use graphlab this would have been a really great course if we had to use open source software,4,1,1
64493,a great beginers course in the networking field youll learn what are the basics of network,4,1,1
52144,i liked how every lesson built on itself and in the end you had a full completed lesson plan,5,1,1
79343,very poor quality the teacher is talking and talking without saying anything concrete or giving some tools or practical suggestions,1,0,0


In [None]:
# Last five test reviews where the prediction differs from the true binary label.
is_wrong = review_data_test['tf'] != review_data_test['pred_tf']
review_data_test.loc[is_wrong, ['cleaned_review', 'Label', 'tf', 'pred_tf']].tail()

Unnamed: 0,cleaned_review,Label,tf,pred_tf
61573,the lesson is informative but the course professor does not share value on each slide she is reading each point and does not provide additional insight,3,0,1
38228,material is great while separating assignment to several parts in confusing for me personally i do not see why we need first parts when the resulting project contains all from the previous parts this separation is justified in my mind for big projects and these assignments were not that big,4,1,0
36756,assignments could have been a little tougher with more emphasis on coding,3,0,1
24410,it is not a recommended course when they explain something they assume that we are expert and we know all what they are talking about they need to have statistics experts which is not mentioned in course prerequisite i recommend to review the material and to review the purpose of this course,1,0,1
95996,this course was more diverse than i thought it would be and i really enjoyed it however i thought the end of year project could have provided more differentiated prep material as that confused me,3,0,1


In [None]:
# Map every vocabulary term to its logistic-regression coefficient (rounded to
# three decimals) and list the ten terms with the largest coefficients.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0); use whichever exists.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# coef_[0] holds one coefficient per feature column, in vocabulary order.
feature_to_coef = {word: float("%.3f" % coef) for word, coef in zip(feature_names, lr_clf.coef_[0])}

print("Top positive features:")
sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:10]

Top positive features:


[('awesome', 2.57),
 ('completed course', 2.538),
 ('excellent', 2.429),
 ('not difficult', 2.314),
 ('enjoying', 2.299),
 ('stimulating', 2.185),
 ('amazing', 2.094),
 ('exceptional', 2.068),
 ('fantastic', 2.057),
 ('rewarding', 2.029)]

In [None]:
# Ten terms with the smallest (most negative) coefficients, ascending.
print("Top negative features:")
by_coefficient = sorted(feature_to_coef.items(), key=lambda item: item[1])
by_coefficient[:10]

Top negative features:


[('worst', -3.224),
 ('useless', -2.874),
 ('poorly', -2.794),
 ('outdated', -2.392),
 ('poor', -2.353),
 ('disappointing', -2.227),
 ('disappointed', -2.207),
 ('lacks', -2.132),
 ('not recommend', -1.996),
 ('not well', -1.96)]

# **Another Attempt for the Undersampler**

In [None]:
# Trying another min_df: keep only terms that occur in at least 10% of the
# reviews (the earlier run used 0.1%).
# stop_words is passed as a list: recent scikit-learn releases reject a set.
vectorizer_new = CountVectorizer(lowercase=True, stop_words=sorted(stopwords), max_df=0.7, min_df=0.1, ngram_range=(1,2))
X_new = vectorizer_new.fit_transform(review_data.cleaned_review)
y_new = review_data.tf.values

In [None]:
# Same 80/20 split of row positions, with the same seed as the first attempt.
all_row_positions_new = np.arange(len(review_data_redix))
train_idx_new, test_idx_new = train_test_split(
    all_row_positions_new, test_size=0.2, shuffle=True, random_state=42
)

print(f"Number of training examples:{len(train_idx_new)}")
print(f"Number of testing examples:{len(test_idx_new)}")

Number of training examples:85614
Number of testing examples:21404


In [None]:
# Select the train / test rows of the new feature matrix and label vector.
X_train_new, y_train_new = X_new[train_idx_new], y_new[train_idx_new]
X_test_new, y_test_new = X_new[test_idx_new], y_new[test_idx_new]

print(f"Training data: X_train : {X_train_new.shape}, y_train : {y_train_new.shape}")
print(f"Testing data: X_test : {X_test_new.shape}, y_test : {y_test_new.shape}")

Training data: X_train : (85614, 5), y_train : (85614,)
Testing data: X_test : (21404, 5), y_test : (21404,)


In [None]:
# Balance the new training split by randomly undersampling the majority class.
# Resample with the sampler created here (undersample_new) rather than the one
# from the first experiment, and fix the seed so the run is reproducible.
undersample_new = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train_under_new, y_train_under_new = undersample_new.fit_resample(X_train_new, y_train_new)

In [None]:
# Logistic regression on the min_df=0.1 features, original class balance.
# Without undersampling
lr_clf_new = LogisticRegression(max_iter=7600).fit(X_train_new, y_train_new)
y_pred_test_new = lr_clf_new.predict(X_test_new)

In [None]:
# Test-split report for the min_df=0.1 model without undersampling.
print(classification_report(y_true=y_test_new, y_pred=y_pred_test_new))

              precision    recall  f1-score   support

           0       0.58      0.14      0.22      1910
           1       0.92      0.99      0.95     19494

    accuracy                           0.91     21404
   macro avg       0.75      0.56      0.59     21404
weighted avg       0.89      0.91      0.89     21404



In [None]:
# Logistic regression on the min_df=0.1 features, undersampled training data.
# With undersampling
lr_clf_under_new = LogisticRegression(max_iter=7600).fit(X_train_under_new, y_train_under_new)
y_pred_test_under_new = lr_clf_under_new.predict(X_test_new)

In [None]:
# Test-split report for the min_df=0.1 model with undersampling.
print(classification_report(y_true=y_test_new, y_pred=y_pred_test_under_new))

              precision    recall  f1-score   support

           0       0.37      0.47      0.42      1910
           1       0.95      0.92      0.93     19494

    accuracy                           0.88     21404
   macro avg       0.66      0.70      0.68     21404
weighted avg       0.90      0.88      0.89     21404



# **Conclusions**

Some of the key highlights:

1. Messing with the parameters in CountVectorizer can improve the performance greatly on its own. 

2. The random undersampler works by targeting the majority class, in this case the positive reviews, and randomly selecting some for the training data to match the number of negative samples.

3. The performance of the random undersampler improves the recall score for the negative class greatly and the precision score for the positive class slightly. However, the other metrics are slightly worse. 

4. Initially min_df is 0.001, which I take to mean that terms appearing in fewer than 0.1% of the documents are ignored. After changing min_df to 0.1 and rerunning everything, the performance with and without undersampling differs: while precision goes down slightly for the negative class, both its recall and f1-score increase greatly. For the positive class, all of its scores are still above 0.90.

5. In my case, just changing the parameters of CountVectorizer seems to yield better results than undersampling. In the first classification report, with min_df set to 0.001, the scores were better than in all the other instances.

6. The top-n features make sense for both classes. Setting ngram_range to (1, 2), which allows both unigrams and bigrams, also helps. For example, the words "recommend" and "well" in reference to a class usually have a positive connotation. However, when either is preceded by "not", the meaning is reversed, which is why "not recommend" and "not well" appear in the negative class. Otherwise, it is clear why the top-n words are associated with their respective classes.

7. The vocabulary is not very complex, as indicated by the low lexical diversity score.


