In [None]:
# Importing libraries

import numpy as np
import pylab as plt
import pandas as pd

### Data Preparation

In [None]:
# Directory that holds the CSV files read below.
PATH = "../input/coursesdata/"


def _read_table(name):
    """Read ``PATH + name + '.csv'`` into a DataFrame."""
    return pd.read_csv(PATH + name + '.csv')


studentInfo = _read_table('studentInfo')
courses = _read_table('courses')
assessments = _read_table('assessments')
studentAssessment = _read_table('studentAssessment')
studentReview = _read_table('studentReview')

# Preview one table; call .head() on any of the others to inspect it.
studentInfo.head()

In [None]:
# Merging Tables
# Each step left-joins one more table onto the running frame, so rows of the
# left-hand frame are kept even when the right-hand table has no match.
_JOIN_STEPS = [
    (courses, ['course', 'run']),
    (assessments, ['course', 'run']),
    (studentAssessment, ['student_id', 'assessment_id']),
    (studentReview, ['student_id', 'course']),
]

result = studentInfo
for _right_table, _join_keys in _JOIN_STEPS:
    result = result.merge(_right_table, on=_join_keys, how='left', sort=False)

result.head()

In [None]:
# Reorder Columns
# Keys first, then student/enrolment columns, then assessment columns, and
# finally the review text and the `upgraded` column.
COLUMN_ORDER = [
    'student_id', 'course', 'run',
    'gender', 'region', 'highest_education_level', 'age_range', 'completed',
    'date_enrolled', 'date_unenrolled', 'course_length',
    'assessment_id', 'assessment_type', 'date', 'weight',
    'date_submitted', 'score',
    'student_review', 'upgraded',
]
result = result[COLUMN_ORDER]
result.head()

In [None]:
# Grouping by (student, course, run), so we can predict the upgraded value
# for each (user, course, run).


def _first_value(values):
    """Return the first row of one group's column, selected by position.

    ``values[0]`` would be a *label* lookup and only succeeds for the group
    that happens to contain row label 0; ``.iloc[0]`` works for every group.
    """
    return values.iloc[0]


# Every column keeps the value of the group's first row, except
# assessment_id, which becomes the number of (non-null) assessments.
_GROUP_AGGREGATIONS = {
    'gender': _first_value,
    'region': _first_value,
    'highest_education_level': _first_value,
    'age_range': _first_value,
    'completed': _first_value,
    'date_enrolled': _first_value,
    'date_unenrolled': _first_value,
    'course_length': _first_value,
    'assessment_id': 'count',
    'assessment_type': _first_value,
    'date': _first_value,
    'weight': _first_value,
    'date_submitted': _first_value,
    'score': _first_value,
    'student_review': _first_value,
    'upgraded': _first_value,
}

# The aggregated frame has to be assigned back, otherwise the grouping is
# computed and thrown away. as_index=False keeps student_id / course / run as
# ordinary columns, which the identifier cell below still needs.
result = (
    result
    .groupby(['student_id', 'course', 'run'], as_index=False)
    .agg(_GROUP_AGGREGATIONS)
)

result.head()

In [None]:
# Build one "<student_id>_<course>_<run>" identifier per row.
_enrolment_id = result['student_id'].map(str) + '_' + result['course'] + '_' + result['run']

# The identifier replaces student_id (keeping its column position) under a
# new name; the separate course and run columns are then removed.
result['student_id'] = _enrolment_id
result = result.rename(columns={'student_id': 'Student_course_Run_id'})
result = result.drop(['course', 'run'], axis=1)

result.head()

In [None]:
# Turning non-numeric values into numbers using LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Label-encode the target column; the fitted encoder stays available as `leup`.
leup = LabelEncoder()
result.upgraded = leup.fit_transform(result.upgraded)

# Each categorical column gets its own encoder, fitted on the string form of
# its values. Columns missing from the frame are skipped.
cat_cols = ['gender','region','highest_education_level','age_range','completed','date_enrolled','assessment_type']
for col in (name for name in cat_cols if name in result.columns):
    as_text = list(result[col].astype(str).values)
    le = LabelEncoder()
    result[col] = le.fit_transform(as_text)

result.head()

In [None]:
# Distribution of the target (most students don't upgrade)
import seaborn as sns

sns.countplot(data=result, x='upgraded');

In [None]:
# info() lists every column with its non-null count and dtype; a column whose
# non-null count is below the number of rows contains missing values.
result.info()

In [None]:
# Checking the distribution of each numeric column via its summary statistics
result.describe()

In [None]:
# Destribution of columns ['date_enrolled', 'course_length', 'date', 'weight', 'score']

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,6))
boxplot = result.boxplot(column=['date_enrolled', 'course_length', 'date', 'weight', 'score'])

In [None]:
# Filling missing values: each column below is imputed with its own mean.
for _mean_col in ('score', 'date_submitted', 'date_unenrolled', 'date'):
    result[_mean_col] = result[_mean_col].fillna(result[_mean_col].mean())

## Building Baseline Model

In [None]:
# For the baseline model, we will use just the numeric columns.
# In order to not lose the review effect, We will create a "student_review_len"
# column before removing the "student_review" column.

def add_review_features(df):
    """Add simple text statistics of ``student_review`` as numeric columns.

    The frame is modified in place and also returned. ``student_review`` is
    converted to ``str``; a missing review becomes the empty string, so it
    yields 0 for every statistic instead of being measured as the literal
    text ``'nan'``.

    Columns added:
        student_review_len: number of characters in the review.
        student_review_n_capitals: number of upper-case characters.
        student_review_n_words: number of whitespace-separated tokens.

    Args:
        df: DataFrame with a ``student_review`` column.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    df['student_review'] = df['student_review'].apply(
        lambda review: '' if pd.isna(review) else str(review)
    )
    df['student_review_len'] = df['student_review'].apply(len)
    df['student_review_n_capitals'] = df['student_review'].apply(
        lambda comment: sum(1 for c in comment if c.isupper())
    )
    # Raw string: '\S' inside a plain literal is an invalid escape sequence.
    df['student_review_n_words'] = df['student_review'].str.count(r'\S+')
    return df

result = add_review_features(result)

# The modelling frame leaves out the assessment identifier and the raw
# review text.
data = result.drop(columns=['assessment_id', 'student_review'])
data.head()

In [None]:
# Splitting data into 80% training and 20% test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = data.drop(['Student_course_Run_id', 'upgraded'],axis=1)
y = data.upgraded

# Standardize features by removing the mean and dividing by the standard
# deviation.
# NOTE(review): the scaler is fitted on all rows before any split, so held-out
# rows influence the scaling statistics. Left as is because X_scaled is reused
# further down as the numeric model input.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Keep the feature names and the row index of `data`, so X stays label-aligned
# with y and its columns remain identifiable.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# 5-fold stratified cross-validation of the baseline model.
# A stray `StratifiedKFold(n_splits=5, random_state=10, shuffle=False)`
# statement (pasted output of print(skf)) used to sit here; it built an unused
# object and raises ValueError on recent scikit-learn, so it was removed.
skf = StratifiedKFold(n_splits=5)
print(skf)

# Accuracies and F-Scores across k folds
accs, fsc = [], []

for train_index, test_index in skf.split(X, y):
    # split() yields row positions, hence .iloc
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train a random forest on this fold's training part
    clf = RandomForestClassifier(n_estimators=10, random_state=10)
    clf = clf.fit(X_train, y_train)
    # Predict the response for this fold's held-out part
    y_pred = clf.predict(X_test)

    # Score the fold once and reuse the values for printing and averaging
    fold_acc = metrics.accuracy_score(y_test, y_pred)
    fold_f1 = metrics.f1_score(y_test, y_pred)
    print("Fold Accuracy:", fold_acc)
    print("Fold F1-Score:", fold_f1, end='\n\n')
    accs.append(fold_acc)
    fsc.append(fold_f1)

print("Overall Accuracy: {:0.2f} +/- {:0.2f}".format(np.mean(accs), np.std(accs)))
print("Overall F1-Score: {:0.2f} +/- {:0.2f}".format(np.mean(fsc), np.std(fsc)))

## Word Clouds

In [None]:
# First we will plot word clouds for the two classes (upgrade) and (not upgrade).
# We can see that words like "Great" are indicators for the decision of the student.

from wordcloud import WordCloud
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

n_posts = 1000
data = result

# Lower-cased reviews, split by class; at most n_posts reviews per class.
# Despite the names, rev_Up holds the reviews with upgraded == 0 and rev_Nup
# those with upgraded == 1; the panel titles below match the data.
_lowered_reviews = data['student_review'].str.lower()
rev_Up = ' '.join(_lowered_reviews[data['upgraded'] == 0].values[:n_posts])
rev_Nup = ' '.join(_lowered_reviews[data['upgraded'] == 1].values[:n_posts])

# Both clouds share the same rendering options.
_cloud_options = dict(max_words=20, scale=2, stopwords=stop, contour_width=3, contour_color='steelblue')
wordcloud_S = WordCloud(**_cloud_options).generate(rev_Up)
wordcloud_I = WordCloud(**_cloud_options).generate(rev_Nup)

fig, ax = plt.subplots(1,2, figsize=(22, 6))
_panels = (
    (wordcloud_S, 'Top words studnet review (Not upgrade)'),
    (wordcloud_I, 'Top words studnet review (upgrade)'),
)
for _axis, (_cloud, _title) in zip(ax, _panels):
    _axis.imshow(_cloud)
    _axis.set_title(_title, fontsize = 20)
    _axis.axis("off")

plt.show()

## Creating a model using student reviews (BERT base, uncased)

In [None]:
# Importing libraries
import os, re, pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# pytorch bert imports
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

# keras imports
from keras.utils import np_utils
from keras.preprocessing import text, sequence
from keras.layers import CuDNNLSTM, LSTM, Activation, Dense, Dropout, Input, Embedding, concatenate, Bidirectional
from keras.layers import SpatialDropout1D, Dropout, add, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.optimizers import Adam, SGD
from keras.models import Sequential, Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.losses import binary_crossentropy
from keras import backend as K

In [None]:
# Directory passed to BertModel.from_pretrained() to load the pretrained weights.
BERT_PRETRAINED_DIR = '../input/pretrained-bert-models-for-pytorch/bert-base-uncased/'
# Despite the _DIR suffix this is the vocabulary *file* handed to BertTokenizer.
BERT_VOCAB_DIR = '../input/pretrained-bert-models-for-pytorch/bert-base-uncased-vocab.txt'
# Number of token ids kept per review when the X_text matrix is built.
# NOTE(review): the mean of ~40 presumably refers to student_review_len, which
# counts characters rather than tokens — confirm the limit is the intended one.
MAX_LENGTH = 50 # Because review_len_mean is near 40

In [None]:
# Punctuation and symbols stripped from reviews. Tab, newline and carriage
# return are listed too, but are mapped to a space below so that the words on
# either side do not fuse together.
_REVIEW_FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'
_REVIEW_CHAR_TABLE = {ord(char): None for char in _REVIEW_FILTER_CHARS}
_REVIEW_CHAR_TABLE.update({ord(char): ' ' for char in '\t\n\r'})


def nlp_preprocessing(text):
    """Normalise a review: lower-case it and strip punctuation and symbols.

    The previous version passed the whole filter string and the regex pattern
    to ``str.replace``, which looks for them as literal substrings, so nothing
    was ever removed. This version deletes each filter character individually
    and applies the pattern as a real regular expression.

    Args:
        text: The review as a ``str``.

    Returns:
        The lower-cased text containing only ``a-z``, ``0-9`` and spaces.
        Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = text.translate(_REVIEW_CHAR_TABLE)
    # Drop whatever is left that is not an ASCII letter, digit or space.
    text = re.sub('[^a-zA-Z0-9 ]', '', text)
    return text

# Overwrite the review column with its cleaned text (input to the tokenizer below).
result["student_review"] = result["student_review"].apply(nlp_preprocessing)

In [None]:
# Initialising the BERT tokenizer from the vocabulary file at BERT_VOCAB_DIR
tokenizer = BertTokenizer(vocab_file=BERT_VOCAB_DIR)
def tokenization(row):
    """Convert one review string into a list of vocabulary ids.

    The text is split into tokens by the module-level ``tokenizer`` and every
    token is then mapped to its id.
    """
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(row))

# Replace every review string by its list of token ids.
result["student_review"] = result["student_review"].apply(tokenization)

# Checking some reviews after tokenization (first 5 of 20 randomly sampled rows)
result["student_review"].sample(20).head()

In [None]:
def string_ids(doc):
    """Join the items of ``doc`` into one space-separated string.

    Each item is converted with ``str``; an empty ``doc`` gives ``''``.
    """
    return ' '.join(map(str, doc))

# Store each id list as a single space-separated string.
result["student_review"] = result["student_review"].apply(string_ids)

In [None]:
# Fixed-width matrix of token ids: one row per review, truncated to MAX_LENGTH
# ids and right-padded with 0.
# `int` replaces the `np.int` alias (same dtype), which newer NumPy releases
# no longer provide.
X_text = np.zeros((result.shape[0], MAX_LENGTH), dtype=int)
X_num  = X_scaled #Numerical features
for i, ids in enumerate(result['student_review']):
    # `ids` is the space-separated id string of one review
    input_ids = [int(token_id) for token_id in ids.split()[:MAX_LENGTH]]
    inp_len = len(input_ids)
    X_text[i, :inp_len] = input_ids

In [None]:
def get_bert_embed_matrix():
    """Return an embedding weight matrix of the pretrained BERT model as a NumPy array.

    The model is loaded from ``BERT_PRETRAINED_DIR``. The matrix is the
    ``weight`` of the first child module of the model's first child module.
    NOTE(review): presumably these are the embeddings block and its
    word-embedding layer — confirm against the BertModel definition.
    """
    model = BertModel.from_pretrained(BERT_PRETRAINED_DIR)
    first_child = next(iter(model.children()))
    first_grandchild = next(iter(first_child.children()))
    return first_grandchild.weight.data.numpy()

# Weight matrix returned by get_bert_embed_matrix(); build_model() uses it to
# initialise its frozen Embedding layer.
embedding_matrix = get_bert_embed_matrix()

In [None]:
# Units of each LSTM layer in build_model().
LSTM_UNITS = 128
# Width of the dense layers in build_model().
# NOTE(review): 4 * LSTM_UNITS presumably equals the width of the concatenated
# max- and average-pooled bidirectional LSTM output — confirm.
HIDDEN_UNITS = 4 * LSTM_UNITS
# Number of columns in the scaled numeric feature matrix.
N_NUMERICAL  = X_scaled.shape[-1]

def build_model(embedding_matrix):
    """Build and compile the two-input Keras model (review tokens + numeric features).

    Token ids pass through a frozen Embedding layer initialised from
    ``embedding_matrix``, spatial dropout and two bidirectional LSTM layers.
    The sequence output is max- and average-pooled, refined by two
    dense layers with additive skip connections, concatenated with the
    numeric features and mapped to one sigmoid output.

    Args:
        embedding_matrix: 2-D array whose shape gives the Embedding layer's
            input and output dimensions.

    Returns:
        The compiled ``Model`` (binary cross-entropy loss, accuracy metric,
        Adam optimizer with lr=0.001).
    """
    vocab_size, embed_dim = embedding_matrix.shape

    words = Input(shape=(MAX_LENGTH,))
    numerics = Input(shape=(N_NUMERICAL,))

    # Text branch
    embedded = Embedding(vocab_size, embed_dim, weights=[embedding_matrix], trainable=False)(words)
    sequence_out = SpatialDropout1D(0.5)(embedded)
    for _ in range(2):
        sequence_out = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(sequence_out)

    max_pooled = GlobalMaxPooling1D()(sequence_out)
    avg_pooled = GlobalAveragePooling1D()(sequence_out)
    hidden = concatenate([max_pooled, avg_pooled])

    # Two dense layers, each added back onto its own input
    for _ in range(2):
        dense_out = Dense(HIDDEN_UNITS, activation='relu')(hidden)
        hidden = add([hidden, dense_out])

    # Join the numeric features just before the output layer
    hidden = concatenate([hidden, numerics])
    out = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[words, numerics], outputs=out)
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=Adam(lr = 0.001))
    return model

# Checking Model Architecture: build a throwaway model just to print its layer summary
build_model(embedding_matrix).summary()

In [None]:
# Split the row positions of X_text 80/20 into training and validation index lists (fixed seed).
tr_idx, val_idx = train_test_split(list(range(len(X_text))) ,test_size=0.2, random_state = 100)

In [None]:
EPOCHS = 5

# Model Training and prediction phase
model = build_model(embedding_matrix)

# tr_idx / val_idx are row *positions* into X_text and X_num, so the target is
# indexed through its underlying array. y[tr_idx] on the Series would be a
# label lookup, which only matches when the index is exactly 0..n-1.
_y_array = np.asarray(y)

model.fit(
    [X_text[tr_idx], X_num[tr_idx]], _y_array[tr_idx],
    validation_data = ([X_text[val_idx], X_num[val_idx]], _y_array[val_idx]),
    batch_size = 500,
    epochs = EPOCHS,
    verbose = 1,
    # Learning rate starts at 1e-3 and is multiplied by 0.6 every epoch
    callbacks=[LearningRateScheduler(lambda epoch: 1e-3 * (0.6 ** epoch))]
)
# Predicted probabilities for the validation rows, as a flat array
model_val_preds = model.predict([X_text[val_idx], X_num[val_idx]], batch_size=1000).flatten()

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Threshold the predicted probabilities at 0.5 to get class labels.
# `int` replaces the `np.int` alias (same dtype), which newer NumPy releases
# no longer provide.
y_val = (np.array(model_val_preds) > 0.5).astype(int)
# True labels of the validation rows, selected by position (val_idx holds row
# positions, not index labels).
y_val_true = np.asarray(y)[val_idx]

print("Accuracy: {:0.2f}".format(accuracy_score(y_val_true, y_val)))
print("F1-Score: {:0.2f}".format(f1_score(y_val_true, y_val)))

## Conclusion:
We can see that by adding student review embeddings, we were able to get some improvement in F1-score.
In this case we have an unbalanced dataset, so even though we got high accuracy for both the first model (without reviews)
and the second model (with review embeddings), the F1-score is the more important metric to consider.