In [None]:
# Importing libraries

import numpy as np
import pylab as plt
import pandas as pd

### Data Preparation

In [None]:
# Directory that holds the raw CSV tables.
PATH = "../input/coursesdata/"

# Read every table in one pass; each CSV file shares its name with the
# variable it is loaded into.
studentInfo, courses, assessments, studentAssessment, studentReview = (
    pd.read_csv(f"{PATH}{table_name}.csv")
    for table_name in (
        'studentInfo', 'courses', 'assessments', 'studentAssessment', 'studentReview',
    )
)

# Preview the first table; call .head() on any of the other frames to inspect them.
studentInfo.head()

In [None]:
# Merging Tables
# Left-join each lookup table onto the student rows, one after the other, on
# the key columns both sides share.
merge_steps = [
    (courses, ['course', 'run']),
    (assessments, ['course', 'run']),
    (studentAssessment, ['student_id', 'assessment_id']),
    (studentReview, ['student_id', 'course']),
]

result = studentInfo
for right_table, join_keys in merge_steps:
    result = result.merge(right_table, how='left', on=join_keys, sort=False)

result.head()

In [None]:
# Reorder Columns
# Identifiers first, then demographics, enrolment, assessment details and
# finally the review text and the target. Columns not listed here are dropped.
column_order = [
    'student_id', 'course', 'run',
    'gender', 'region', 'highest_education_level', 'age_range', 'completed',
    'date_enrolled', 'date_unenrolled', 'course_length',
    'assessment_id', 'assessment_type', 'date', 'weight',
    'date_submitted', 'score',
    'student_review', 'upgraded',
]
result = result[column_order]
result.head()

In [None]:
# Grouping by (student, course and run), so we can predict the upgraded value
# for each (user, course, run).

def _first_row_value(values):
    """Return the value on a group's first row, by position (NaN is kept)."""
    # .iloc is positional; a plain values[0] would be a label lookup and only
    # finds a row in the group that happens to contain index label 0.
    return values.iloc[0]


# Every column keeps its first-row value except 'assessment_id', which becomes
# the number of non-null assessment ids in the group.
# The dict order is the column order of the aggregated frame.
aggregations = {
    'gender': _first_row_value,
    'region': _first_row_value,
    'highest_education_level': _first_row_value,
    'age_range': _first_row_value,
    'completed': _first_row_value,
    'date_enrolled': _first_row_value,
    'date_unenrolled': _first_row_value,
    'course_length': _first_row_value,
    'assessment_id': 'count',
    'assessment_type': _first_row_value,
    'date': _first_row_value,
    'weight': _first_row_value,
    'date_submitted': _first_row_value,
    'score': _first_row_value,
    'student_review': _first_row_value,
    'upgraded': _first_row_value,
}

# groupby().agg() returns a new frame, so it has to be assigned back; a bare
# call leaves `result` at one row per assessment. as_index=False keeps
# student_id / course / run as ordinary columns for the cells that follow.
result = result.groupby(['student_id', 'course', 'run'], as_index=False).agg(aggregations)

result.head()

In [None]:
# Build one "<student_id>_<course>_<run>" key per row and let it take the
# place of the three separate identifier columns (it stays the first column).
student_course_run = result['student_id'].map(str) + '_' + result['course'] + '_' + result['run']

result = (
    result
    .assign(student_id=student_course_run)
    .rename(columns={'student_id': 'Student_course_Run_id'})
    .drop(columns=['course', 'run'])
)

result.head()

In [None]:
# Turning non-numeric values into numbers using LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Label-encode the target column; the fitted encoder stays available as `leup`.
leup = LabelEncoder()
result['upgraded'] = leup.fit_transform(result['upgraded'])

# Label-encode each categorical feature on its string form, so a missing value
# becomes the category 'nan'. Columns absent from the frame are skipped.
cat_cols = ['gender','region','highest_education_level','age_range','completed','date_enrolled','assessment_type']
present_cat_cols = [name for name in cat_cols if name in result.columns]
for col in present_cat_cols:
    le = LabelEncoder()
    result[col] = le.fit_transform(result[col].astype(str).values)

result.head()

In [None]:
# Distribution of the target (most students don't upgrade)
import seaborn as sns

# Assigning the returned Axes keeps its repr out of the cell output.
target_count_ax = sns.countplot(data=result, x='upgraded')

In [None]:
# Print the frame summary (dtype and non-null count per column). Columns whose
# non-null count is below the number of rows have missing values.
result.info()

In [None]:
# Checking the distribution of each column through its summary statistics
# (count, mean, std, min, quartiles, max).
result.describe()

In [None]:
# Distribution of columns ['date_enrolled', 'course_length', 'date', 'weight', 'score']

import matplotlib.pyplot as plt
%matplotlib inline

# Draw the five box plots side by side on one wide figure.
boxplot_columns = ['date_enrolled', 'course_length', 'date', 'weight', 'score']
fig, ax = plt.subplots(figsize=(16, 6))
boxplot = result.boxplot(column=boxplot_columns, ax=ax)

In [None]:
# Filling missing values: each listed column has its gaps replaced by that
# column's own mean.
for column in ('score', 'date_submitted', 'date_unenrolled', 'date'):
    result[column] = result[column].fillna(result[column].mean())

## Building Baseline Model

In [None]:
# For the baseline model, we will use just the numeric columns.
# In order to not lose the review effect, We will create a "student_review_len"
# column before removing the "student_review" column.

def add_review_features(df):
    """Add simple text statistics of the ``student_review`` column to ``df``.

    The frame is modified in place and also returned. ``student_review`` is
    normalised to ``str``; a missing review (NaN/None) becomes the empty string
    so it contributes a length, capital count and word count of 0 instead of
    being counted as the literal text ``'nan'``.

    Columns written:
        student_review            -- the review as a string ('' when missing)
        student_review_len        -- number of characters
        student_review_n_capitals -- number of upper-case characters
        student_review_n_words    -- number of whitespace-separated tokens

    Args:
        df: DataFrame with a ``student_review`` column.

    Returns:
        The same DataFrame object, with the columns above added or replaced.
    """
    reviews = df['student_review'].apply(lambda x: '' if pd.isna(x) else str(x))

    df['student_review'] = reviews
    df['student_review_len'] = reviews.apply(len)
    df['student_review_n_capitals'] = reviews.apply(
        lambda comment: sum(1 for c in comment if c.isupper())
    )
    # Raw string: '\S' inside a normal string literal is an invalid escape sequence.
    df['student_review_n_words'] = reviews.str.count(r'\S+')
    return df

# Attach the review statistics to `result`.
result = add_review_features(result)

# The baseline frame leaves out the assessment id and the raw review text.
data = result.drop(columns=['assessment_id', 'student_review'])
data.head()

In [None]:
# Splitting data into 80% training and 20% test
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Target, then features: everything except the row identifier and the target.
y = data.upgraded
X = data.drop(columns=['Student_course_Run_id', 'upgraded'])

# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fitted on every row here, i.e. before the train/test split.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Stratified 5-fold cross-validation without shuffling. random_state must stay
# unset here: scikit-learn rejects a random_state when shuffle is False.
skf = StratifiedKFold(n_splits=5)
print(skf)

# Accuracies and F-Scores across k folds
accs, fsc = [], []

for train_index, test_index in skf.split(X, y):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit a fresh random forest on this fold's training rows
    clf = RandomForestClassifier(n_estimators=10, random_state=10)
    clf = clf.fit(X_train, y_train)
    # Predict the response for the held-out rows of this fold
    y_pred = clf.predict(X_test)

    # Evaluate performance (each metric is computed once and reused)
    fold_accuracy = metrics.accuracy_score(y_test, y_pred)
    fold_f1 = metrics.f1_score(y_test, y_pred)
    print("Fold Accuracy:", fold_accuracy)
    print("Fold F1-Score:", fold_f1, end='\n\n')
    accs.append(fold_accuracy)
    fsc.append(fold_f1)

print("Overall Accuracy: {:0.2f} +/- {:0.2f}".format(np.mean(accs), np.std(accs)))
print("Overall F1-Score: {:0.2f} +/- {:0.2f}".format(np.mean(fsc), np.std(fsc)))

## Word Clouds

In [None]:
# First we plot word clouds for the two classes (upgrade) and (not upgrade).
# We can see that words like "Great" are indicators for the decision of the student

from wordcloud import WordCloud
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

n_posts = 1000
data = result

# (target label, panel title) for the left and the right panel.
panels = [
    (0, 'Top words studnet review (Not upgrade)'),
    (1, 'Top words studnet review (upgrade)'),
]

fig, ax = plt.subplots(1, 2, figsize=(22, 6))
for axis, (label, title) in zip(ax, panels):
    # Lower-cased text of at most n_posts reviews carrying this label
    reviews = data[data['upgraded'] == label]['student_review'].str.lower().values[:n_posts]
    cloud = WordCloud(
        max_words=20, scale=2, stopwords=stop,
        contour_width=3, contour_color='steelblue',
    ).generate(' '.join(reviews))

    axis.imshow(cloud)
    axis.set_title(title, fontsize=20)
    axis.axis("off")

plt.show()

## Creating a model using the student review (BERT Large)

In [None]:
## Official Tokenizer: create input_ids, input_masks, and segment_ids
# Downloads tokenization.py into the working directory so that the later
# `import tokenization` can find it.
# NOTE(review): the URL points at the master branch, not a pinned commit, so
# the file that is fetched can change over time.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
# Importing libraries
import os, re, pickle
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Tensorflow imports
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [None]:
def nlp_preprocessing(text):
    """Lower-case a review and keep only ASCII letters, digits and single spaces.

    Punctuation, symbols and non-ASCII characters are removed; runs of
    whitespace (spaces, tabs, newlines) collapse to one space and surrounding
    whitespace is trimmed.

    ``str.replace`` only replaces literal substrings, so using it with a list
    of characters or a regex-looking pattern never matches anything. The
    filtering is therefore done with ``re.sub``; ``re`` is imported in the
    notebook's imports cell.

    Args:
        text: The review as a ``str``.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Drop everything that is not a-z, 0-9 or whitespace
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Tabs/newlines and gaps left by removed characters become a single space,
    # so neighbouring words are not glued together
    return re.sub(r'\s+', ' ', text).strip()

# Run every review through nlp_preprocessing and store the result back in the frame.
result["student_review"] = result["student_review"].map(nlp_preprocessing)

In [None]:
# Hold out 15% of the rows. The frame is passed once: passing it a second time
# only yields duplicate halves that have to be thrown away, and unpacking them
# into `_` shadows IPython's last-output variable.
X_train, X_test = train_test_split(result, test_size=0.15, random_state=42)

### Helper Functions:

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT inputs.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]``, converted to ids and right-padded with 0 up to
    ``max_len``.

    Args:
        texts: Iterable of strings.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays with one row
        per text. ``masks`` is 1 on real tokens and 0 on padding;
        ``segment_ids`` is all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    content_budget = max_len - 2  # two positions are reserved for [CLS] and [SEP]

    for raw_text in texts:
        word_pieces = tokenizer.tokenize(raw_text)[:content_budget]
        sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        n_real = len(sequence)
        n_pad = max_len - n_real

        token_ids = tokenizer.convert_tokens_to_ids(sequence)
        token_rows.append(token_ids + [0] * n_pad)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
def build_model(bert_layer, n_num, max_len=512):
    """Build a binary classifier on BERT text features plus numeric features.

    The vector at sequence position 0 of the BERT output (the ``[CLS]``
    position when inputs come from ``bert_encode``) is concatenated with the
    numeric input and fed to a single sigmoid unit.

    Args:
        bert_layer: Callable Keras layer taking ``[word_ids, mask, segment_ids]``
            and returning two outputs, the second being the per-token sequence
            output. NOTE(review): this output order is assumed from the
            unpacking below -- confirm against the hub module in use.
        n_num: Number of numeric features per sample.
        max_len: Token sequence length of the three text inputs.

    Returns:
        A compiled ``tf.keras`` ``Model`` with inputs
        ``[input_word_ids, input_mask, segment_ids, input_num]``, binary
        cross-entropy loss and an accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only the vector at position 0 of every sequence
    clf_output = sequence_output[:, 0, :]

    input_num = Input(shape=(n_num,))
    hidden = concatenate([clf_output, input_num])
    #hidden = Dense(10, activation='relu')(hidden)

    out = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids, input_num], outputs=out)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by current Keras releases.
    model.compile(Adam(learning_rate=2e-2), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
# Load BERT from Tensorflow Hub.
# trainable=False is passed, so the layer's weights are not updated by training.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=False)

In [None]:
# Build the tokenizer from the vocabulary file and casing flag shipped with the BERT layer
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [None]:
# Scaling Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Drop the assessment id column. errors='ignore' lets this cell be re-run
# after the column is already gone instead of raising a KeyError.
result = result.drop(columns=['assessment_id'], errors='ignore')
X_num  = result.drop(columns=['Student_course_Run_id', 'upgraded', 'student_review'])
X_text = result["student_review"]
y      = result.upgraded

# Standardize features by removing the mean and scaling to unit variance.
# Column names and the row index are carried over so the scaled frame stays
# readable and aligned with `y`.
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
X_num_scaled = pd.DataFrame(X_num_scaled, columns=X_num.columns, index=X_num.index)

In [None]:
%%time
# Encode the text into tokens, masks, and segment flags.
# The reviews are read straight from `result` rather than from `X_text`: this
# cell rebinds `X_text` to a tuple of arrays, so reading `X_text.values` would
# fail when the cell is run a second time.
X_text = bert_encode(result["student_review"].values, tokenizer, max_len=50)

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import metrics

# Stratified 5-fold cross-validation without shuffling. random_state must stay
# unset here: scikit-learn rejects a random_state when shuffle is False.
skf = StratifiedKFold(n_splits=5)

# Accuracies, precisions and F-scores across k folds
accs, precs, fsc = [], [], []

# Numeric input shape
n_num = X_num.shape[-1]
Bsize = 128 #256

for train_index, test_index in skf.split(X_num, y):

    # print("TRAIN:", train_index, "TEST:", test_index)
    # Standardize the numeric features using statistics from this fold's
    # training rows only, so the held-out rows do not influence the scaling.
    fold_scaler = StandardScaler().fit(X_num.iloc[train_index])
    X_num_train = fold_scaler.transform(X_num.iloc[train_index])
    X_num_test = fold_scaler.transform(X_num.iloc[test_index])

    X_text_ids_train, X_text_ids_test = X_text[0][train_index], X_text[0][test_index]
    X_text_masks_train, X_text_masks_test = X_text[1][train_index], X_text[1][test_index]
    X_text_seg_train, X_text_seg_test = X_text[2][train_index], X_text[2][test_index]

    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create a fresh model for this fold
    model = build_model(bert_layer, n_num, max_len=50)

    # Train the classifier on this fold's training rows
    train_history = model.fit(
        [X_text_ids_train, X_text_masks_train, X_text_seg_train, X_num_train],
        y_train,
        validation_split=0.2,
        epochs=3,
        batch_size=Bsize
    )

    # model.save('model.h5')

    # Predict the response for the held-out rows of this fold
    y_pred = model.predict([X_text_ids_test, X_text_masks_test, X_text_seg_test, X_num_test],
                           batch_size=Bsize)
    # Round the sigmoid outputs once into a flat vector of 0/1 labels
    y_label = y_pred.round().ravel().astype(int)

    # Evaluate performance (each metric is computed once and reused)
    fold_accuracy = metrics.accuracy_score(y_test, y_label)
    fold_precision = metrics.precision_score(y_test, y_label)
    fold_f1 = metrics.f1_score(y_test, y_label)
    print("Fold Accuracy:", fold_accuracy)
    print("Fold Precision:", fold_precision)
    print("Fold F1-Score:", fold_f1, end='\n\n')

    accs.append(fold_accuracy)
    precs.append(fold_precision)
    fsc.append(fold_f1)

print("Overall Accuracy: {:0.2f} +/- {:0.2f}".format(np.mean(accs), np.std(accs)))
print("Overall Precision: {:0.2f} +/- {:0.2f}".format(np.mean(precs), np.std(precs)))
print("Overall F1-Score: {:0.2f} +/- {:0.2f}".format(np.mean(fsc), np.std(fsc)))

In [None]:
# Build one more model with the same settings as in the cross-validation loop
# and print its layer-by-layer summary.
model = build_model(bert_layer, n_num, max_len=50)
model.summary()

## Conclusion:
...

### References:
- https://www.tensorflow.org/hub/migration_tf2
- https://www.tensorflow.org/hub/tf2_saved_model