In [111]:
import pandas as pd
import numpy as np
import glob as glob
import os
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Embedding, Dot
from tensorflow.keras.optimizers import Adam, Adagrad
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
# from data_parser import DataParser
# from skip_gram import SkipGramModel

In [112]:
courses = pd.read_csv('/Users/rsciagli/documents/Fall2020/BAR/STU_CRS_TBL_full.csv', encoding='latin')
#courses.head()

In [114]:
#plt.hist(courses['CRS_SUBJ_DEPT_CD'].value_counts(), log=True, bins=[0,10,20,30,40,50,60,70,80,90,100])
crs_dept_cd = courses['CRS_SUBJ_DEPT_CD'].value_counts()
big_dept = crs_dept_cd[crs_dept_cd>10].index
pruned_dept = np.where(courses['CRS_SUBJ_DEPT_CD'].isin(big_dept),courses['CRS_SUBJ_DEPT_CD'],'other_dept')
#pd.Series(pruned_dept).value_counts()['other_dept']

In [115]:
subj_by_id = courses['CRS_ID'].value_counts()
big_subj_by_id = subj_by_id[subj_by_id>10].index
pruned_id = np.where(courses['CRS_ID'].isin(big_subj_by_id),courses['CRS_ID'],pruned_dept)
#pd.Series(pruned_id).value_counts()

In [116]:
courses['crs_ofcl_grd_nbr'.upper()].unique()
mask_type = courses['CRS_TYPE']=='ENRL'
discarded_grades = ['ZZ']
discarded_transfer_crs = ['Y']                          
mask_grade = ~courses['CRS_OFCL_GRD_CD'].isin(discarded_grades)
mask_agg_id = ~courses['EARNED_BFORE_COHORT'].isin(discarded_transfer_crs)                           
mask = mask_grade&mask_type&mask_agg_id
crs_embed_subset = courses[mask]

In [120]:
courses_sample = crs_embed_subset.sample(frac=0.25)

In [121]:
crs_df = pd.DataFrame(courses_sample)
crs_df['agg_id'] = crs_df['CRS_ID'].astype(str)

In [7]:
#courses_sample.columns

In [10]:
#plt.hist(crs_df['agg_id'].value_counts()

In [122]:
embedding_id = list(crs_df['agg_id'].unique())
course_to_id = dict([(name, i) for i, name in enumerate(embedding_id)])

def make_set(df):
    return set(df['agg_id'].map(course_to_id))

#dummy = crs_df.groupby(['PRSN_UNIV_ID','ACAD_TERM_CD'])

In [123]:
#pd.Series(embedding_id).value_counts()

In [124]:
# dummy = crs_df.groupby(['PRSN_UNIV_ID','ACAD_TERM_CD'])
# len(dummy)
agg_course_to_dept = dict(zip(crs_df['agg_id'],crs_df['CRS_SUBJ_DEPT_CD']))

In [126]:
departments = [agg_course_to_dept[identifier] for identifier in embedding_id]
#agg_course_to_dept

In [127]:
unique_students = list(crs_df['PRSN_UNIV_ID'].unique())
#pd.Series(unique_students).unique()
#pd.Series(unique_students).values()

In [128]:
#unique_students = crs_df['PRSN_UNIV_ID'].unique()
np.random.shuffle(unique_students)
n_train = int(0.8*len(unique_students))
train_students = unique_students[:n_train]
valid_students = unique_students[n_train:]
crs_df_train = crs_df[crs_df['PRSN_UNIV_ID'].isin(train_students)]
crs_df_valid = crs_df[crs_df['PRSN_UNIV_ID'].isin(valid_students)]

In [129]:
print(len(crs_df_train.groupby(['PRSN_UNIV_ID','ACAD_TERM_CD'])))
print(len(train_students))
len(unique_students)

516070
101546


126933

In [130]:
print(len(crs_df_valid.groupby(['PRSN_UNIV_ID','ACAD_TERM_CD'])))
len(valid_students)

128754


25387

In [131]:
def train_generator(): 
    negative_courses = crs_df['agg_id'].map(course_to_id)
    n_neg = len(negative_courses)
    while True:
        for (student, term), df in crs_df_train.groupby(['PRSN_UNIV_ID','ACAD_TERM_CD']):
            courses_set = make_set(df)
            if len(courses_set) > 1:
                for crs_1 in courses_set:
                    contexts = []
                    courses_x = []
                    matches = []
                    for crs_2 in courses_set: 
                        x = crs_1
                        y = crs_2
                        if x!=y:
                            context = list(negative_courses.iloc[np.random.choice(n_neg,4)]) + [y]
                            course = 5*[x]
                            match = [0,0,0,0,1]
                            contexts.append(np.array(context).reshape(5,1))
                            courses_x.append(np.array(course).reshape(5,1))
                            matches.append(np.array(match).reshape(5,1))
                    contexts = np.concatenate(contexts, axis=0)
                    courses_x = np.concatenate(courses_x, axis=0)
                    matches = np.concatenate(matches, axis=0)
                    yield [contexts, courses_x], matches
                
def valid_generator(): 
    negative_courses = crs_df['agg_id'].map(course_to_id)
    n_neg = len(negative_courses)
    while True:
        for (student, term), df in crs_df_valid.groupby(['PRSN_UNIV_ID','ACAD_TERM_CD']):
            courses_set = make_set(df)
            if len(courses_set) > 1:
                for crs_1 in courses_set:
                    contexts = []
                    courses_x = []
                    matches = []
                    for crs_2 in courses_set: 
                        x = crs_1
                        y = crs_2
                        if x!=y:
                            context = list(negative_courses.iloc[np.random.choice(n_neg,4)]) + [y]
                            course = 5*[x]
                            match = [0,0,0,0,1]
                            contexts.append(np.array(context).reshape(5,1))
                            courses_x.append(np.array(course).reshape(5,1))
                            matches.append(np.array(match).reshape(5,1))
                    contexts = np.concatenate(contexts, axis=0)
                    courses_x = np.concatenate(courses_x, axis=0)
                    matches = np.concatenate(matches, axis=0)
                    yield [contexts, courses_x], matches

In [23]:
# negative_courses = crs_df['agg_id'].map(course_to_id)
# n_neg = len(negative_courses)
# context = np.random.choice(n_neg,4)
# context = list(negative_courses.iloc[np.random.choice(n_neg,4)]) + [y]
# context

In [24]:
# for dummy in train_generator():
#     print(dummy)

#sum(crs_df['CRS_ID']==19)

In [132]:
embed_dim = 10
optimizer = Adam(lr=0.001)

# def _build_model(course_input):
#     input_ = Input(shape=x.shape[1:], name='Course ids')
#     embed = Embedding(len(embedding_id), embed_dim, name='Course embedding')(input_)
#     output = Dense(len(embedding_id), activation='softmax', name='Course probabilities')(embed) 
    
#     model = Model(inputs=input_,outputs=output, name='Model')
#     model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
#     return model

input_course_ = Input(shape=(1,), name='Course_ids')
input_context_ = Input(shape=(1,), name='contxt')
embed = Embedding(len(embedding_id), embed_dim, name='Course_embedding')(input_course_)
embed2 = Embedding(len(embedding_id), embed_dim, name='Context_embedding')(input_context_)
output = Dot(-1)([embed, embed2])
sigmoid = keras.activations.sigmoid(output)
model = Model(inputs=[input_course_, input_context_],outputs=output, name='Model')
print(model.summary())

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])

Model: "Model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Course_ids (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
contxt (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
Course_embedding (Embedding)    (None, 1, 10)        91160       Course_ids[0][0]                 
__________________________________________________________________________________________________
Context_embedding (Embedding)   (None, 1, 10)        91160       contxt[0][0]                     
______________________________________________________________________________________________

In [133]:
filepath="skip_gram_weights/skip_gram.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model.fit(train_generator(), validation_data=valid_generator(), callbacks=callbacks_list,
          steps_per_epoch = 516070, validation_steps = 128754, epochs = 150)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 516070 steps, validate for 128754 steps
Epoch 1/150
Epoch 00001: val_loss improved from inf to 1.32448, saving model to skip_gram_weights/skip_gram.hdf5
Epoch 2/150
Epoch 00002: val_loss improved from 1.32448 to 1.28020, saving model to skip_gram_weights/skip_gram.hdf5
Epoch 3/150
Epoch 00003: val_loss improved from 1.28020 to 1.22207, saving model to skip_gram_weights/skip_gram.hdf5
Epoch 4/150
Epoch 00004: val_loss did not improve from 1.22207
Epoch 5/150
Epoch 00005: val_loss did not improve from 1.22207
Epoch 6/150
Epoch 00006: val_loss did not improve from 1.22207
Epoch 7/150
Epoch 00007: val_loss did not improve from 1.22207
Epoch 8/150


KeyboardInterrupt: 

In [None]:
embedding_matrix = model.layers[2].get_weights()[0]
X = embedding_matrix
pca = PCA(n_components = 2)
pca.fit(X)

X_trans = pca.transform(X)

plt.scatter(X_trans[:,0],X_trans[:,1], c=departments)

In [None]:
##dict that will be start date per term code
##df[term_start] = df['ACAD_TERM_CD'].map(dict from above)
## mask_dropdate = (df[drop data] - df[term_start]) < some number of days
## mask = mask_grade | mask_dropdate
## crs_embed_subset = courses[mask]
courses_sample['ACAD_TERM_CD'].unique()

In [None]:
# student_attr = pd.read_excel('/Users/rsciagli/documents/Fall2020/BAR/student_attribute_table_full.xlsx')
# student_attr.head()