# Machine Learning Implementation (Using Sequential Information)

## Connect to GoogleDrive

In [None]:
# Mount Google Drive at /content/drive so the project's files can be read and
# written through ordinary file paths from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Work from the shared project folder so the relative pickle file names used
# by the cells below resolve against it.
%cd /content/drive/MyDrive/Colab\ Notebooks/CIS5930_Project/
# List the folder contents as a quick sanity check.
%ls

/content/drive/.shortcut-targets-by-id/1K9uiTsAwFaNNqySxib53SuBRuxArSWES/CIS5930_Project
'Arunima-CIS 5930 Project: Data Wrangling.ipynb'
'CIS 5930 Project: Data Wrangling.ipynb'
'CIS 5930 Project: ML Model.ipynb'
'CIS 5930 Project: ML Results.ipynb'
 cv_results_LEDE3.pickle
 cv_results_lstm_bi25_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_bi50_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_bi75_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_uni25_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_uni50_embeddings_only_epochs1_top3.pickle
 cv_results_nn2525_embeddings_only_cw_top3_epochs50.pickle
 cv_results_nn2550_embeddings_only_cw_top3_epochs50.pickle
 cv_results_nn5050_embeddings_only_cw_top3_epochs50.pickle
 cv_results_textrank.pickle
 cv_results_top_3_logreg_cw_balanced.pickle
 cv_results_top_3_logreg_default.pickle
 cv_results_top_3_logreg_elasticnet_gridsearch.pickle
 cv_results_top_3_logreg_elasticnet.pickle
 cv_results_top_3_logreg_sent_num_bal.pickle
 

## Install the dependencies 

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


## Helper Functions

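return_df_pred_summaries: returns the predicted summaries, built either from a fixed number of top-scoring sentences per article or from all sentences whose score reaches a threshold (with a minimum number of sentences guaranteed).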

In [None]:
###Sub-function used in return_df_pred_summaries

def return_greater_than_min_num(arr, thresh=0.5, min_num=1, fix_num_flag=False, fix_num=3):
    '''Select the sentence positions to keep for a single article.

    Parameters
    ----------
    arr : array-like of float
        One score per sentence; it is flattened to 1-D before use.
    thresh : float
        Used only when ``fix_num_flag`` is false: positions whose score is
        greater than or equal to ``thresh`` are selected.
    min_num : int
        Used only when ``fix_num_flag`` is false: if thresholding selects
        ``min_num`` positions or fewer, the ``min_num`` highest-scoring
        positions are returned instead.
    fix_num_flag : bool
        When true, ``thresh`` and ``min_num`` are ignored and the ``fix_num``
        highest-scoring positions are returned.
    fix_num : int
        Number of positions returned when ``fix_num_flag`` is true (fewer if
        ``arr`` is shorter).

    Returns
    -------
    list of int
        The selected positions, flat and in ascending order.
    '''
    scores = np.asarray(arr).ravel()

    def _top_k(k):
        # scores[-0:] is the *whole* array, so a non-positive k has to be
        # handled explicitly to mean "select nothing".
        if k <= 0:
            return np.array([], dtype=int)
        return np.argsort(scores)[-k:]

    if fix_num_flag:
        #want a fixed number of sentences
        idx = _top_k(fix_num)
    else:
        #keep everything at or above the model threshold
        # np.where returns a tuple holding one index array per dimension;
        # take element 0 so that a flat index array is returned, not the tuple.
        idx = np.where(scores >= thresh)[0]

        #fall back to the best min_num sentences when too few pass
        if idx.shape[0] <= min_num:
            idx = _top_k(min_num)

    #return in ascending order
    return sorted(idx.tolist())


###Main helper function    
def return_df_pred_summaries( Xy_doc_label, y_pred, df_text, thresh, min_num,
                             return_all=False, fix_num_flag=False, fix_num=3):
    '''Build one predicted summary per document from sentence-level scores.

    Parameters
    ----------
    Xy_doc_label : array-like
        Document label of every sentence; flattened to 1-D.
    y_pred : array-like of float
        Score of every sentence, aligned element-for-element with
        ``Xy_doc_label``; flattened to 1-D.
        NOTE(review): scores are assumed to appear in sentence order within
        each document, since positions in the grouped score list are used to
        index that document's sentence list -- confirm against the caller.
    df_text : pandas.Series
        Indexed by document label; each value is that document's sequence of
        sentences.
    thresh, min_num, fix_num_flag, fix_num
        Forwarded unchanged to ``return_greater_than_min_num``.
    return_all : bool
        If true, also return the selected sentence positions and the document
        labels.

    Returns
    -------
    list of str
        The predicted summaries (selected sentences joined by a single space),
        one per document in ascending document-label order, when
        ``return_all`` is false.
    tuple
        ``(positions, doc_labels, pred_summaries)`` when ``return_all`` is
        true, where ``positions`` is the ``.values`` array of the one-column
        frame of selected positions and ``doc_labels`` is its index.
    '''
    #Wrangle to doc label and flattened array of predictions for each article
    df_label_pred = pd.DataFrame({'doc_label': np.asarray(Xy_doc_label).ravel(),
                                  'y_pred': np.asarray(y_pred).ravel()})
    df_label_pred = df_label_pred.groupby('doc_label').agg(list)

    #get sorted index sentence numbers to include in article
    # Series.apply on the single score column replaces the deprecated
    # DataFrame.applymap and keeps the frame's one-column layout.
    df_label_pred['y_pred'] = df_label_pred['y_pred'].apply(
        lambda doc_scores: return_greater_than_min_num(
            np.array(doc_scores), thresh=thresh, min_num=min_num,
            fix_num_flag=fix_num_flag, fix_num=fix_num))

    #Return predicted summary
    # index is doc label, so select the matching articles explicitly by label
    df_doc = df_text.loc[df_label_pred.index]

    pred_summaries = []
    for sentences, keep in zip(df_doc, df_label_pred['y_pred']):
        # pick the selected sentences of this article as a list
        chosen = np.array(sentences)[keep].tolist()
        # join into summary as single string
        pred_summaries.append(chosen if isinstance(chosen, str)
                              else ' '.join(chosen))

    if return_all:
        answer = df_label_pred.values, df_label_pred.index, pred_summaries
    else:
        answer = pred_summaries

    return answer

calc_rouge_scores: calculates average ROUGE scores across multiple predicted and gold summary pairs (imported from the local `functions` module in the cells below).

## Supervised Learning Including Sequential Information

 ### 1 Logistic Regression Models

In [None]:
import pickle
import numpy as np
import pandas as pd
# NOTE(review): this import shadows the return_df_pred_summaries defined in the
# "Helper Functions" cell of this notebook -- confirm both versions agree.
from functions import return_df_pred_summaries
from functions import calc_rouge_scores
from sklearn.linear_model import LogisticRegression

from datetime import datetime as dt

from sklearn.metrics import confusion_matrix

input_filename = 'train_test_set20_embeddings_sent_num.pickle'

#output_file =  'cv_results_top_3_logreg_sent_num_no_bal.pickle'
output_file = 'cv_results_top_3_logreg_sent_num_bal.pickle'

# The default iteration limit stopped the solver before convergence
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give it more room.
MAX_ITER = 1000

t1 = dt.now()
print(t1)

# Unpickling executes code embedded in the file: only load pickles that were
# produced by this project.
data_dict = pd.read_pickle(input_filename)

#Specify model inputs: df, X, y, doc_labels
df = data_dict['df_original']
train_test_set = data_dict['train_test_sets']
#Specify train-test_data for validation
# Only the first entry of train_test_sets is used; its six items are read in
# the order (train doc labels, test doc labels, X_train, X_test, y_train, y_test).
Xy_doc_label_train = train_test_set[0][0]
Xy_doc_label_test = train_test_set[0][1]
X_train = train_test_set[0][2]
X_test = train_test_set[0][3]
y_train = train_test_set[0][4]
y_test = train_test_set[0][5]

#Define Model
#model = LogisticRegression(random_state=42)
model = LogisticRegression(class_weight='balanced', random_state=42,
                           max_iter=MAX_ITER)
#Fit model
# scikit-learn expects a 1-D target; flattening avoids the column-vector
# DataConversionWarning raised by column_or_1d.
model.fit(X_train, np.ravel(y_train))
#Predict Model
# predict_proba returns one column per class; column 1 is the positive class.
y_pred = model.predict_proba(X_test)

#Convert to binary predictions
y_pred_bin = (y_pred >= 0.5) * 1

cm = confusion_matrix(np.ravel(y_test), y_pred_bin[:, 1], labels=[0, 1])


#Return predicted summaries
# fix_num_flag=True: always take the 3 highest-scoring sentences, so thresh
# and min_num are passed but not used for the selection.
idx, doc_index, pred_summaries = return_df_pred_summaries(Xy_doc_label_test,
                                y_pred[:, 1], df.text_clean, thresh=0.5, min_num=1,
                                return_all=True, fix_num_flag=True, fix_num=3)


#Match with gold summaries
# doc_index holds document labels, so look the gold summaries up by label.
df_gold = df.summary_clean.loc[doc_index]
gold_summaries = [' '.join(df_gold.iloc[j]) for j in range(len(pred_summaries))]
summaries_comp = tuple(zip(pred_summaries, gold_summaries))

scores = calc_rouge_scores(pred_summaries, gold_summaries,
                                  keys=['rouge1', 'rougeL'], use_stemmer=True)


results_dict = {'conf_matrix': cm, 'summaries_comp': summaries_comp,
                'sent_index_number': idx, 'Rouge': scores}

with open(output_file, 'wb') as handle:
    pickle.dump(results_dict, handle)

print(scores)

t2 = dt.now()
print(t2)
print(t2-t1)


2022-04-18 17:18:12.228738


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'rouge1': {'recall': 0.5007132798746506, 'precision': 0.6637591207074188, 'f1': 0.501629633281413}, 'rougeL': {'recall': 0.4704132360453848, 'precision': 0.6239325668269886, 'f1': 0.4719740959659388}}
2022-04-18 17:19:14.183477
0:01:01.954739


 ### 2 Long Short-Term Memory (LSTM)
 
General Implementation

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import TimeDistributed
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from functions import calc_rouge_scores
from keras.layers import Bidirectional


input_filename = 'train_test_set20_embeddings_only.pickle'
#output_file = 'XXX.pickle'
#output_file = 'cv_results_lstm_uni25_embeddings_only_epochs1_top3.pickle'
#output_file = 'cv_results_lstm_uni50_embeddings_only_epochs1_top3.pickle'
#output_file = 'cv_results_lstm_bi25_embeddings_only_epochs1_top3.pickle'
#output_file = 'cv_results_lstm_bi50_embeddings_only_epochs1_top3.pickle'
output_file = 'cv_results_lstm_bi75_embeddings_only_epochs1_top3.pickle'

# Number of highest-scoring sentences that make up each predicted summary.
TOP_K = 3
# Fix the random seeds so that weight initialisation, and therefore the saved
# results, can be reproduced from run to run.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Unpickling executes code embedded in the file: only load pickles that were
# produced by this project.
data_dict = pd.read_pickle(input_filename)

df = data_dict['df_original']

#step 1: process data for lstm input

#convert to numpy array and add a leading batch axis of size 1, so that every
#document is fed to the network as a single sequence of its own length
df['text_embedding'] = df['text_embedding'].apply(lambda x: np.array(x))
df['labels'] = df['labels'].apply(lambda x: np.array(x))
df['text_embedding'] = df['text_embedding'].apply(
    lambda x: x.reshape(1, x.shape[0], x.shape[1]))
df['labels'] = df['labels'].apply(lambda x: x.reshape(1, len(x), 1))

#train_test split
train_doc_labels = set(data_dict['train_test_sets'][0][0].flatten())
mask_train = np.array([x in train_doc_labels for x in df.index])

X_train = df.text_embedding[mask_train].tolist()
y_train = df.labels[mask_train].tolist()

X_test = df.text_embedding[~mask_train].tolist()
y_test = df.labels[~mask_train].tolist()


# define LSTM
model = Sequential()

#model.add(LSTM(25, input_shape=(None, 768), return_sequences=True, dropout=0))
#model.add(LSTM(50, input_shape=(None, 768), return_sequences=True, dropout=0))

#model.add(Bidirectional(LSTM(25, return_sequences=True, dropout=0), input_shape=(None, 768)))
#model.add(Bidirectional(LSTM(50, return_sequences=True, dropout=0), input_shape=(None, 768)))
model.add(Bidirectional(LSTM(75, return_sequences=True, dropout=0), input_shape=(None, 768)))


# one sigmoid score per sentence (time step)
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=[tf.keras.metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)])


# train LSTM
# Documents differ in length, so each one is fitted as its own batch of one
# sequence; the whole training set is visited once.
training_loss = []
for j in range(len(X_train)):
    X, y = X_train[j], y_train[j]
    history = model.fit(X, y, epochs=1, batch_size=1)
    training_loss.append(history.history['loss'])

# evaluate LSTM
y_pred_list = []
idx_list = []
for j in range(len(X_test)):
    X = X_test[j]
    y_pred = model.predict(X, verbose=0)
    # positions of the TOP_K highest-scoring sentences, in document order
    idx = sorted(np.argsort(y_pred[0].flatten())[-TOP_K:])
    y_pred_list.append(y_pred)
    idx_list.append(idx)


#retrieve summary pairs
doc_index = df.index[~mask_train]
# select the test articles once instead of once per document
df_text_test = df.text_clean[doc_index]
pred_summaries = [' '.join(np.array(df_text_test.iloc[j])[np.array(idx_list[j])].tolist())
                  for j in range(len(idx_list))]
df_gold = df.summary_clean[doc_index]
gold_summaries = [' '.join(df_gold.iloc[j]) for j in range(len(pred_summaries))]
summaries_comp = tuple(zip(pred_summaries, gold_summaries))


#calculate rouge score
scores = calc_rouge_scores(pred_summaries, gold_summaries,
                                  keys=['rouge1', 'rougeL'], use_stemmer=True)

# model.summary() only prints and returns None, so capture its lines in order
# to store the architecture description with the results. **kwargs absorbs any
# extra keyword arguments that some Keras versions pass to print_fn.
summary_lines = []
model.summary(print_fn=lambda line, **kwargs: summary_lines.append(line))
model_summary = '\n'.join(summary_lines)
print(model_summary)

# sent_index_number holds the selected positions of *every* test document
# (idx_list), not only those of the last document processed.
results_dict = {'summaries_comp': summaries_comp,
                'sent_index_number': idx_list, 'Rouge': scores,
                'mod_summary': model_summary}

with open(output_file, 'wb') as handle:
    pickle.dump(results_dict, handle)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, None, 150)        506400    
 l)                                                              
                                                                 
 time_distributed_1 (TimeDis  (None, None, 1)          151       
 tributed)                                                       
                                                                 
Total params: 506,551
Trainable params: 506,551
Non-trainable params: 0
_________________________________________________________________


 ### 3 LEDE3
 
Implementation

In [None]:
import pickle
import pandas as pd
import numpy as np
from functions import calc_rouge_scores

from datetime import datetime as dt

# LEDE-3 baseline: the predicted summary of an article is simply its first
# three sentences; no model is trained.
input_filename = 'train_test_set20_embeddings_sent_num.pickle'
output_file = 'cv_results_LEDE3.pickle'

t1 = dt.now()
print(t1)

data_dict = pd.read_pickle(input_filename)

# Original frame, indexed by document label
df = data_dict['df_original']

# Every document whose label is not among the training labels of the first
# train/test split is treated as a test document.
train_doc_labels = set(data_dict['train_test_sets'][0][0].flatten())
mask_train = np.array([label in train_doc_labels for label in df.index])
mask_test = ~mask_train

# Predicted summaries: leading three sentences of each test article, joined
# by a single space. The slice is always an array, so it is joined directly.
df_doc = df.loc[mask_test, 'text_clean']
pred_summaries = [' '.join(np.array(sentences)[:3]) for sentences in df_doc]

# Gold summaries of the same documents, in the same order
df_gold = df.loc[mask_test, 'summary_clean']
gold_summaries = [' '.join(gold_sentences) for gold_sentences in df_gold]
summaries_comp = tuple(zip(pred_summaries, gold_summaries))

scores = calc_rouge_scores(
    pred_summaries,
    gold_summaries,
    keys=['rouge1', 'rougeL'],
    use_stemmer=True,
)

results_dict = dict(summaries_comp=summaries_comp, Rouge=scores)

with open(output_file, 'wb') as out_file:
    pickle.dump(results_dict, out_file)

print(scores)

t2 = dt.now()
print(t2)
print(t2 - t1)

2022-04-18 18:26:10.084973
{'rouge1': {'recall': 0.5881750587058888, 'precision': 0.7569710610627568, 'f1': 0.5888699121981296}, 'rougeL': {'recall': 0.5719198985899687, 'precision': 0.735474681383238, 'f1': 0.5738612624177114}}
2022-04-18 18:26:36.621890
0:00:26.536917
