# Machine Learning Implementation (Without Sequential Information)

## Connect to Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the project
# folder and its pickle files are reachable; force_remount=True asks Colab to
# redo the mount even if one is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Make the shared project folder the working directory and list its contents.
# The cells below read and write their pickle files by bare filename, so they
# rely on this directory change having been run first.
%cd /content/drive/MyDrive/Colab\ Notebooks/CIS5930_Project/
%ls

/content/drive/.shortcut-targets-by-id/1K9uiTsAwFaNNqySxib53SuBRuxArSWES/CIS5930_Project
'Arunima-CIS 5930 Project: Data Wrangling.ipynb'
'CIS 5930 Project: Data Wrangling.ipynb'
'CIS 5930 Project: ML Model.ipynb'
'CIS 5930 Project: ML Results.ipynb'
 cv_results_LEDE3.pickle
 cv_results_lstm_bi25_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_bi50_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_bi75_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_uni25_embeddings_only_epochs1_top3.pickle
 cv_results_lstm_uni50_embeddings_only_epochs1_top3.pickle
 cv_results_nn2525_embeddings_only_cw_top3_epochs50.pickle
 cv_results_nn2550_embeddings_only_cw_top3_epochs50.pickle
 cv_results_nn5050_embeddings_only_cw_top3_epochs50.pickle
 cv_results_textrank.pickle
 cv_results_top_3_logreg_cw_balanced.pickle
 cv_results_top_3_logreg_default.pickle
 cv_results_top_3_logreg_elasticnet_gridsearch.pickle
 cv_results_top_3_logreg_elasticnet.pickle
 cv_results_top_3_logreg_sent_num_bal.pickle
 

## Install the dependencies 

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


## Helper Functions

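`return_df_pred_summaries`: returns the predicted summary for each article, built either from a fixed number of top-scoring sentences or from every sentence whose score clears a threshold (subject to a minimum number of sentences).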

In [None]:
###Sub-function used in return_df_pred_summaries

def return_greater_than_min_num(arr, thresh=0.5, min_num=1, fix_num_flag=False, fix_num=3):

    '''Return the positions of the selected scores in ascending order.

    Parameters
    ----------
    arr : array-like of scores (flattened to 1-D before use).
    thresh : scores >= thresh are selected when fix_num_flag is False.
    min_num : when fix_num_flag is False and no more than min_num scores
        reach thresh, the min_num highest scores are selected instead.
    fix_num_flag : if true, ignore thresh/min_num and always select the
        fix_num highest scores.
    fix_num : number of scores selected when fix_num_flag is true.

    Returns
    -------
    list of integer positions into arr, sorted ascending. Asking for more
    positions than arr holds returns all of them; asking for zero returns [].
    '''
    scores = np.asarray(arr).ravel()
    n_scores = scores.shape[0]

    def top_k(k):
        # Slice from an explicit start: a plain ``[-k:]`` would return every
        # position when k == 0 instead of none.
        k = max(0, min(int(k), n_scores))
        return np.argsort(scores)[n_scores - k:]

    #want fixed number sentences?
    if fix_num_flag:
        idx = top_k(fix_num)

    #return above model threshold only
    else:
        # np.where returns a tuple with one array per dimension; keep only the
        # position array so that every branch yields a flat set of indices.
        idx = np.where(scores >= thresh)[0]

        #filter for minimum number required
        if idx.shape[0] <= min_num:
            idx = top_k(min_num)

    #return in ascending order
    return sorted(idx)


###Main helper function    
def return_df_pred_summaries( Xy_doc_label, y_pred, df_text, thresh, min_num,
                             return_all=False, fix_num_flag=False, fix_num=3):

    '''Return the predicted summary of every document as a single string.

    Parameters
    ----------
    Xy_doc_label : array-like giving the document label of every sentence
        (flattened before use).
    y_pred : array-like of per-sentence scores aligned with Xy_doc_label
        (flattened before use).
    df_text : pandas Series looked up by document label; each value is
        expected to be the sequence of sentences of that document.
    thresh, min_num, fix_num_flag, fix_num : forwarded unchanged to
        return_greater_than_min_num to choose the sentences of each document.
    return_all : if true, also return the selected sentence positions and the
        document labels.

    Returns
    -------
    pred_summaries (list of str) when return_all is false, otherwise the tuple
    (selected positions as an (n_docs, 1) object array, document-label index,
    pred_summaries). Documents appear in ascending label order because the
    groupby sorts its keys.
    '''

    #Wrangle to doc label and flattened array of predictions for each article
    df_label_pred = pd.DataFrame({'doc_label': np.asarray(Xy_doc_label).ravel(),
                                  'y_pred': np.asarray(y_pred).ravel()})
    df_label_pred = df_label_pred.groupby('doc_label').agg(list)

    #get sorted index sentence numbers to include in article
    # (Series.apply instead of DataFrame.applymap, which pandas >= 2.1 deprecates)
    df_label_pred['y_pred'] = df_label_pred['y_pred'].apply(
        lambda doc_scores: return_greater_than_min_num(
            np.array(doc_scores), thresh=thresh, min_num=min_num,
            fix_num_flag=fix_num_flag, fix_num=fix_num))

    #Return predicted summary
          #index is doc label
    df_doc = df_text.loc[df_label_pred.index]

          # pick the selected sentences of each article and join them into one
          # string; both Series are in the same (doc label) order
    pred_summaries = []
    for sentences, keep_idx in zip(df_doc, df_label_pred['y_pred']):
        chosen = np.array(sentences)[keep_idx].tolist()
        pred_summaries.append(chosen if isinstance(chosen, str)
                              else ' '.join(chosen))

    if return_all:
        answer = df_label_pred.values, df_label_pred.index, pred_summaries
    else:
        answer = pred_summaries

    return answer

`calc_rouge_scores`: calculates average ROUGE scores across multiple predicted and gold summary pairs (the cells below import it from the `functions` module).

## Supervised Learning Using Only Embedding Information

 ### 1 Logistic Regression Models

In [None]:
import pickle
import pandas as pd
from functions import return_df_pred_summaries
from functions import calc_rouge_scores
from sklearn.linear_model import LogisticRegression

from datetime import datetime as dt

from sklearn.metrics import confusion_matrix

# Train a logistic-regression sentence classifier on the stored train/test
# split, build a top-3-sentence summary per test article, score the summaries
# against the gold summaries and pickle the results.

input_filename = 'train_test_set20_embeddings_only.pickle'

# One results file per model variant: keep the line that matches the model
# defined under "Define Model" uncommented.
#output_file =  'cv_results_top_3_logreg_default.pickle'
#output_file = 'cv_results_top_3_logreg_cw_balanced.pickle'
output_file = 'cv_results_top_3_logreg_elasticnet.pickle'


t1 = dt.now()
print(t1)

# NOTE: unpickling can execute arbitrary code - only load files you trust.
data_dict = pd.read_pickle(input_filename)

#Specify model inputs: df, X, y, doc_labels
df = data_dict['df_original']
train_test_set = data_dict['train_test_sets']
#Specify train-test_data for validation (first stored split)
Xy_doc_label_train = train_test_set[0][0]
Xy_doc_label_test = train_test_set[0][1]
X_train = train_test_set[0][2]
X_test = train_test_set[0][3]
y_train = train_test_set[0][4]
y_test = train_test_set[0][5]

#Define Model
# Variants used for the other output files:
#LogisticRegression(random_state=42)
#LogisticRegression(class_weight='balanced', random_state=42)
model = LogisticRegression(solver='saga', penalty='elasticnet',
                           l1_ratio=0.25, C=0.5, random_state=42)
#Fit model
# The labels arrive as a column vector, which made scikit-learn emit the
# "y = column_or_1d(y, warn=True)" conversion warning seen in the recorded
# output; pass them flattened to 1-D instead.
model.fit(X_train, y_train.ravel())
#Predict Model (column 1 holds the probability of the positive class)
y_pred = model.predict_proba(X_test)

#Convert to binary predictions
y_pred_bin = (y_pred >=0.5)*1

cm = confusion_matrix(y_test, y_pred_bin[:,1], labels=[0,1])

#Return predicted summaries
# fix_num_flag=True: always take the 3 highest-scoring sentences per article,
# so thresh/min_num are passed but not used for the selection.
idx, doc_index, pred_summaries = return_df_pred_summaries(Xy_doc_label_test,
                                y_pred[:,1], df.text_clean, thresh=0.5, min_num=1,
                                return_all = True, fix_num_flag=True, fix_num=3)

#Match with gold summaries
df_gold = df.summary_clean[doc_index]
gold_summaries = [' '.join(df_gold.iloc[j]) for j in range(len(pred_summaries))]
summaries_comp = tuple(zip(pred_summaries, gold_summaries))

scores = calc_rouge_scores(pred_summaries, gold_summaries,
                                  keys=['rouge1', 'rougeL'], use_stemmer=True)


results_dict ={'conf_matrix': cm, 'summaries_comp': summaries_comp,
               'sent_index_number': idx, 'Rouge': scores}

with open(output_file, 'wb') as handle:
    pickle.dump(results_dict, handle)

print(scores)

t2 = dt.now()
print(t2)
print(t2-t1)

2022-04-19 14:37:56.739148


  y = column_or_1d(y, warn=True)


{'rouge1': {'recall': 0.3959067671907968, 'precision': 0.5779447801266966, 'f1': 0.41302032283664225}, 'rougeL': {'recall': 0.35069379180120924, 'precision': 0.5178247108721542, 'f1': 0.36775040341776893}}
2022-04-19 14:43:19.155329
0:05:22.416181


 ### 2 Neural Net Models

In [None]:
import pickle
import pandas as pd
from functions import return_df_pred_summaries
from functions import calc_rouge_scores
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from sklearn.metrics import confusion_matrix

from datetime import datetime as dt

# Train a small feed-forward sentence classifier on the stored train/test
# split, build a top-3-sentence summary per test article, score the summaries
# against the gold summaries and pickle the results.

input_filename = 'train_test_set20_embeddings_only.pickle'

# One results file per network variant: keep the line that matches the layer
# sizes used under "Define Model" uncommented.
#output_file = 'cv_results_nn2525_embeddings_only_cw_top3_epochs50.pickle'
#output_file = 'cv_results_nn2550_embeddings_only_cw_top3_epochs50.pickle'
output_file = 'cv_results_nn5050_embeddings_only_cw_top3_epochs50.pickle'


t1 = dt.now()
print(t1)

# NOTE: unpickling can execute arbitrary code - only load files you trust.
data_dict = pd.read_pickle(input_filename)

#Specify model inputs: df, X, y, doc_labels
df = data_dict['df_original']
train_test_set = data_dict['train_test_sets']
#Specify train-test_data for validation (first stored split)
Xy_doc_label_train = train_test_set[0][0]
Xy_doc_label_test = train_test_set[0][1]
X_train = train_test_set[0][2]
X_test = train_test_set[0][3]
y_train = train_test_set[0][4]
y_test = train_test_set[0][5]

#class_weights for imbalanced data
# pos_w is the (truncated) ratio of all training sentences to positive ones;
# the positive class is weighted by half of that ratio.
n_positive = (y_train == 1).sum(axis=0)[0]   # vectorised count of the label column
pos_w = int(y_train.shape[0] / n_positive)
weight_dict = {0:1, 1: pos_w/2}

#Define Model
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
n_features = X_train.shape[1]   # 1536 for the embeddings-only input file
model = Sequential()
model.add(Dense(50, input_dim=n_features, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#Compile Model
model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=[tf.keras.metrics.SensitivityAtSpecificity(0.5, num_thresholds=1),
             tf.keras.metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)])
#Fit Model
history = model.fit(X_train, y_train, epochs=50, batch_size=32,
                          callbacks=[callback], class_weight=weight_dict)
#Predict Model
y_pred = model.predict(X_test)

#Convert to binary predictions
y_pred_bin = (y_pred >=0.5)*1

cm = confusion_matrix(y_test, y_pred_bin, labels=[0,1])


#Return predicted summaries
# fix_num_flag=True: always take the 3 highest-scoring sentences per article,
# so thresh/min_num are passed but not used for the selection.
idx, doc_index, pred_summaries = return_df_pred_summaries(Xy_doc_label_test,
                                y_pred, df.text_clean, thresh=0.5, min_num=1,
                                return_all = True, fix_num_flag=True, fix_num=3)

# Alternative kept for reference: use the first three sentences of each article
#pred_summaries = [' '.join(df.text[doc_index].iloc[j][:3]) for j in range(len(idx))]

#Match with gold summaries
df_gold = df.summary_clean[doc_index]
gold_summaries = [' '.join(df_gold.iloc[j]) for j in range(len(pred_summaries))]
summaries_comp = tuple(zip(pred_summaries, gold_summaries))

scores = calc_rouge_scores(pred_summaries, gold_summaries,
                                  keys=['rouge1', 'rougeL'], use_stemmer=True)

# model.summary() only prints and returns None, so storing its return value
# would pickle None. Capture the printed text through print_fn instead (extra
# arguments some Keras versions pass to print_fn are accepted and ignored),
# then echo it so the cell output still shows the architecture.
summary_lines = []
model.summary(print_fn=lambda line, *args, **kwargs: summary_lines.append(line))
model_summary = '\n'.join(summary_lines)
print(model_summary)

results_dict ={'conf_matrix': cm, 'summaries_comp': summaries_comp,
               'sent_index_number': idx, 'Rouge': scores, 'mod_summary': model_summary}

with open(output_file, 'wb') as handle:
    pickle.dump(results_dict, handle)

print(scores)

t2 = dt.now()
print(t2)
print(t2-t1)

2022-04-19 15:57:01.256561
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                76850     
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                

 ### 3 TextRank


In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from datetime import datetime as dt

# Compute a TextRank (PageRank over the sentence-similarity graph) score for
# every sentence of every document and store the per-document score arrays in
# the data dictionary, which is then written to a new pickle file.

t1 = dt.now()
print(t1)

output_file = 'train_stats_dict_processed_extr_final_5000_inc_pagerank.pickle' 
input_file = 'train_stats_dict_processed_extr_final_5000_.pickle' 
# NOTE: unpickling can execute arbitrary code - only load files you trust.
data = pd.read_pickle(input_file )

#Select sentence embeddings only and match to doc label
df_embed = data['df_X'].loc[:,'Sent_BERT_D_0': 'Sent_BERT_D_767']
df_doc_label = pd.DataFrame(data['Xy_doc_label_array'],columns=['doc_label'])
df = pd.concat([df_doc_label, df_embed], axis=1)

#loop through articles (docs); labels are taken to run from 0 to the maximum
pagerank_scores_list=[]
error_list = []
doc_num = np.max(data['Xy_doc_label_array']) 
for j in range(doc_num+1):

    #calculate cosine similiarity matrix 
    # Drop the label column by name so that every embedding dimension is used;
    # a positional slice starting at column 2 would also discard Sent_BERT_D_0.
    df_doc = df[df.doc_label == j].drop(columns='doc_label')
    n = df_doc.shape[0]
    cos_matrix = cosine_similarity(df_doc, df_doc)
    # zero the diagonal so that no sentence is linked to itself
    cos_matrix = cos_matrix * (1 - np.eye(n, n))

    #Convert to nx graph
    graph = nx.from_numpy_array(cos_matrix)

    #Calculate sentence scores and record error docs
    try:
        scores_arr = np.array(list(nx.pagerank(graph, max_iter=500).values()))
    except Exception:
        # Best effort: keep going and mark the document with NaN. Catching
        # Exception (not a bare except) lets Ctrl-C / kernel interrupts through.
        scores_arr = np.nan
        error_list.append(j)

    pagerank_scores_list.append(scores_arr)

# Documents have different numbers of sentences, so build the 1-D object array
# explicitly: np.array() on a ragged list raises on recent NumPy versions and
# would silently produce a 2-D array if all documents had equal length.
pagerank_scores_arr = np.empty(len(pagerank_scores_list), dtype=object)
for pos, doc_scores in enumerate(pagerank_scores_list):
    pagerank_scores_arr[pos] = doc_scores

print(f'{len(error_list)} documents could not be scored')

#store in primary dictionary
data.update({'textrank_scores_arr_per_doc':pagerank_scores_arr })

#save to pickle
with open(output_file, 'wb') as handle:
    pickle.dump(data, handle)

t2=dt.now()
print(t2)
print(t2-t1)

#runtime 4mins50sec for 5000 docs / 29 errors

In [None]:
import pandas as pd
import numpy as np
import pickle
from functions import calc_rouge_scores


# Evaluate TextRank on the test documents: take the three highest-scoring
# sentences of each article as its summary, score them against the gold
# summaries and pickle the results together with the data dictionary.

input_textrank = 'train_stats_dict_processed_extr_final_5000_inc_pagerank.pickle'
input_test_labels = 'train_test_set20_embeddings_only.pickle'
output_file = 'cv_results_textrank.pickle'

# NOTE: unpickling can execute arbitrary code - only load files you trust.
data = pd.read_pickle(input_textrank )

# document labels of the test sentences of the first stored split
test_split = pd.read_pickle(input_test_labels)
test_labels = set(test_split['train_test_sets'][0][1].flatten())

#original df with columns including article / summary text
df = data['df_original']
#add pagerank scores to df (this also adds the column to data['df_original'])
df['textrank_scores'] = data['textrank_scores_arr_per_doc']
#filter for test set
df = df[df.index.isin(test_labels)]
#drop where textrank had errors
# Restrict the NaN check to the score column so that a missing value in an
# unrelated column does not remove a document from the evaluation; copy so the
# column added below is written to an independent frame.
df = df.dropna(subset=['textrank_scores']).copy()

#pick top3 sentence by textrank score (positions returned in ascending order)
df['idx'] = df['textrank_scores'].apply(lambda x: sorted(np.argsort(x)[-3:]))
idx_arr = df['idx'].values

#convert list of sentences to string for each predicted summary
pred_summaries = [' '.join(np.array(df.text_clean.iloc[j])[idx_arr[j]].tolist())
                  for j in range(len(idx_arr))]

#convert cleaned gold summary sentence lists to string for each summary
df_gold = df.summary_clean
gold_summaries = [' '.join(df_gold.iloc[j]) for j in range(len(pred_summaries))]

#zip each predicted / gold summary pair together and store in another tuple
summaries_comp = tuple(zip(pred_summaries, gold_summaries))

#calculate rouge scores
scores = calc_rouge_scores(pred_summaries, gold_summaries,
                                  keys=['rouge1', 'rougeL'], use_stemmer=True)
#store results in dict
results_dict = {'Rouge': scores, 'doc_labels': df.index.tolist(),
                'summaries_comp': summaries_comp}
#add to primary dict
data.update(results_dict)

#save to pickle
with open(output_file, 'wb') as handle:
    pickle.dump(data, handle)
