Code is partly adapted from work by Anushka Prakash and Harish Tayyar Madabushi.
2020. Incorporating count-based features into pre-trained models for improved stance detection. arXiv preprint 
arXiv:2010.09078.https://github.com/Anushka-Prakash/RumourEval-2019-Stance-Detection/

Additionally code is adapted and influenced from HuggingFace.

# MLP

In [3]:
#Importing necessary libraries

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy

import torch

from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer

from torch                            import nn, optim
from torch.utils                      import data

#Seeding for deterministic results
RANDOM_SEED = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
HIDDEN_LAYER_UNITS = 128

CLASS_NAMES = ['scam', 'not_scam']
EPOCHS      = 55

## Read and Process Scam Emails

In [4]:
filepath = "/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset/fradulent_emails.txt"
with open(filepath, 'r',encoding="latin1") as file:
    data = file.read()
    
fraud_emails = data.split("From r")

print("Successfully loaded {} spam emails!".format(len(fraud_emails)))

Successfully loaded 3978 spam emails!


In [5]:
import email

def extract_messages(df):
    messages = []
    subject = []
    for item in df["message"]:
        e = email.message_from_string(item)    
        # get message body  
        message_body = e.get_payload()
        message_subject = e.get('Subject')
        messages.append(message_body)
        subject.append(message_subject)
    print("Successfully retrieved message body from e-mails!")
    d = {'emailText':messages,'subject':subject}
    return d

In [6]:
fraud_bodies = extract_messages(pd.DataFrame(fraud_emails,columns=["message"],dtype=str))
#fraud_bodies_df = pd.DataFrame(fraud_bodies[1:], columns=['emailText'])
fraud_bodies_df = pd.DataFrame(fraud_bodies, columns=['emailText', 'subject'])
fraud_bodies_df['label'] = 'scam'
fraud_bodies_df['labelValue'] = '1'
fraud_bodies_df.drop([0], inplace = True)
fraud_bodies_df.reset_index(drop=True, inplace = True)
fraud_bodies_df

Successfully retrieved message body from e-mails!


Unnamed: 0,emailText,subject,label,labelValue
0,FROM:MR. JAMES NGOLA.\nCONFIDENTIAL TEL: 233-2...,URGENT BUSINESS ASSISTANCE AND PARTNERSHIP,scam,1
1,"Dear Friend,\n\nI am Mr. Ben Suleman a custom ...",URGENT ASSISTANCE /RELATIONSHIP (P),scam,1
2,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,GOOD DAY TO YOU,scam,1
3,FROM HIS ROYAL MAJESTY (HRM) CROWN RULER OF EL...,GOOD DAY TO YOU,scam,1
4,"Dear sir, \n \nIt is with a heart full of hope...",I Need Your Assistance.,scam,1
...,...,...,...,...
3972,"Atten: My Dear ,\n \nI have Paid the fee for y...",=?iso-8859-1?Q?CONTACT=20GLOBAL=20MAX=20SHIPIN...,scam,1
3973,"[[Content-Type, Content-Transfer-Encoding], [C...",TREAT AS URGENT.,scam,1
3974,"[[Content-Type, Content-Transfer-Encoding], [C...",From Dr Usman Ibrahim / Mr Wahid Yoffe property.,scam,1
3975,"\nBeloved in the Lord Jesus Christ, PLEASE END...",My Beloved In Christ.,scam,1


## Read and Process Benign Emails

In [7]:
filepath = "/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset/emails.csv"

emails = pd.read_csv(filepath)

print("Successfully loaded {} rows and {} columns!".format(emails.shape[0], emails.shape[1]))
print(emails.head())

Successfully loaded 517401 rows and 2 columns!
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [8]:
print(emails.loc[0]["message"])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


In [9]:
import email

def extract_messages(df):
    messages = []
    subject = []
    for item in df["message"]:
        e = email.message_from_string(item)    
        # get message body  
        message_body = e.get_payload()
        message_subject = e.get('Subject')
        messages.append(message_body)
        subject.append(message_subject)
    print("Successfully retrieved message body from e-mails!")
    d = {'emailText':messages,'subject':subject}
    return d

In [10]:
import random

bodies_df = extract_messages(pd.DataFrame(emails, columns=["message"],dtype=str))
bodies_df = pd.DataFrame(bodies_df, columns=['emailText', 'subject'])

bodies_df['label'] = 'not_scam'
bodies_df['labelValue'] = '0'
pd.set_option('display.max_colwidth', 300)

bodies_df = bodies_df.sample(n = 4000)

Successfully retrieved message body from e-mails!


In [11]:
bodies_df.reset_index(drop=True, inplace=True)
bodies_df

Unnamed: 0,emailText,subject,label,labelValue
0,"---------------------- Forwarded by Richard Shapiro/NA/Enron on 02/26/2001 \n08:55 AM ---------------------------\nFrom: James D Steffes on 02/26/2001 08:07 AM\nTo: Robert Frank/NA/Enron@Enron, Richard Shapiro/NA/Enron@Enron\ncc: \n\nSubject: February 22, 2001\n\n\n----- Forwarded by James D St...","February 22, 2001",not_scam,0
1,"Richard,\n\nThanks for the voicemail and email. Congrats to you also. Hopefully we will \nget a chance to work together more often. I'll give you a call on my next \ntrip to London.\n\nMike\n\n\n\n\n\nRichard Harper\n08/01/2000 02:52 AM\nTo: Mike McConnell\ncc: \nSubject: Congratulations.\n\...",Re: Congratulations.,not_scam,0
2,"Jacques,\n\nHere is Larry Lewter's response to my request for more documentation to \nsupport the $15,000. As you will read below, it is no longer an issue. I \nthink that was the last issue to resolve. \n\nPhillip\n\n\n---------------------- Forwarded by Phillip K Allen/HOU/ECT on 03/19/2001 ...",RE: Buyout,not_scam,0
3,"\nNYSE Moves To Delist Enron Stock\nDow Jones News Service, 01/15/2002\nNYSE says delisting Enron shares - UPDATE\nAFX News, 01/15/2002\n\nWriter Of Enron Warning Letter Worked With Ex-CFO Fastow\nDow Jones Energy Service, 01/15/2002\n\nEnron DIP Financing Now Likely Halved To About $700M\nDow J...",Enron Mentions -- 01/15/02,not_scam,0
4,"\n =09 Markets =09 [IMAGE] =09DJ30 =09 =099,908.16=09 =09-68.29=09 =09-0.68%=\n =09 [IMAGE] =09NASDAQ =09 =091,910.94=09 =09-23.47 =09 =09-1.21%=09 [IMA=\nGE] =09S&P 500 =09 =091,145.18=09 =09-5.87=09 =09-0.50%=09 [IMAGE] =09US m=\narkets close in 4:27 =09=09=09=09=09\n=09=09=09=09=09=09=09=...","(ClearStation) : Portfolio Update, Tue Nov 20 2001",not_scam,0
...,...,...,...,...
3995,"\nWe will have our usual conference call this coming Monday @11:00AM (CST). For those of you in Houston, we will be in room 4793. The dial-in details are listed below:\n\nDial-in number: 888/422-7141\nHost/Participant Code: 203099\nInternatioanl Dial-in number: 334/832-4208\n\nMona Petrochk...",EBS GA Conference Call,not_scam,0
3996,It is my understanding that this letter will be sent to the CPUC today. I \ndon't have any reservations with the content or format.\n\nJim\n\n\n----- Forwarded by James D Steffes/NA/Enron on 11/20/2000 08:05 AM -----\n\n\tRichard B Sanders@ECT\n\t11/20/2000 07:51 AM\n\t\t \n\t\t To: James D Ste...,Draft letter to the CPUC,not_scam,0
3997,"---------------------- Forwarded by Dan Masters/HOU/ECT on 03/07/2001 09:37 \nAM ---------------------------\n\n\nDan Masters\n03/05/2001 09:34 AM\nTo: ""Joseph McKechnie"" <jmckechnie@cabotlng.com> @ ENRON\ncc: ""Christopher Skinner"" <cskinner@cabotlng.com>@ENRON, \nJaime.Sanabria@enron.com@ENRON,...","Re: Feb 12 delivery of ""Methane Arctic""",not_scam,0
3998,"FYI - to keep you informed of the ongoing changes with the physical gas monetization issues. Botttom line - I believe that the remaining amount will be very small if not zero. The atttorney's are sending me names but the ones Barbara mentioned on the phone are the large ones; Aquila, Sempra, Du...","FW: Master Netting Agreement Assignments...Identification of legal\n entities comprising your assigned ""houses""",not_scam,0


In [12]:
not_scam_emails = bodies_df.copy()
scam_emails = fraud_bodies_df.copy()

In [13]:
frames = [not_scam_emails, scam_emails]

resultDf = pd.concat(frames)                                                      #Concatenating
resultDf  = resultDf.replace(np.nan, '', regex=True)

In [14]:
resultDf

Unnamed: 0,emailText,subject,label,labelValue
0,"---------------------- Forwarded by Richard Shapiro/NA/Enron on 02/26/2001 \n08:55 AM ---------------------------\nFrom: James D Steffes on 02/26/2001 08:07 AM\nTo: Robert Frank/NA/Enron@Enron, Richard Shapiro/NA/Enron@Enron\ncc: \n\nSubject: February 22, 2001\n\n\n----- Forwarded by James D St...","February 22, 2001",not_scam,0
1,"Richard,\n\nThanks for the voicemail and email. Congrats to you also. Hopefully we will \nget a chance to work together more often. I'll give you a call on my next \ntrip to London.\n\nMike\n\n\n\n\n\nRichard Harper\n08/01/2000 02:52 AM\nTo: Mike McConnell\ncc: \nSubject: Congratulations.\n\...",Re: Congratulations.,not_scam,0
2,"Jacques,\n\nHere is Larry Lewter's response to my request for more documentation to \nsupport the $15,000. As you will read below, it is no longer an issue. I \nthink that was the last issue to resolve. \n\nPhillip\n\n\n---------------------- Forwarded by Phillip K Allen/HOU/ECT on 03/19/2001 ...",RE: Buyout,not_scam,0
3,"\nNYSE Moves To Delist Enron Stock\nDow Jones News Service, 01/15/2002\nNYSE says delisting Enron shares - UPDATE\nAFX News, 01/15/2002\n\nWriter Of Enron Warning Letter Worked With Ex-CFO Fastow\nDow Jones Energy Service, 01/15/2002\n\nEnron DIP Financing Now Likely Halved To About $700M\nDow J...",Enron Mentions -- 01/15/02,not_scam,0
4,"\n =09 Markets =09 [IMAGE] =09DJ30 =09 =099,908.16=09 =09-68.29=09 =09-0.68%=\n =09 [IMAGE] =09NASDAQ =09 =091,910.94=09 =09-23.47 =09 =09-1.21%=09 [IMA=\nGE] =09S&P 500 =09 =091,145.18=09 =09-5.87=09 =09-0.50%=09 [IMAGE] =09US m=\narkets close in 4:27 =09=09=09=09=09\n=09=09=09=09=09=09=09=...","(ClearStation) : Portfolio Update, Tue Nov 20 2001",not_scam,0
...,...,...,...,...
3972,"Atten: My Dear ,\n \nI have Paid the fee for your Cheque Draft.Because the manager of EcoBank\nBenin told me that before the check will get to you that it willexpire.\nSo i told him to cash $850,000.00 however all the necessary arrangement\nof delivering the $850,000.00 in cash was made with G...",=?iso-8859-1?Q?CONTACT=20GLOBAL=20MAX=20SHIPING=20COMPANY?=,scam,1
3973,"[[Content-Type, Content-Transfer-Encoding], [Content-Type, Content-Transfer-Encoding]]",TREAT AS URGENT.,scam,1
3974,"[[Content-Type, Content-Transfer-Encoding], [Content-Type, Content-Transfer-Encoding]]",From Dr Usman Ibrahim / Mr Wahid Yoffe property.,scam,1
3975,"\nBeloved in the Lord Jesus Christ, PLEASE ENDEAVOUR TO USE IT FOR THE\nCHILDREN OF GOD.\n\nMy name is Mother Doris Killam 63years old woman from United States of\nAmerica. I am married to Engineer Pitt Killam who till his death Worked\nwith Willbros, a U.S oil Engineering firm here in Nigeria, ...",My Beloved In Christ.,scam,1


In [15]:
from pandas.api.types import is_list_like
def clean_email_data(df):
  for index, row in df.iterrows():
    if is_list_like(row.emailText):
      df.drop([index], inplace = True)

clean_email_data(resultDf)

In [16]:
resultDf["labelValue"] = resultDf["labelValue"].astype(str).astype(int)
resultDf

Unnamed: 0,emailText,subject,label,labelValue
0,"---------------------- Forwarded by Richard Shapiro/NA/Enron on 02/26/2001 \n08:55 AM ---------------------------\nFrom: James D Steffes on 02/26/2001 08:07 AM\nTo: Robert Frank/NA/Enron@Enron, Richard Shapiro/NA/Enron@Enron\ncc: \n\nSubject: February 22, 2001\n\n\n----- Forwarded by James D St...","February 22, 2001",not_scam,0
1,"Richard,\n\nThanks for the voicemail and email. Congrats to you also. Hopefully we will \nget a chance to work together more often. I'll give you a call on my next \ntrip to London.\n\nMike\n\n\n\n\n\nRichard Harper\n08/01/2000 02:52 AM\nTo: Mike McConnell\ncc: \nSubject: Congratulations.\n\...",Re: Congratulations.,not_scam,0
2,"Jacques,\n\nHere is Larry Lewter's response to my request for more documentation to \nsupport the $15,000. As you will read below, it is no longer an issue. I \nthink that was the last issue to resolve. \n\nPhillip\n\n\n---------------------- Forwarded by Phillip K Allen/HOU/ECT on 03/19/2001 ...",RE: Buyout,not_scam,0
3,"\nNYSE Moves To Delist Enron Stock\nDow Jones News Service, 01/15/2002\nNYSE says delisting Enron shares - UPDATE\nAFX News, 01/15/2002\n\nWriter Of Enron Warning Letter Worked With Ex-CFO Fastow\nDow Jones Energy Service, 01/15/2002\n\nEnron DIP Financing Now Likely Halved To About $700M\nDow J...",Enron Mentions -- 01/15/02,not_scam,0
4,"\n =09 Markets =09 [IMAGE] =09DJ30 =09 =099,908.16=09 =09-68.29=09 =09-0.68%=\n =09 [IMAGE] =09NASDAQ =09 =091,910.94=09 =09-23.47 =09 =09-1.21%=09 [IMA=\nGE] =09S&P 500 =09 =091,145.18=09 =09-5.87=09 =09-0.50%=09 [IMAGE] =09US m=\narkets close in 4:27 =09=09=09=09=09\n=09=09=09=09=09=09=09=...","(ClearStation) : Portfolio Update, Tue Nov 20 2001",not_scam,0
...,...,...,...,...
3968,"\n\nAttention,\n\nDo not be offended if by any way this email is offensive to your personal\nbeing; well,my name is John Davids, I work with a reputable organisation =\nhere\nin London-England. I am contacting you based on investing certain amount =\nof\nmonies on properties allocation. For the ...",Re: eMAIL From John Davids,scam,1
3970,"Greetings,\n\nI am Fernando Gonzalez. I represent a top executive in Spain. \nI have a very sensitive and private brief to ask for your partnership\nto re-profile HUGE AMOUNT of funds runnning into Millions of Euros. \n\nI will give the details, but in summary, the funds are coming via a\nbank ...","REF: 14TH SEPT., 2007",scam,1
3971,"Dear Friend,\n=20\nI, Dr. Jackson Gaius-Obaseki, the formal Group Managing=20\nDirector of Nigerian National Petroleum Corporation(NNPC), in=20\nconjunction with my colleague, Engr. Funsho Kupolokun also formal GMD=20\nNNPC, write this letter to you as an introduction and extention of our=20\nh...",From: Dr. Jackson Gaius-Obaseki,scam,1
3972,"Atten: My Dear ,\n \nI have Paid the fee for your Cheque Draft.Because the manager of EcoBank\nBenin told me that before the check will get to you that it willexpire.\nSo i told him to cash $850,000.00 however all the necessary arrangement\nof delivering the $850,000.00 in cash was made with G...",=?iso-8859-1?Q?CONTACT=20GLOBAL=20MAX=20SHIPING=20COMPANY?=,scam,1


In [17]:
trainDf, devDf, testDf = np.split(resultDf.sample(frac=1, random_state=42), 
                       [int(.6*len(resultDf)), int(.8*len(resultDf))])

In [18]:
testDf

Unnamed: 0,emailText,subject,label,labelValue
3162,"Greetings \n \nI am Barr Charles Brown, a Canadian attorney based in \nManchester, United Kingdom and the personal attorney to Late Mr. Mark \nMichelle, a French National. Late Mr. Mark Michelle until his untimely \ndeath was a private oil consultant/contractor with the British \nPetroleum Compa...",I AWAIT YOUR RESPONSE,scam,1
72,"LADY MARYAM ABACHA \nABACHA COURT \nGIDADO ROAD \nKANO - NIGERIA \n\nATTENTION:C.E.O \n\nI am lady Maryam Abacha, wife of the late General Sani \nAbacha,ex-military Head of State of the Federal Republic of Nigeria who \ndied on the 8th of June 1998 of heart problems.\n\nI contacted you because o...",URGENT ASSISTANCE,scam,1
1721,"\nBarclays Bank Plc \n\n65/66 St. Mary Axe \n\nLondon EC3A 8LE\n\n\n\nI am Mr. Gary Adams, Senior Credit Officer, Barclays Bank Plc London. I am writing following an opportunity in my office that will be of immense benefit to both of us.\n\n\n\nIn my department we discovered an abandoned sum of ...",Need to hear from you please -From Gary Adams,scam,1
2006,"Eric Wong.\nUnion Bank Of China,\n99 Queens Road Central\nHong Kong,\n\nDear Friend,\n\nI write you in good faith, hoping that you will treat this letter as a\ndesperate search for assistance in a matter that shall benefit both\nofus.A random search based on your locality from the Internet produ...",feedback,scam,1
3446,"Greetings!\n\nDear Sir,\n\nGreetings and compliement of the season!\n\nIn due consideration and trust,I come to you for a profitable business\ntransaction that might interest you.My Name is Zedex Calvaho from the\nfamous Calvaho Family.Calvaho Family is a Polygamous Family and the 6th\nRichest F...",Greetings,scam,1
...,...,...,...,...
790,>From princess ifeoma\nThe Palace of King of Ogoni Kingdom=2C\nOgoni Oil producing community=2C\nRivers State Nigeria=2E\nPRINCESS Email ADDRESS=3Aprincess=5Fifeoma201=40hotmail=2Ecom\n\nMy lawyer phone number\n234-8033072775\nEmail=3A him for more information please\n\n\nDear Sir=2C I am Prince...,GOOD DAY,scam,1
2593,"FROM: MR CHEUNG PUI\n\ncheugpuixx7@yahoo.com.hk \n\nDear Friend,\n\nLet me start by introducing myself. I am Mr. Cheung Pui director of \noperations of the Hang Seng Bank Ltd.I have an obscured business \nsuggestion for you.\n\nBefore the U.S and Iraqi war our client ""Major Fadi Basem who was \...",HELLO SIR,scam,1
2655,"Dear,\n\nI am the head of Accounts and Audit Department of Bank of Africa,\nOuagadougou . I decided to contact you after a careful thought that you may \nbe capable of handling this business transaction which i explained below;\n\nIn my department we discovered an abandoned sum of $10.5m US doll...",DR .ABU TOPIA,scam,1
2881,"\n\nMy Dear ,\n\nI'm glad to inform you about my success in getting those funds transferred under\nthe cooperation of a new partner from bangkok.\n Presently i'm in bangkok for investment projects with my own share of the total\nsum. mean while,i never forget your past efforts and attempts to as...",this is for you,scam,1


In [19]:
trainDf['TextSrcInre'] = trainDf['subject'].str.cat(trainDf['emailText'],sep=" ")
devDf['TextSrcInre']   = devDf['subject'].str.cat(devDf['emailText'],sep=" ")
testDf['TextSrcInre']  = testDf['subject'].str.cat(testDf['emailText'],sep=" ")
trainDf

Unnamed: 0,emailText,subject,label,labelValue,TextSrcInre
282,Jeff wouldn't be able to leave until late the 23d or early on the 24th and \nwould need to be back by late on the 27th,,not_scam,0,Jeff wouldn't be able to leave until late the 23d or early on the 24th and \nwould need to be back by late on the 27th
1335,"Mark,\nI would appreciate your help in nudging an Online FERC filing. Our traders\nfeel somewhat hamstrung that they can't use Enron Online because of affiliate\nrestrictions in our tariffs. The concern is particularly heightened as we\nplan for the uncertaintly in the markets this summer. A ...",PGE Access to Enron Online - FERC Filing,not_scam,0,"PGE Access to Enron Online - FERC Filing Mark,\nI would appreciate your help in nudging an Online FERC filing. Our traders\nfeel somewhat hamstrung that they can't use Enron Online because of affiliate\nrestrictions in our tariffs. The concern is particularly heightened as we\nplan for the unc..."
3103,"GRAND FINANCE AND TRUST BANK.\nRUE CLUB DE L' AMITIE,\n0251 B.P. 1625 \nCOTONOU,\nBENIN REPUBLIC.\nTELEX: 5394\nSWIFT: GRAFITR CTNOU\n \n Dear friend,\n \n RE: TRANSFER OF US$28.5MILLION.\n\nCompliments Of The \nDay, Permit me t...",Greetings from Mr.Johnson,scam,1,"Greetings from Mr.Johnson GRAND FINANCE AND TRUST BANK.\nRUE CLUB DE L' AMITIE,\n0251 B.P. 1625 \nCOTONOU,\nBENIN REPUBLIC.\nTELEX: 5394\nSWIFT: GRAFITR CTNOU\n \n Dear friend,\n \n RE: TRANSFER OF US$28.5MILLION.\n\nCompliments..."
1147,FROM=3A Dr=2E John Mikado=2E\nDirector of Projects=2C DME=2C South Africa=2E\n\nI am Dr=2E John Mikado=2C Director of projects Implementation South Africa Department of Minerals & Energy=2E First and foremost=2C I apologized using this medium to reach you for a transaction =2F business of this ...,"Partnership Assistance, Please Treat ASAP.",scam,1,"Partnership Assistance, Please Treat ASAP. FROM=3A Dr=2E John Mikado=2E\nDirector of Projects=2C DME=2C South Africa=2E\n\nI am Dr=2E John Mikado=2C Director of projects Implementation South Africa Department of Minerals & Energy=2E First and foremost=2C I apologized using this medium to reach ..."
3375,"Caspar J. Justin Solicitors\n86 Market Place, Mansfield NG10 7HR,United Kingdom\nTel: +44-701-114-9174 Fax: +44-871-264-3736 \nEmail: caspar_j_justin@LawDr.net\n \n \n\n\nDear, \n\n ...",Re: Next of Kin,scam,1,"Re: Next of Kin Caspar J. Justin Solicitors\n86 Market Place, Mansfield NG10 7HR,United Kingdom\nTel: +44-701-114-9174 Fax: +44-871-264-3736 \nEmail: caspar_j_justin@LawDr.net\n \n \n\n\nDe..."
...,...,...,...,...,...
2590,"Keith,\n\nCould you please fed ex our mail to us on this Tuesday\n(and include Monday's mail) for 2nd day delivery. \nWe're in St. Louis and the address and credit card is\nbelow.\n\nShanna Halperin\nc/o Laurie Chod\n159 Ladue Farm Rd\nSt. Louis, MO 63141\n#314-434-4860\n\nAX 3713 834545 61029, ...",misc,not_scam,0,"misc Keith,\n\nCould you please fed ex our mail to us on this Tuesday\n(and include Monday's mail) for 2nd day delivery. \nWe're in St. Louis and the address and credit card is\nbelow.\n\nShanna Halperin\nc/o Laurie Chod\n159 Ladue Farm Rd\nSt. Louis, MO 63141\n#314-434-4860\n\nAX 3713 834545 61..."
1976,"\n\nSue Mara\nEnron Corp.\nTel: (415) 782-7802\nFax:(415) 782-7854\n----- Forwarded by Susan J Mara/NA/Enron on 06/26/2001 10:24 PM -----\n\n\t""Fairchild, Tracy"" <tracy.fairchild@edelman.com>\n\t06/26/2001 05:04 PM\n\t\t \n\t\t To: ""Allen, Stevan"" <stevan.allen@edelman.com>, AReM <arem@electric....","First Story from DA News Conference is from Bloomberg News: ""Cali\n\tfornia Power Users Should Have Provider Choice""",not_scam,0,"First Story from DA News Conference is from Bloomberg News: ""Cali\n\tfornia Power Users Should Have Provider Choice"" \n\nSue Mara\nEnron Corp.\nTel: (415) 782-7802\nFax:(415) 782-7854\n----- Forwarded by Susan J Mara/NA/Enron on 06/26/2001 10:24 PM -----\n\n\t""Fairchild, Tracy"" <tracy.fairchild@..."
1446,MR ADU KOFFI=2C\nAKWABA CHAMBERS\nFLAT 213=2CTESHIE ESTATE\nACCRA GHANA=2E\n\nDear=2C\nI am Mr=2E Koffi Adu=2C a lawyer=2Fconsultant for an African eminent personal who prefer to remain anonymous=2E He has shown proves to me=2C which I verified professionally that his claims are true=2C sincere...,YOUR ASSISTANCE PLEASE,scam,1,YOUR ASSISTANCE PLEASE MR ADU KOFFI=2C\nAKWABA CHAMBERS\nFLAT 213=2CTESHIE ESTATE\nACCRA GHANA=2E\n\nDear=2C\nI am Mr=2E Koffi Adu=2C a lawyer=2Fconsultant for an African eminent personal who prefer to remain anonymous=2E He has shown proves to me=2C which I verified professionally that his cla...
3165,"\n\n\n\nCHINA MERCHANT BANK.\nDES BOUEX RD. BRANCH,\nCENTRAL HONG KONG,\nHONK KONG. \n\n\nNOTIFICATION AS NEXT OF KIN/BENEFICIARY \n\nDATE: 18/10/2006\n\nI am Mr.Joseph Yun, credit officer of the China Merchant Bank.I have a concealed business suggestion for \nyou.\n\n\nBefore the U.S and Iraqi...",GOOD DAY,scam,1,"GOOD DAY \n\n\n\nCHINA MERCHANT BANK.\nDES BOUEX RD. BRANCH,\nCENTRAL HONG KONG,\nHONK KONG. \n\n\nNOTIFICATION AS NEXT OF KIN/BENEFICIARY \n\nDATE: 18/10/2006\n\nI am Mr.Joseph Yun, credit officer of the China Merchant Bank.I have a concealed business suggestion for \nyou.\n\n\nBefore the U.S ..."


In [20]:
x_train = trainDf['TextSrcInre'].tolist()
y_train = trainDf['labelValue'].tolist()


x_dev  = devDf['TextSrcInre'].tolist()
y_dev  = devDf['labelValue'].tolist()

x_test = testDf['TextSrcInre'].tolist()
y_test = testDf['labelValue'].tolist()

#Instantiating TfidfVectorizer object and fitting it on the training set
tfidf         = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))
x_train_feats = tfidf.fit(x_train)

print(x_train_feats)
print(len(x_train_feats.get_feature_names_out()))

x_train_transform = x_train_feats.transform(x_train)
#Converting the TF-IDF matrix to tensor
tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(x_train_transform)).float()
print(x_train_transform.shape)

#Tranforming the development and test data to tf-idf matrix
x_dev  = tfidf.transform(x_dev)
x_test = tfidf.transform(x_test)

x_dev  = torch.tensor(scipy.sparse.csr_matrix.todense(x_dev)).float()
x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()

TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
21759
(3735, 21759)


In [21]:
#Converting prections for train, dev and test data to tensors
y_train = torch.tensor(y_train)
y_dev   = torch.tensor(y_dev)
y_test  = torch.tensor(y_test)

In [22]:
class Tfidf_Nn(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(len(tfidf.get_feature_names_out()), HIDDEN_LAYER_UNITS)
        # Output layer
        self.output  =  nn.Linear(HIDDEN_LAYER_UNITS, len(CLASS_NAMES))
        self.dropout = nn.Dropout(0.1)
        
        # Defining tanh activation and softmax output 
        self.tanh    = nn.Tanh()                                     #Using tanh as it performed better than ReLu during hyper-param optimisation
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of the below operations
        x = self.hidden(x)
        #print(x.shape)
        y = self.tanh(x)
        #print(y.shape)
        z = self.dropout(y)
        #print(z.shape)
        z = self.output(z)
        #print(z.shape)
        z = self.softmax(z)
        
        #returning the output from hidden layer and the output layer
        return  y, z

In [23]:
#Defining the model
model = Tfidf_Nn()

weights       = [10.0, 10.0]
class_weights = torch.FloatTensor(weights)
criterion     = nn.CrossEntropyLoss(weight = class_weights)


# Forward pass, get our logits
hidden_state_output, classfier_output = model(tfidf_transform_tensor)
print(classfier_output)
print(classfier_output[0].shape)

loss = criterion(classfier_output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.02)

tensor([[0.5224, 0.4776],
        [0.5220, 0.4780],
        [0.5223, 0.4777],
        ...,
        [0.5225, 0.4775],
        [0.5230, 0.4770],
        [0.5210, 0.4790]], grad_fn=<SoftmaxBackward0>)
torch.Size([2])


In [25]:
#Training the model on training data and evaluating it on development set
#%%time
def train_model():
  train_losses = []
  dev_losses = []
  dev_accuracies = []

  for e in range(EPOCHS):
    correct_predictions = 0
    optimizer.zero_grad()

    hidden_layer_output, classifier_output = model.forward(tfidf_transform_tensor)

    loss = criterion(classifier_output, y_train)
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()
    with torch.no_grad():
        model.eval()

        #Getting hidden layer and softmax output from model for dev data
        hidden_layer_output, classifier_output = model(x_dev)
        
        #Calculating loss
        dev_loss = criterion(classifier_output, y_dev)
        dev_losses.append(dev_loss)

        #Calculating values predicted by the model
        _, preds = torch.max(classifier_output, dim=1)
        correct_predictions += torch.sum(preds == y_dev)
        
        #Calculating accuracy
        dev_accuracy = correct_predictions.double() / len(y_dev)
        dev_accuracies.append(dev_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{EPOCHS}.. ",
          f"Training Loss: {dev_loss:.3f}.. ",
          f"Dev Loss: {dev_loss:.3f}.. ",
          f"Dev Accuracy: {dev_accuracy:.3f}")


train_model()

Epoch: 1/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 2/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 3/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 4/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 5/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 6/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 7/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 8/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 9/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 10/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 11/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 12/55..  Training Loss: 0.320..  Dev Loss: 0.320..  Dev Accuracy: 0.993
Epoch: 13/55..  Training Loss: 0.320..  Dev Loss: 0.320..  De

In [26]:
'''This function gets the predictions for each data point 
in the development and the training set'''

def get_predictions(model, x_test, y_test):

  predictions = []
  prediction_probs = []
  real_values = []
  with torch.no_grad():
    model.eval()
    labels = y_test

    #Currently, not interested in the hidden layer outputs.
    _,classifier_output = model(x_test)

    #Not interested in the maximum values, interested with the indices of these max values
    _, preds = torch.max(classifier_output, dim=1)

    predictions.extend(preds)
    prediction_probs.extend(classifier_output)
    real_values.extend(labels)
  predictions = torch.stack(predictions)

  prediction_probs = torch.stack(prediction_probs)
  real_values = torch.stack(real_values)
  return  predictions, prediction_probs, real_values

In [27]:
#Getting predictions for the development set
y_pred_dev, y_pred_probs, y_true_dev = get_predictions(
  model,
  x_dev, 
  y_dev
)

In [28]:
#Printing the classifictaion report for the Development set
print(classification_report(y_true_dev, y_pred_dev ,digits =4, target_names=CLASS_NAMES))

              precision    recall  f1-score   support

        scam     0.9890    0.9968    0.9929       631
    not_scam     0.9967    0.9886    0.9926       614

    accuracy                         0.9928      1245
   macro avg     0.9929    0.9927    0.9928      1245
weighted avg     0.9928    0.9928    0.9928      1245



In [29]:
#Getting the predictions for the test set
y_pred_test, y_pred_probs, y_true_test = get_predictions(
  model,
  x_test, 
  y_test
)

In [30]:
print(classification_report(y_true_test, y_pred_test , digits = 4,  target_names=CLASS_NAMES))

              precision    recall  f1-score   support

        scam     0.9935    1.0000    0.9967       607
    not_scam     1.0000    0.9937    0.9969       638

    accuracy                         0.9968      1245
   macro avg     0.9967    0.9969    0.9968      1245
weighted avg     0.9968    0.9968    0.9968      1245



In [31]:
#Saving the model onto the drive
model_save_name = 'pre-trainedTfidf.pt'
path = F"/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset/{model_save_name}" 
torch.save(model.state_dict(), path)

# RoBERTa

In [32]:
import tensorflow as tf
# Getting GPU device name.
device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [33]:
import torch
# If a GPU is available
if torch.cuda.is_available():    
    #set device to GPU   
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If no GPU is available
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA A100-SXM4-40GB


In [34]:
#Importing necessary libraries
!pip install transformers

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy
import seaborn        as sns

import transformers
from transformers                     import  RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch



from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer

from torch                            import nn, optim
from torch.utils                      import data
from sklearn.decomposition            import PCA

#Seeding for deterministic results
RANDOM_SEED = 64
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

if torch.cuda.is_available():
   torch.cuda.manual_seed(RANDOM_SEED)
   torch.cuda.manual_seed_all(RANDOM_SEED) 
   torch.backends.cudnn.deterministic = True  
   torch.backends.cudnn.benchmark = False


CLASS_NAMES = ['scam', 'not_scam']
MAX_LENGTH = 200
BATCH_SIZE = 4
EPOCHS = 6
HIDDEN_UNITS = 128

tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-large')  #Use roberta-large or roberta-base

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [35]:
#Creates a dataset which will be used to feed to RoBERTa
class ScamDataset(data.Dataset):

  def __init__(self, firstSeq, secondSeq, TextSrcInre, labelValue,  tokenizer, max_len):
    self.firstSeq    = firstSeq      #First input sequence that will be supplied to RoBERTa
    self.secondSeq   = secondSeq     #Second input sequence that will be supplied to RoBERTa
    self.TextSrcInre = TextSrcInre   #Concatenation of subject + body text to get features from 1 training example
    self.labelValue  = labelValue    #label value for each training example in the dataset
    self.tokenizer   = tokenizer     #tokenizer that will be used to tokenize input sequences (Uses BERT-tokenizer here)
    self.max_len     = max_len       #Maximum length of the tokens from the input sequence that BERT needs to attend to

  def __len__(self):
    return len(self.labelValue)

  def __getitem__(self, item):
    firstSeq    = str(self.firstSeq[item])
    secondSeq   = str(self.secondSeq[item])
    TextSrcInre = str(self.TextSrcInre[item])
    
    #Encoding the first and the second sequence to a form accepted by RoBERTa
    #RoBERTa does not use token_type_ids to distinguish the first sequence from the second sequnece.
    encoding = tokenizer.encode_plus(
        firstSeq,
        secondSeq,
        max_length = self.max_len,
        add_special_tokens= True,
        truncation = True,
        #pad_to_max_length = True,
        #truncation=True,
        padding='max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    return {
        'firstSeq' : firstSeq,
        'secondSeq' : secondSeq,
        'TextSrcInre': TextSrcInre,
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labelValue'  : torch.tensor(self.labelValue[item], dtype=torch.long)
    }

In [36]:
#Creates a data loader
def createDataLoader(dataframe, tokenizer, max_len, batch_size):
  ds = ScamDataset(
      firstSeq    = dataframe.emailText.to_numpy(),
      secondSeq   = dataframe.subject.to_numpy(),
      TextSrcInre = dataframe.TextSrcInre.to_numpy(),
      labelValue  = dataframe.labelValue.to_numpy(),
      tokenizer   = tokenizer,
      max_len     = max_len
  )

  return data.DataLoader(
      ds,
      batch_size  = batch_size,
      shuffle     = True,
      num_workers = 4
  )

In [37]:
#Creating data loader for training data
trainDataLoader        = createDataLoader(trainDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for development data
developmentDataLoader  = createDataLoader(devDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

#Creating data loader for test data
testDataLoader         = createDataLoader(testDf, tokenizer, MAX_LENGTH, BATCH_SIZE)

In [None]:
testDf

In [38]:
#Instantiating the tf-idf vectorizer object
tfidf = TfidfVectorizer(min_df = 10, max_df = 0.5, ngram_range=(1,2))

xtrain = trainDf['TextSrcInre'].tolist()
x_train_feats = tfidf.fit(xtrain)
print(x_train_feats)
print(len(x_train_feats.get_feature_names_out()))


x_train_transform = x_train_feats.transform(xtrain)
tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(x_train_transform)).float()
print(x_train_transform.shape)


pca = PCA(n_components=128)
p = pca.fit(tfidf_transform_tensor)
X = p.transform(tfidf_transform_tensor)
X = torch.from_numpy(X)


TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1, 2))
21759
(3735, 21759)


In [39]:
#This class defines the model that was used to pre-train a SNN on TF-IDF features
class Tfidf_Nn(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(len(tfidf.get_feature_names_out()), HIDDEN_UNITS)
        # Output layer
        self.output  =  nn.Linear(HIDDEN_UNITS, 2)
        self.dropout = nn.Dropout(0.1)
        
        # Defining tanh activation and softmax output 
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of our operations
        x = self.hidden(x)
        #print(x.shape)
        y = self.tanh(x)
        #print(y.shape)
        z = self.dropout(y)
        #print(z.shape)
        z = self.output(z)
        #print(z.shape)
        z = self.softmax(z)
        
        #Returning the ouputs from the hidden layer and the final output layer
        return  y, z

In [40]:
#Loading the already trained MLP model that was trained on TF-IDF features. 

from google.colab import drive
#drive.mount('/content/gdrive')
snnmodel = Tfidf_Nn()

model_save_name = 'pre-trainedTfidf.pt'
path = F"/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset/{model_save_name}"

snnmodel.load_state_dict(torch.load(path))
snnmodel.eval()

Tfidf_Nn(
  (hidden): Linear(in_features=21759, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (tanh): Tanh()
  (softmax): Softmax(dim=1)
)

In [41]:
'''This class defines the model that will be used for 
training and testing on the dataset.'''

class ScamClassifier(nn.Module):

  def __init__(self,  n_classes):
    super(ScamClassifier, self).__init__()
    self.robertaModel              = RobertaModel.from_pretrained('roberta-large')    #use roberta-large or roberta-base
    self.model_TFIDF               = snnmodel                                        #Pre-trained SNN trained with TF-IDF features

    self.drop                      = nn.Dropout(p = 0.3)

    self.output                    = nn.Linear(self.robertaModel.config.hidden_size, n_classes)

    self.input_size_tfidf_only     = self.robertaModel.config.hidden_size + len(tfidf.get_feature_names_out())
    self.input_size_tfidf_pca      = self.robertaModel.config.hidden_size + HIDDEN_UNITS
    
    self.dense                     = nn.Linear( self.input_size_tfidf_only,  self.input_size_tfidf_only)
    self.out_proj                  = nn.Linear( self.input_size_tfidf_only, n_classes)
    self.out_pca                   = nn.Linear( self.input_size_tfidf_pca, n_classes)

    self.input_size_preTrain_tfidf = self.robertaModel.config.hidden_size +  HIDDEN_UNITS 
    self.out                       = nn.Linear(self.input_size_preTrain_tfidf, n_classes)
    
    self.softmax                   = nn.Softmax(dim = 1)

  def forward(self, input_ids, attention_mask, inputs_tfidf_feats, pca_transformed_feats, modelType):
    
    roberta_output     = self.robertaModel(
        input_ids      = input_ids,               #Input sequence tokens
        attention_mask = attention_mask )         #Mask to avoid performing attention on padding tokens
    #print(roberta_output[1].shape)

    if modelType   == 'roberta-only':
      pooled_output = roberta_output[1]           #Using pooled output
      output        = self.drop(pooled_output)
      output        = self.output(output)

    elif modelType == 'roberta-tfIdf':
      soutput = roberta_output[1]#---------        experimenting with pooled output 
      #soutput = roberta_output[0][:, 0, :]        #taking <s> token (equivalent to [CLS] token in BERT)
      x       = torch.cat((soutput, inputs_tfidf_feats) , dim=1)
      x       = self.drop(x)
      output  = self.out_proj(x)

    elif modelType == 'roberta-pcaTfidf':
      soutput = roberta_output[1]
      x       = torch.cat((soutput, pca_transformed_feats) , dim=1)
      x       = self.drop(x)
      output  = self.out_pca(x)

    elif modelType == 'roberta-TrainedTfIdf':
      tfidf_hidddenLayer, tfidf_output = self.model_TFIDF(inputs_tfidf_feats)
      #print(tfidf_hidddenLayer.shape)
      #print(tfidf_output.shape)
    
      #Conactenating pooled output from RoBERTa with the hidden layer from the pre-trained SNN using TF-IDF features. 
      #pooled_output = torch.cat((roberta_output[1], tfidf_output) , dim=1)-------- Experimenting with Output of pre-trained SNN 
      pooled_output = torch.cat((roberta_output[1], tfidf_hidddenLayer) , dim=1)
      output        = self.drop(pooled_output)
      output        = self.out(output)
    
    return self.softmax(output)

In [42]:
#Instantiating a StanceClassifier object as our model and loading the model onto the GPU.
model = ScamClassifier(len(CLASS_NAMES))
model = model.to(device)
#print(model)

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
'''Using the same optimiser as used in BERT paper
with a different learning rate'''
optimizer = AdamW(model.parameters(), 
                  lr = 2e-6, 
                  correct_bias= False)

totalSteps = len(trainDataLoader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps = totalSteps
)

weights      = [10.0, 10.0]
classWeights = torch.FloatTensor(weights)
lossFunction = nn.CrossEntropyLoss(weight = classWeights).to(device)



In [44]:
#This function is used for training the model. 
def train_epoch(
  model,
  dataLoader,
  lossFunction,
  optimizer,
  device,
  scheduler,
  n_examples
):

  model = model.train()
  losses = []
  correctPredictions = 0

  for d in dataLoader:
    
    input_ids              = d["input_ids"].to(device)                           #Loading input ids to GPU
    attention_mask         = d["attention_mask"].to(device)                      #Loading attention mask to GPU
    labelValues            = d["labelValue"].to(device)                          #Loading label value to GPU
    textSrcInre            = d["TextSrcInre"]                                    
    tfidf_transform        = x_train_feats.transform(textSrcInre)
    tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()   
    pca_tensor             = p.transform(tfidf_transform_tensor)

    pca_tensor = torch.from_numpy(pca_tensor).float()
    pca_tensor = pca_tensor.to(device)
    tfidf_transform_tensor = tfidf_transform_tensor.to(device)

    outputs = model(
      input_ids             = input_ids,
      attention_mask        = attention_mask,
      inputs_tfidf_feats    = tfidf_transform_tensor,
      pca_transformed_feats = pca_tensor,
      modelType             = 'roberta-TrainedTfIdf'
    )

    #Determining the model predictions
    _, predictionIndices = torch.max(outputs, dim=1)
    loss = lossFunction(outputs, labelValues)

    #Calculating the correct predictions for accuracy
    correctPredictions += torch.sum(predictionIndices == labelValues)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(losses), correctPredictions.double() / n_examples

In [45]:
#This function is used for evaluating the model on the development and test set
def eval_model(
    model, 
    dataLoader, 
    lossFunction,
    device,
    n_examples
    ):
  
  model = model.eval()
  losses = []
  correctPredictions = 0

  with torch.no_grad():
    for d in dataLoader:

      input_ids              = d["input_ids"].to(device)                          #Loading input ids to GPU
      attention_mask         = d["attention_mask"].to(device)                     #Loading attention mask to GPU
      labelValues            = d["labelValue"].to(device)                         #Loading label values to GPU
      textSrcInre            = d["TextSrcInre"]
      tfidf_transform        = x_train_feats.transform(textSrcInre)
      tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()    
      
      pca_tensor             = p.transform(tfidf_transform_tensor)

      pca_tensor = torch.from_numpy(pca_tensor).float()
      pca_tensor = pca_tensor.to(device)
      tfidf_transform_tensor = tfidf_transform_tensor.to(device)

      #Getting the softmax output from model for dev data
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        inputs_tfidf_feats    = tfidf_transform_tensor,
        pca_transformed_feats = pca_tensor,
        modelType             = 'roberta-TrainedTfIdf'
      )

      #Determining the model predictions
      _, predictionIndices = torch.max(outputs, dim=1)
      loss = lossFunction(outputs, labelValues)

      #Calculating the correct predictions for accuracy
      correctPredictions += torch.sum(predictionIndices == labelValues)
      losses.append(loss.item())

  return np.mean(losses), correctPredictions.double() / n_examples

In [46]:
transformers.logging.set_verbosity_error()

In [47]:
#fine tuning ROBERTa and validating it 

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}')
  trainLoss, trainAccuracy = train_epoch(
    model,
    trainDataLoader,
    lossFunction,
    optimizer,
    device,
    scheduler,
    len(trainDf)
  )
  
  print(f'Training loss {trainLoss} Training accuracy {trainAccuracy}')

  devLoss, devAccuracy = eval_model(
    model,
    developmentDataLoader,
    lossFunction,
    device,
    len(devDf)
  )

  print(f'Development loss {devLoss} Development accuracy {devAccuracy}')
  print()
  
  print()

Epoch 1
Training loss 0.3363485021688188 Training accuracy 0.9764390896921018
Development loss 0.31881646315256756 Development accuracy 0.9943775100401607


Epoch 2
Training loss 0.31587835227396555 Training accuracy 0.997322623828648
Development loss 0.3158934718141189 Development accuracy 0.9975903614457832


Epoch 3
Training loss 0.3151478708045641 Training accuracy 0.9981258366800536
Development loss 0.318482435380037 Development accuracy 0.9943775100401607


Epoch 4
Training loss 0.31537878018758025 Training accuracy 0.9978580990629183
Development loss 0.3179845693401801 Development accuracy 0.9943775100401607


Epoch 5
Training loss 0.31452790030320144 Training accuracy 0.9986613119143241
Development loss 0.31718686500038856 Development accuracy 0.995983935742972


Epoch 6
Training loss 0.3143379206129009 Training accuracy 0.9989290495314592
Development loss 0.31718933754242384 Development accuracy 0.995983935742972




In [48]:
#This function gets the predictions from the model after it is trained.
def get_predictions(model, data_loader):

  model = model.eval()
  review_texta = []
  review_textb = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:

      textas                 = d["firstSeq"]
      textbs                 = d["secondSeq"]
      input_ids              = d["input_ids"].to(device)
      attention_mask         = d["attention_mask"].to(device)
      labels                 = d["labelValue"].to(device)
      textSrcInre            = d["TextSrcInre"]
      tfidf_transform        = tfidf.transform(textSrcInre)
      tfidf_transform_tensor = torch.tensor(scipy.sparse.csr_matrix.todense(tfidf_transform)).float()

      pca_tensor             =  p.transform(tfidf_transform_tensor)

      pca_tensor = torch.from_numpy(pca_tensor).float()
      pca_tensor = pca_tensor.to(device)
      tfidf_transform_tensor = tfidf_transform_tensor.to(device)

      #Getting the softmax output from model
      outputs = model(
        input_ids             = input_ids,
        attention_mask        = attention_mask,
        inputs_tfidf_feats    = tfidf_transform_tensor,
        pca_transformed_feats = pca_tensor,
        modelType             = 'roberta-TrainedTfIdf'
      )

      _, preds = torch.max(outputs, dim=1)     #Determining the model predictions

      review_texta.extend(textas)
      review_textb.extend(textbs)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
      real_values.extend(labels)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  
  return review_texta, review_textb, predictions, prediction_probs, real_values

In [49]:
#Getting model predictions on dev dataset
firstSeq_dev, secondSeq_dev, yHat_dev, predProbs_dev, yTest_dev = get_predictions(
  model,
  developmentDataLoader
)

In [50]:
  #Printing classification report for dev dataset (Evaluating the model on Dev set)
print(classification_report(yTest_dev, yHat_dev, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

        scam       1.00      1.00      1.00       631
    not_scam       1.00      1.00      1.00       614

    accuracy                           1.00      1245
   macro avg       1.00      1.00      1.00      1245
weighted avg       1.00      1.00      1.00      1245



In [51]:
#Saving the model onto the drive

model_save_name = 'RoBERTaLarge_TFIDFV2.pt'
path = F"/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset/{model_save_name}" 
torch.save(model.state_dict(), path)

In [52]:
#Getting model predictions on test dataset
firstSeq_test, secondSeq_test, yHat_test, predProbs_test, yTest_test = get_predictions(
  model,
  testDataLoader
)

In [53]:
#Printing classification report for test dataset (Evaluating the model on test set)
print(classification_report(yTest_test, yHat_test, target_names= CLASS_NAMES))

              precision    recall  f1-score   support

        scam       1.00      1.00      1.00       607
    not_scam       1.00      1.00      1.00       638

    accuracy                           1.00      1245
   macro avg       1.00      1.00      1.00      1245
weighted avg       1.00      1.00      1.00      1245



In [54]:
#Saving the predictions onto a CSV file for error analysis
zippedList =  list(zip(firstSeq_test, secondSeq_test, yHat_test, predProbs_test, yTest_test ))
dfObj = pd.DataFrame(zippedList, columns = ['Texta' , 'Textb', 'Ypred', 'YpredsProbs', 'label'])

dfObj.to_csv('/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset/dataPredsFromRoberta_TFIDFV2.csv')
!cp dataPredsFromRoberta_TFIDFV2.csv "/content/drive/MyDrive/Colab Notebooks/NLU-Project/Dataset"

cp: cannot stat 'dataPredsFromRoberta_TFIDFV2.csv': No such file or directory
