In [0]:
import nltk

In [0]:
train_stance_url='https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_stances.csv'
train_bodies_url='https://raw.githubusercontent.com/FakeNewsChallenge/fnc-1/master/train_bodies.csv'

In [0]:
import pandas as pd
import numpy as np
stances_train = pd.read_csv(train_stance_url)
body_train = pd.read_csv(train_bodies_url)
train = pd.merge(stances_train, body_train, how='left', on='Body ID')

In [0]:
train[0:2]

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...


In [0]:
train.shape

(49972, 4)

In [0]:
dic={'agree':0,'disagree':1,'discuss':2,'unrelated':3}
train['Stance']=train['Stance'].map(dic) 

In [0]:
train[0:1]

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Police find mass graves with at least '15 bodi...,712,3,Danny Boyle is directing the untitled film\n\n...


In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
english_stemmer = nltk.stem.SnowballStemmer('english')
token_pattern = r"(?u)\b\w\w+\b"
stopwords = set(nltk.corpus.stopwords.words('english'))

In [0]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

In [0]:
import re
def preprocess_data(line,
                    token_pattern=token_pattern,
                    exclude_stopword=True,
                    stem=True):
    token_pattern = re.compile(token_pattern, flags = re.UNICODE)
    tokens = [x.lower() for x in token_pattern.findall(line)]
    tokens_stemmed = tokens
    if stem:
#         pass
        tokens_stemmed = stem_tokens(tokens, english_stemmer)
    if exclude_stopword:
#         pass
        tokens_stemmed = [x for x in tokens_stemmed if x not in stopwords]

    return tokens_stemmed

In [0]:
preprocess_data('She is running. He ran away')
# train[0:5]

['run', 'ran', 'away']

In [0]:
# preprocess_data(train['Headline'])

Concatinating headline and body strings

In [0]:
train['combined']= train['Headline'] +" "+ train['articleBody']

In [0]:
train['combined'][0]

'Police find mass graves with at least \'15 bodies\' near Mexico town where 43 students disappeared after police clash Danny Boyle is directing the untitled film\n\nSeth Rogen is being eyed to play Apple co-founder Steve Wozniak in Sony’s Steve Jobs biopic.\n\nDanny Boyle is directing the untitled film, based on Walter Isaacson\'s book and adapted by Aaron Sorkin, which is one of the most anticipated biopics in recent years.\n\nNegotiations have not yet begun, and it’s not even clear if Rogen has an official offer, but the producers — Scott Rudin, Guymon Casady and Mark Gordon — have set their sights on the talent and are in talks.\n\nOf course, this may all be for naught as Christian Bale, the actor who is to play Jobs, is still in the midst of closing his deal. Sources say that dealmaking process is in a sensitive stage.\n\nInsiders say Boyle will is flying to Los Angeles to meet with actress to play one of the female leads, an assistant to Jobs. Insiders say that Jessica Chastain is

In [0]:
train['Headline'][0]

"Police find mass graves with at least '15 bodies' near Mexico town where 43 students disappeared after police clash"

**Preprocessing** the combined string = Stemming + Stop words removal

Generating **unigrams**= ['this , 'is' , 'a' ,'cat']

In [0]:
unigram=[]
for i in train['combined']:
  unigram.append(preprocess_data(i))

In [0]:
head_unigram=[]
for i in train['Headline']:
  head_unigram.append(preprocess_data(i))

In [0]:
body_unigram=[]
for i in train['articleBody']:
  body_unigram.append(preprocess_data(i))


In [0]:
unigram_np = np.array(unigram)
unigram_np.shape

(49972,)

**Making strings from unigrams** 

In [0]:

unigram_comb=[]
for count,i in enumerate(unigram_np):
  unigram_comb.append(' '.join(unigram_np[count]))

In [0]:
unigram_comb_head=[]
for count,i in enumerate(head_unigram):
  unigram_comb_head.append(' '.join(head_unigram[count]))

In [0]:
unigram_comb_body=[]
for count,i in enumerate(body_unigram):
  unigram_comb_body.append(' '.join(body_unigram[count]))

In [0]:
len(unigram_comb)

49972

In [0]:
len(unigram_comb[0])

749

In [0]:
len(unigram)

49972

In [0]:
train['headline_unigram']=np.array(unigram_comb_head)
train['combined_unigram']=np.array(unigram_comb)
train['body_unigram']=np.array(unigram_comb_body)

In [0]:
del(unigram)
del(head_unigram)
del(body_unigram)
del(unigram_comb)
del(unigram_comb_head)
del(unigram_comb_body)
del(stances_train)
del(body_train)
del(unigram_np)
import gc
collected= gc.collect()
print("garbage collected= ",collected)

garbage collected=  413


In [0]:
vocab_size=4000
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(ngram_range=(1,1), max_df=0.8, min_df=2,max_features=vocab_size)
vec.fit(train["combined_unigram"]) # Tf-idf calculated on the combined training + test set
vocabulary = vec.vocabulary_
del(vec)

In [0]:
vecH = TfidfVectorizer(ngram_range=(1,1), max_df=0.8, min_df=2, vocabulary=vocabulary)
xHeadlineTfidf = vecH.fit_transform(train['headline_unigram']) # use ' '.join(Headline_unigram) instead of Headline since the former is already stemmed
print ('xHeadlineTfidf.shape:')
print (xHeadlineTfidf.shape)
del(vecH)

xHeadlineTfidf.shape:
(49972, 4000)


In [0]:
len(vocabulary)

4000

In [0]:
xHeadlineTfidf[0]

<1x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [0]:
vecB = TfidfVectorizer(ngram_range=(1, 1), max_df=0.8, min_df=2, vocabulary=vocabulary,max_features=vocab_size)
xBodyTfidf = vecB.fit_transform(train['body_unigram'])
print ('xBodyTfidf.shape:')
print (xBodyTfidf.shape)
del(vecB)
del(vocabulary)

xBodyTfidf.shape:
(49972, 4000)


In [0]:
train[0:1]

In [0]:
print(xHeadlineTfidf.shape)
print(xBodyTfidf.shape)

(49972, 4000)
(49972, 4000)


In [0]:
type(xHeadlineTfidf)

In [0]:
from scipy.sparse import csr_matrix
xheadline= xHeadlineTfidf.toarray()
xbody=xBodyTfidf.toarray()
del(xHeadlineTfidf)
del(xBodyTfidf)


In [0]:
print(xheadline.shape)
print("body shape")
print(xbody.shape)

(49972, 4000)
body shape
(49972, 4000)


In [0]:
x= np.hstack((xheadline,xbody))
print(x.shape)

(49972, 8000)


In [0]:
print(train.columns)
# train['xHeadlineTfidf']=xHeadlineTfidf
# train['xBodyTfidf']=xBodyTfidf
# train['combinedTfidf']=x

Index(['Headline', 'Body ID', 'Stance', 'articleBody', 'combined',
       'headline_unigram', 'combined_unigram', 'body_unigram'],
      dtype='object')


In [0]:
y= train['Stance']
print(x.shape)
print(y.shape)
del(train)
del(xheadline)
del(xbody)

import gc
collected= gc.collect()
print("garbage collected= ",collected)

(49972, 8000)
(49972,)
garbage collected=  275


In [0]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

f=open('tfidf_x.pickle','rb')
x= pk.load(f)
f.close()

seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)



In [0]:
import pickle as pk
f=open('tfidf_x.pickle','wb')
pk.dump(x,f)
f.close()
del(x)

In [0]:

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

# get the folder id where you want to save your file
file = drive.CreateFile()
file.SetContentFile('tfidf_x.pickle')
file.Upload()
# del(x)
import gc
collected= gc.collect()
print("garbage collected= ",collected)

garbage collected=  1137


In [0]:
!pip install pydrive

Collecting pydrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K    100% |████████████████████████████████| 993kB 27.4MB/s 
Building wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built pydrive
Installing collected packages: pydrive
Successfully installed pydrive-1.3.1


In [0]:
# seed = 7
# test_size = 0.33
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# fit model no training data

model = XGBClassifier()
model.fit(X_train, y_train)
print("model fit done")
# make predictions for test data
y_pred = model.predict(X_test)
print("predictions obtained")
predictions = [round(value) for value in y_pred]
# evaluate predictions
print("calculating accuracy")
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
os.getcwd()

'/content'

In [0]:
os.chdir('drive/My Drive')

In [0]:
os.listdir()

['img482.jpg',
 'ImagePrint.pdf',
 'img478.jpg',
 'Contact Information.gform',
 'avr mazidi.pdf',
 'numpy-1.9.0-win32-superpack-python2.7.exe',
 'Screenshot_2017-07-30-22-20-16.png',
 'mnist_data_label',
 'OCR.py',
 'mnist_data',
 'To-do list.gsheet',
 'Gate',
 'IITK_resume.pdf',
 'IIT_K_modify.pdf',
 'Shivani_Tyagi.rtf',
 'machine learning- stanford',
 'kanpur resume.pdf',
 'kanpur final.pdf',
 'collage',
 'Resume',
 'offer.jpg',
 'Btech sem result',
 'Alternate prospectus aicte iiit h.pdf',
 '2018201083_3.c',
 '2018201083_1.c',
 '20182010_2.c',
 'pnb passbook.pdf',
 'resume_ey _page2.pdf',
 'resume_ey_page1.pdf',
 'resume_ey.pdf',
 'resume.pdf',
 'resume1.pdf',
 'tiwari2.3gp',
 'Colab Notebooks',
 'Mentor Intern Agreement Form',
 'ankush_nagpal.pdf',
 'ankush_nagpal - Ankush nagpal.pdf',
 'resume',
 'ankush_nagpal_new.pdf',
 'fnc',
 'tfidf_x.pickle']

In [0]:
from numpy import loadtxt
# from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle as pk
f=open('tfidf_x.pickle','rb')
x= pk.load(f)
f.close()
y=train['Stance']
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=seed)


In [0]:

# y=train['Stance']
del(train)
del(x)
del(f)
import gc
collected= gc.collect()
print("garbage collected= ",collected)

garbage collected=  24


In [0]:
import gc
collected= gc.collect()
print("garbage collected= ",collected)

garbage collected=  0


In [0]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
clf.fit(X_train, y_train)
from sklearn.metrics import accuracy_score

#model.fit(trainSentiment_feats, trainLabels)
#model.fit(trainSentiment_feats, trainLabels)
# make predictions for test data

val_pred = clf.predict(X_test)
y_predict = [round(value) for value in val_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# print('Score over validation set: ',fnc_score(valLabels, valPredictions))


      Iter       Train Loss   Remaining Time 
         1       34780.2667          267.90m
         2       32899.8155          265.24m
         3       31350.8851          263.34m
         4       30061.3159          263.02m
         5       28940.7437          261.59m
         6       28029.8302          260.86m
         7       27204.8798          259.17m
         8       26587.9660          257.82m
         9       26030.2315          256.70m
        10       25530.1862          254.90m
        20       22816.7311          239.94m
        30       21510.7675          225.55m
        40       20659.2709          210.09m
        50       20027.5525          195.34m
        60       19378.5485          181.37m
        70       18845.1627          167.87m
        80       18360.6994          154.54m
        90       17898.0159          141.30m
       100       17428.7941          128.19m
       200       14225.7487            0.00s
Accuracy: 84.11%


In [0]:
def score(gold_lab, test_lab):
    score = 0.0
    for (g,t) in zip(gold_lab, test_lab):
        if g == t:
            score+=0.25
            if g != 3:
                score+=0.5
        if g in [0,1,2] and t in [0,1,2]:
            score+=0.25
    
    return score

def fnc_score(actual, predicted):
    actual_score = score(actual, actual)
    calc_score = score(actual, predicted)
    return (calc_score*100)/actual_score

In [0]:
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated','related']
RELATED = LABELS[0:3]

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]]

    for i, (g, t) in enumerate(zip(gold_labels, test_labels)):
        g_stance, t_stance = g, t
        if g_stance == t_stance:
            score += 0.25
            if g_stance != 'unrelated':
                score += 0.50
        if g_stance in RELATED and t_stance in RELATED:
            score += 0.25

        cm[LABELS.index(g_stance)][LABELS.index(t_stance)] += 1

    return score, cm


def print_confusion_matrix(cm):
    lines = []
    header = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format('', *LABELS)
    line_len = len(header)
    lines.append("-"*line_len)
    lines.append(header)
    lines.append("-"*line_len)

    hit = 0
    total = 0
    for i, row in enumerate(cm):
        hit += row[i]
        total += sum(row)
        lines.append("|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|".format(LABELS[i],
                                                                   *row))
        lines.append("-"*line_len)
    print('\n'.join(lines))


def report_score(actual,predicted):
    score,cm = score_submission(actual,predicted)
    best_score, _ = score_submission(actual,actual)

    print_confusion_matrix(cm)
    print("Score: " +str(score) + " out of " + str(best_score) + "\t("+str(score*100/best_score) + "%)")
    return score*100/best_score

In [0]:
print('Score over validation set: ',fnc_score(y_test, y_predict))

Score over validation set:  65.72342934141822


In [0]:
import pickle as pk
f=open('tfidf_ypredict.pickle','wb')
pk.dump(y_predict,f)
f.close()

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

# get the folder id where you want to save your file
file = drive.CreateFile()
file.SetContentFile('tfidf_ypredict.pickle')
file.Upload()
# del(x)
import gc
collected= gc.collect()
print("garbage collected= ",collected)

garbage collected=  505


In [0]:
!pip install pydrive

Collecting pydrive
[?25l  Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB)
[K    100% |████████████████████████████████| 993kB 23.9MB/s 
Building wheels for collected packages: pydrive
  Building wheel for pydrive (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/fa/d2/9a/d3b6b506c2da98289e5d417215ce34b696db856643bad779f4
Successfully built pydrive
Installing collected packages: pydrive
Successfully installed pydrive-1.3.1
