In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    print('Failed')
    # Invalid device or cannot modify virtual devices once initialized.
    pass

In [2]:
import re
import pickle
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec

In [3]:
train_data = pd.read_csv('fake-news/train.csv')
test_data = pd.read_csv('fake-news/test.csv')

In [4]:
data = pd.concat([train_data,test_data])

In [5]:
data.reset_index(inplace = True)

In [6]:
data.drop('index',axis = 1,inplace = True)

In [7]:
data

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1.0
...,...,...,...,...,...
25995,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,
25996,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,
25997,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,
25998,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,


In [8]:
data['combined'] = data['title'] + ' ' + data['text']

In [9]:
data

Unnamed: 0,id,title,author,text,label,combined
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0.0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1.0,Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1.0,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1.0,Iranian woman jailed for fictional unpublished...
...,...,...,...,...,...,...
25995,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,,The Bangladeshi Traffic Jam That Never Ends - ...
25996,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,,John Kasich Signs One Abortion Bill in Ohio bu...
25997,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,,"California Today: What, Exactly, Is in Your Su..."
25998,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,,300 US Marines To Be Deployed To Russian Borde...


In [10]:
data.columns

Index(['id', 'title', 'author', 'text', 'label', 'combined'], dtype='object')

In [11]:
data = data[['id', 'title', 'author', 'text', 'combined', 'label']]

In [12]:
data

Unnamed: 0,id,title,author,text,combined,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...,1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...,1.0
...,...,...,...,...,...,...
25995,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,The Bangladeshi Traffic Jam That Never Ends - ...,
25996,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,John Kasich Signs One Abortion Bill in Ohio bu...,
25997,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,"California Today: What, Exactly, Is in Your Su...",
25998,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,300 US Marines To Be Deployed To Russian Borde...,


In [13]:
data['combined'].fillna('This text field is empty',inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [14]:
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    corpus = []
    for i in range(len(text)):
        review = re.sub('[^a-zA-Z]',' ',text[i])
        review = review.lower()
        review = review.split()
        review = [lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english') and len(word)>2]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [15]:
data['combined'].isnull().sum()

0

In [16]:
filtered_data = lemmatization(data['combined'])

In [17]:
filtered_data[0]

'house dem aide even see comey letter jason chaffetz tweeted house dem aide even see comey letter jason chaffetz tweeted darrell lucus october subscribe jason chaffetz stump american fork utah image courtesy michael jolley available creative common license apology keith olbermann doubt worst person world week fbi director james comey according house democratic aide look like also know second worst person well turn comey sent infamous letter announcing fbi looking email may related hillary clinton email server ranking democrat relevant committee hear comey found via tweet one republican committee chairman know comey notified republican chairman democratic ranking member house intelligence judiciary oversight committee agency reviewing email recently discovered order see contained classified information long letter went oversight committee chairman jason chaffetz set political world ablaze tweet fbi dir informed fbi learned existence email appear pertinent investigation case reopened jas

In [18]:
# with open('filtered_data.pkl','wb') as f:
#     pickle.dump(filtered_data,f)

In [14]:
filtered_data = pickle.load( open('filtered_data.pkl', 'rb'))

In [19]:
# tokenizer = Tokenizer(lower = False)
# tokenizer.fit_on_texts(filtered_data)

In [20]:
# with open('tokenizer.pkl','wb') as f:
#     pickle.dump(tokenizer,f)

In [17]:
tokenizer = pickle.load(open('tokenize.pkl','rb'))

In [18]:
content = tokenizer.texts_to_sequences(filtered_data)

In [19]:
cont = np.array([len(x) for x in content])
cont_len = len(cont[cont<900])
print("content : {}, maxlen : {}".format(len(cont),cont_len))

content : 26000, maxlen : 24255


In [20]:
max_len = 900

In [21]:
content_index = tokenizer.word_index

In [22]:
data

Unnamed: 0,id,title,author,text,combined,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...,1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...,1.0
...,...,...,...,...,...,...
25995,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,The Bangladeshi Traffic Jam That Never Ends - ...,
25996,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,John Kasich Signs One Abortion Bill in Ohio bu...,
25997,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,"California Today: What, Exactly, Is in Your Su...",
25998,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,300 US Marines To Be Deployed To Russian Borde...,


In [23]:
data['label'].isnull().sum()

5200

In [24]:
data[data['label'] == np.nan]

Unnamed: 0,id,title,author,text,combined,label


In [25]:
data['label'].isnull().sum()

5200

In [26]:
data

Unnamed: 0,id,title,author,text,combined,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...,1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...,1.0
...,...,...,...,...,...,...
25995,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,The Bangladeshi Traffic Jam That Never Ends - ...,
25996,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,John Kasich Signs One Abortion Bill in Ohio bu...,
25997,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,"California Today: What, Exactly, Is in Your Su...",
25998,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,300 US Marines To Be Deployed To Russian Borde...,


In [27]:
data['label'].fillna(9,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [28]:
data

Unnamed: 0,id,title,author,text,combined,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...,1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...,1.0
...,...,...,...,...,...,...
25995,25995,The Bangladeshi Traffic Jam That Never Ends - ...,Jody Rosen,Of all the dysfunctions that plague the world’...,The Bangladeshi Traffic Jam That Never Ends - ...,9.0
25996,25996,John Kasich Signs One Abortion Bill in Ohio bu...,Sheryl Gay Stolberg,WASHINGTON — Gov. John Kasich of Ohio on Tu...,John Kasich Signs One Abortion Bill in Ohio bu...,9.0
25997,25997,"California Today: What, Exactly, Is in Your Su...",Mike McPhate,Good morning. (Want to get California Today by...,"California Today: What, Exactly, Is in Your Su...",9.0
25998,25998,300 US Marines To Be Deployed To Russian Borde...,,« Previous - Next » 300 US Marines To Be Deplo...,300 US Marines To Be Deployed To Russian Borde...,9.0


In [29]:
train_len = len(data[data['label'] != 9])

In [30]:
train = data[:train_len]
test = data[train_len:]

In [31]:
train

Unnamed: 0,id,title,author,text,combined,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1.0
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0.0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...,1.0
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...,1.0
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...,1.0
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Rapper T.I.: Trump a ’Poster Child For White S...,0.0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",0.0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Macy’s Is Said to Receive Takeover Approach by...,0.0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1.0


In [32]:
train['label'] = train['label'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['label'] = train['label'].astype(int)


In [33]:
train

Unnamed: 0,id,title,author,text,combined,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Why the Truth Might Get You Fired Why the Trut...,1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,15 Civilians Killed In Single US Airstrike Hav...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Iranian woman jailed for fictional unpublished...,1
...,...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Rapper T.I.: Trump a ’Poster Child For White S...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Macy’s Is Said to Receive Takeover Approach by...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1


In [34]:
train_tok = content[:train_len]
test_tok = content[train_len:]

In [35]:
train_seq = pad_sequences(train_tok,padding = 'post',truncating = 'post',maxlen = max_len)
test_seq = pad_sequences(test_tok,padding = 'post', truncating = 'post', maxlen = max_len)

In [36]:
len(train_seq[0])

900

### Word2Vec

In [37]:
contentVec = [nltk.word_tokenize(word) for word in filtered_data]

In [38]:
contentVec

[['house',
  'dem',
  'aide',
  'even',
  'see',
  'comey',
  'letter',
  'jason',
  'chaffetz',
  'tweeted',
  'house',
  'dem',
  'aide',
  'even',
  'see',
  'comey',
  'letter',
  'jason',
  'chaffetz',
  'tweeted',
  'darrell',
  'lucus',
  'october',
  'subscribe',
  'jason',
  'chaffetz',
  'stump',
  'american',
  'fork',
  'utah',
  'image',
  'courtesy',
  'michael',
  'jolley',
  'available',
  'creative',
  'common',
  'license',
  'apology',
  'keith',
  'olbermann',
  'doubt',
  'worst',
  'person',
  'world',
  'week',
  'fbi',
  'director',
  'james',
  'comey',
  'according',
  'house',
  'democratic',
  'aide',
  'look',
  'like',
  'also',
  'know',
  'second',
  'worst',
  'person',
  'well',
  'turn',
  'comey',
  'sent',
  'infamous',
  'letter',
  'announcing',
  'fbi',
  'looking',
  'email',
  'may',
  'related',
  'hillary',
  'clinton',
  'email',
  'server',
  'ranking',
  'democrat',
  'relevant',
  'committee',
  'hear',
  'comey',
  'found',
  'via',
  't

In [40]:
embed_dim = 100

In [41]:
w2v_content = Word2Vec(contentVec, min_count = 1, size = embed_dim)

In [68]:
w2v_content.wv.most_similar('money')

[('cash', 0.7237710952758789),
 ('fund', 0.6511778235435486),
 ('dollar', 0.621246337890625),
 ('taxpayer', 0.6039408445358276),
 ('donation', 0.5928052663803101),
 ('payment', 0.5879735350608826),
 ('loan', 0.5847669839859009),
 ('debt', 0.5820124745368958),
 ('sum', 0.5710011124610901),
 ('paycheck', 0.5674009323120117)]

In [70]:
len(w2v_content.wv.vocab)

152101

In [77]:
def get_embedding_weights(model,vocabulary):
    voc_size = len(vocabulary) + 1
    weight_matrix = np.zeros((voc_size,embed_dim))
    for word,i in vocabulary.items():
        try:
            weight_matrix[i,:] = model.wv[word]
        except KeyError:
            pass
    return weight_matrix

In [82]:
embedding_weights = get_embedding_weights(w2v_content,content_index)

In [83]:
vocabulary_size = len(content_index) + 1

In [90]:
X = np.array(train_seq)
y = np.array(train['label'])

In [91]:
X.shape

(20800, 900)

In [92]:
y.shape

(20800,)

In [85]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabulary_size,output_dim = embed_dim, weights = [embedding_weights],input_length = max_len, trainable = False),
    tf.keras.layers.LSTM(units = 128),
    tf.keras.layers.Dense(1,activation = 'sigmoid', kernel_regularizer = tf.keras.regularizers.l2(0.01))
])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [86]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 900, 100)          15210700  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 15,328,077
Trainable params: 117,377
Non-trainable params: 15,210,700
_________________________________________________________________


In [93]:
model.fit(X,y,epochs = 15, validation_split = 0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fad18651610>

In [94]:
model.save('news_model.h5')

In [95]:
X_test = np.array(test_seq)

In [98]:
y_pred = model.predict(X_test)

In [99]:
def binarize(arr):
    for i,value in enumerate(arr):
        if value > 0.5:
            arr[i] = 1
        else:
            arr[i] = 0
    arr.reshape(-1)
    a = []
    for e in arr:
        a.append(e.astype(int))
    return a

In [100]:
y_pred = binarize(y_pred)

In [106]:
for i in range(len(y_pred)):
    y_pred[i] = y_pred[i][0]

In [107]:
df = pd.DataFrame({'id' : test['id'],'label' : y_pred})

In [108]:
df

Unnamed: 0,id,label
20800,20800,0
20801,20801,1
20802,20802,1
20803,20803,0
20804,20804,1
...,...,...
25995,25995,0
25996,25996,0
25997,25997,0
25998,25998,1


In [109]:
df.to_csv('submit.csv', index = False)

In [42]:
test['combined'].iloc[780]

'Cyrus Mistry speaks out, says TATA removal didn’t hurt as much as punny headlines on it Cyrus Mistry speaks out, says TATA removal didn’t hurt as much as punny headlines on it Posted on Tweet \nOusted TATA group chairman Cyrus Mistry finally spoke out regarding the development, terming that while getting the boot did hurt , it was nowhere in comparison to what he felt after reading dozens of pun-filled headlines over his name and ouster. \n“What the hell, yaar ? Whatever I pick up, all I read is: Mystery over Mistry’s ouster; Che-Mistry no more, TATA group sacks chairman; Mist yet to clear on why Mistry was ousted, TATA says tata to Mistry. I can’t take this anymore! Please stop. I can always find another job or even become a chairman of some other company if lady luck favors me, but will these Mistry puns ever end? I’ve had enough of this pun-ishment!” the ex-chairman cried out to The UnReal Times . The businessman was soon supported by Sixth Sense scientist Pranav Mistry, who tweete