# CharlieHebdo - Word Embeddings through GloVe and Rumour Detection

## Load csv file into pandas dataframe

---



In [0]:
from glove import Corpus, Glove

In [2]:
pip install glove_python



In [0]:
from glove import Corpus, Glove

In [0]:
corpus = Corpus() 

In [4]:
from google.colab import files
uploaded = files.upload()

Saving dump_charliehebdo.csv to dump_charliehebdo (1).csv


In [0]:
import io
import pandas as pd
df = pd.read_csv(io.BytesIO(uploaded['dump_charliehebdo.csv']), header = None)

In [52]:
df.head()

Unnamed: 0,0,1
0,Charlie Hebdo became well known for publishing...,0
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0
2,Prediction: the #CharlieHebdo massacre will no...,0
3,10:28am Charlie Hebdo account mocks ISIS leade...,0
4,If your faith isn't strong enough to cope with...,0


In [0]:
df.columns = ['text','rumour']

In [54]:
df.head()

Unnamed: 0,text,rumour
0,Charlie Hebdo became well known for publishing...,0
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0
2,Prediction: the #CharlieHebdo massacre will no...,0
3,10:28am Charlie Hebdo account mocks ISIS leade...,0
4,If your faith isn't strong enough to cope with...,0


In [0]:
#rumour_df = df.sample(frac = 1)
rumour_df = df

In [56]:
rumour_df.head()

Unnamed: 0,text,rumour
0,Charlie Hebdo became well known for publishing...,0
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0
2,Prediction: the #CharlieHebdo massacre will no...,0
3,10:28am Charlie Hebdo account mocks ISIS leade...,0
4,If your faith isn't strong enough to cope with...,0


In [57]:
rumour_df.shape

(2079, 2)

## NLP stuff

In [0]:
import re
import numpy as np
# function for cleaning data
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

In [0]:
rumour_df['clean'] = np.vectorize(remove_pattern)(rumour_df['text'], "@[\w]*")

In [60]:
rumour_df.head()

Unnamed: 0,text,rumour,clean
0,Charlie Hebdo became well known for publishing...,0,Charlie Hebdo became well known for publishing...
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0,Charlie Hebdo’s Last Tweet Before Shootings ht...
2,Prediction: the #CharlieHebdo massacre will no...,0,Prediction: the #CharlieHebdo massacre will no...
3,10:28am Charlie Hebdo account mocks ISIS leade...,0,10:28am Charlie Hebdo account mocks ISIS leade...
4,If your faith isn't strong enough to cope with...,0,If your faith isn't strong enough to cope with...


In [0]:
rumour_df['clean'] = rumour_df['clean'].str.replace("[^a-zA-Z#]", " ")

In [62]:
rumour_df.head()

Unnamed: 0,text,rumour,clean
0,Charlie Hebdo became well known for publishing...,0,Charlie Hebdo became well known for publishing...
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0,Charlie Hebdo s Last Tweet Before Shootings ht...
2,Prediction: the #CharlieHebdo massacre will no...,0,Prediction the #CharlieHebdo massacre will no...
3,10:28am Charlie Hebdo account mocks ISIS leade...,0,am Charlie Hebdo account mocks ISIS leade...
4,If your faith isn't strong enough to cope with...,0,If your faith isn t strong enough to cope with...


In [0]:
rumour_df['clean'] = rumour_df.clean.apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

In [64]:
rumour_df.head()

Unnamed: 0,text,rumour,clean
0,Charlie Hebdo became well known for publishing...,0,Charlie Hebdo became well known publishing Muh...
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0,Charlie Hebdo Last Tweet Before Shootings http...
2,Prediction: the #CharlieHebdo massacre will no...,0,Prediction #CharlieHebdo massacre will dent po...
3,10:28am Charlie Hebdo account mocks ISIS leade...,0,Charlie Hebdo account mocks ISIS leader wishin...
4,If your faith isn't strong enough to cope with...,0,your faith strong enough cope with satirical p...


In [0]:
rumour_df.clean = rumour_df.clean.apply(lambda x: x.split())

In [66]:
rumour_df.head()

Unnamed: 0,text,rumour,clean
0,Charlie Hebdo became well known for publishing...,0,"[Charlie, Hebdo, became, well, known, publishi..."
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0,"[Charlie, Hebdo, Last, Tweet, Before, Shooting..."
2,Prediction: the #CharlieHebdo massacre will no...,0,"[Prediction, #CharlieHebdo, massacre, will, de..."
3,10:28am Charlie Hebdo account mocks ISIS leade...,0,"[Charlie, Hebdo, account, mocks, ISIS, leader,..."
4,If your faith isn't strong enough to cope with...,0,"[your, faith, strong, enough, cope, with, sati..."


In [0]:
from nltk.stem.porter import * 
stemmer = PorterStemmer() 
tokenized_tweet = rumour_df.clean.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

In [68]:
tokenized_tweet.head()

0    [charli, hebdo, becam, well, known, publish, m...
1    [charli, hebdo, last, tweet, befor, shoot, htt...
2    [predict, #charliehebdo, massacr, will, dent, ...
3    [charli, hebdo, account, mock, isi, leader, wi...
4    [your, faith, strong, enough, cope, with, sati...
Name: clean, dtype: object

In [0]:
tokens = list(tokenized_tweet)

In [79]:
tokens[0][0]

'charli'

In [0]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])    
rumour_df['clean'] = tokenized_tweet

In [82]:
rumour_df.head()

Unnamed: 0,text,rumour,clean
0,Charlie Hebdo became well known for publishing...,0,charli hebdo becam well known publish muham ca...
1,Charlie Hebdo’s Last Tweet Before Shootings ht...,0,charli hebdo last tweet befor shoot http xaqoc...
2,Prediction: the #CharlieHebdo massacre will no...,0,predict #charliehebdo massacr will dent polit ...
3,10:28am Charlie Hebdo account mocks ISIS leade...,0,charli hebdo account mock isi leader wish good...
4,If your faith isn't strong enough to cope with...,0,your faith strong enough cope with satir poke ...


## GloVe Word Embeddings

In [0]:
#training the corpus to generate the co occurence matrix which is used in GloVe
corpus.fit(tokens, window=10)

In [0]:
glove = Glove(no_components=100, learning_rate=0.05)

In [90]:
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [0]:
glove.add_dictionary(corpus.dictionary)

In [0]:
glove.save('glove.model')

In [93]:
glove.word_vectors[glove.dictionary['pari']]

array([ 0.21242795, -0.02948825,  0.10696074,  0.44290483,  0.28903648,
        0.13019458,  0.12635135, -0.21291367, -0.05678603,  0.14104997,
       -0.15335733, -0.13490765, -0.19913229, -0.1012081 , -0.17305592,
       -0.01014164,  0.29663231, -0.33000104, -0.00723071, -0.25015694,
       -0.03742989,  0.1293629 ,  0.0824965 ,  0.07075561,  0.02186807,
       -0.06644342,  0.26226777,  0.11176224,  0.0396616 , -0.04057066,
        0.07012145, -0.05598482, -0.12659794, -0.24373654, -0.2891988 ,
        0.45251641,  0.13755843, -0.1008381 , -0.122247  , -0.10327804,
       -0.24056082, -0.00966651,  0.43068617,  0.23568502,  0.20909835,
        0.2291351 ,  0.17406777,  0.08914236, -0.22455879,  0.37833071,
        0.07773402,  0.24050222, -0.21980524,  0.11540706,  0.21815985,
       -0.26901216,  0.10929617, -0.10756064, -0.03127759,  0.11779867,
        0.03713517,  0.33421548, -0.34315357,  0.17012823,  0.07355625,
        0.11453801,  0.2076905 ,  0.05333259,  0.46511416,  0.08

In [100]:
glove.word_vectors[glove.dictionary['pari']]

array([ 0.21242795, -0.02948825,  0.10696074,  0.44290483,  0.28903648,
        0.13019458,  0.12635135, -0.21291367, -0.05678603,  0.14104997,
       -0.15335733, -0.13490765, -0.19913229, -0.1012081 , -0.17305592,
       -0.01014164,  0.29663231, -0.33000104, -0.00723071, -0.25015694,
       -0.03742989,  0.1293629 ,  0.0824965 ,  0.07075561,  0.02186807,
       -0.06644342,  0.26226777,  0.11176224,  0.0396616 , -0.04057066,
        0.07012145, -0.05598482, -0.12659794, -0.24373654, -0.2891988 ,
        0.45251641,  0.13755843, -0.1008381 , -0.122247  , -0.10327804,
       -0.24056082, -0.00966651,  0.43068617,  0.23568502,  0.20909835,
        0.2291351 ,  0.17406777,  0.08914236, -0.22455879,  0.37833071,
        0.07773402,  0.24050222, -0.21980524,  0.11540706,  0.21815985,
       -0.26901216,  0.10929617, -0.10756064, -0.03127759,  0.11779867,
        0.03713517,  0.33421548, -0.34315357,  0.17012823,  0.07355625,
        0.11453801,  0.2076905 ,  0.05333259,  0.46511416,  0.08

In [99]:
len(glove.dictionary.keys())

4655

In [109]:
glove.dictionary['hebdo']

1

In [0]:
keys = glove.dictionary.keys()

In [122]:
glove.dictionary.values()

dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 21

In [0]:
features['charlie'] = 0

In [0]:
features = pd.DataFrame(0, index = np.arange(100), columns = keys)

In [152]:
features.head()

Unnamed: 0,charli,hebdo,becam,well,known,publish,muham,cartoon,year,last,tweet,befor,shoot,http,xaqocm,skjhneqcn,predict,#charliehebdo,massacr,will,dent,polit,class,complac,iota,account,mock,isi,leader,wish,good,health,best,uyxaykla,your,faith,strong,enough,cope,with,...,zhck,auidg,ypilwckuuw,smxqddlrx,ryegvipalu,accus,wxdk,icthapo,safe,pwmz,jzwm,ksgeeu,joel,saget,keuzi,coordin,firefight,wqrogo,paradis,jyhdgnp,umff,lvvjl,tatucfi,dbie,kfpu,jfcrkaz,awtf,sryk,familiar,cbsbhu,mond,aastvzd,#portedevincenn,#paristerrorattack,ltgmt,kwhptfi,hjnezdkavp,neutralis,zsjbjzem,kklxxg
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [154]:
features.shape

(100, 4655)

In [0]:
for key in keys:
  features[key] = (glove.word_vectors[glove.dictionary[key]])

In [157]:
features.head()

Unnamed: 0,charli,hebdo,becam,well,known,publish,muham,cartoon,year,last,tweet,befor,shoot,http,xaqocm,skjhneqcn,predict,#charliehebdo,massacr,will,dent,polit,class,complac,iota,account,mock,isi,leader,wish,good,health,best,uyxaykla,your,faith,strong,enough,cope,with,...,zhck,auidg,ypilwckuuw,smxqddlrx,ryegvipalu,accus,wxdk,icthapo,safe,pwmz,jzwm,ksgeeu,joel,saget,keuzi,coordin,firefight,wqrogo,paradis,jyhdgnp,umff,lvvjl,tatucfi,dbie,kfpu,jfcrkaz,awtf,sryk,familiar,cbsbhu,mond,aastvzd,#portedevincenn,#paristerrorattack,ltgmt,kwhptfi,hjnezdkavp,neutralis,zsjbjzem,kklxxg
0,-0.197508,-0.194583,0.001301,0.001197,-0.015552,-0.100947,-0.00073,-0.119676,-0.00129,-0.033262,-0.030635,-0.030719,-0.01865,0.068619,0.009419,0.008742,0.001366,-0.453305,-0.115315,-0.103024,0.005332,-0.016383,0.010036,0.002125,0.006234,0.003209,-0.042833,0.004355,-0.009076,0.012288,0.00746,0.003942,-0.028183,-0.000799,-0.032075,0.009025,0.010159,0.008414,0.009233,-0.125654,...,-0.004087,0.00791,0.008426,0.004352,-0.000844,0.002008,0.008272,-0.000314,0.005296,0.003125,0.009254,0.000786,0.009384,0.004556,-0.002961,0.002154,0.006314,0.004412,0.007639,0.010982,-0.000292,-0.005009,0.004488,-0.001503,0.007098,-0.00036,-0.006265,0.004034,0.001188,0.011754,-0.005634,-0.002029,-0.00146,-0.002031,-0.00077,0.001846,0.003922,-0.001202,-0.002944,0.008745
1,-0.223024,-0.212498,0.003079,0.001623,-0.005674,-0.020867,0.001409,0.008488,0.008504,-0.018652,0.011198,-0.005436,-0.031429,0.090175,0.008207,0.000664,0.003583,-0.050877,-0.033047,-0.027487,0.002189,-0.000227,0.002122,0.000374,0.003929,-0.003211,-0.012641,0.007889,-0.008093,0.006735,0.000455,0.004331,0.009162,-0.002903,0.006198,-0.003183,0.004582,0.003557,0.005866,0.044615,...,-0.00096,-0.002598,0.005035,-0.00049,-0.000256,-8.8e-05,0.002105,-0.000297,0.008531,0.004334,-0.000834,0.003879,0.001585,-0.000983,-0.004417,-0.004305,0.001429,0.000396,0.002008,0.007511,0.004522,-0.003542,0.004566,-0.004576,-0.000169,0.005739,0.004145,-0.001456,-0.004538,0.005557,0.001762,-0.00099,0.000851,-0.005014,0.004527,0.006381,-0.00135,-0.003299,-0.001554,0.000182
2,0.145794,0.153494,-0.003871,0.00809,-5.2e-05,0.08221,0.000148,0.117179,0.032719,0.031202,0.05339,0.024992,0.118359,0.194169,0.006535,-0.011,-0.003223,0.231048,0.0942,0.091922,0.000794,-2.3e-05,-0.013753,-0.007185,-0.001725,-0.004124,0.02663,-0.002028,0.01754,-0.007932,-0.012186,-0.018455,0.037899,0.003454,0.042274,-0.006663,-0.013776,-0.012773,-0.008962,0.084988,...,-0.005221,-0.00399,-0.011073,-0.001268,-0.003794,-0.011387,-0.002945,-0.001467,-0.007417,-0.007655,-0.003758,-0.006489,-0.016061,-0.009844,-0.002885,-0.00281,-0.005695,-0.013575,-0.001871,-0.008346,-0.01364,-0.006254,-0.009183,-0.008264,0.000862,-0.007766,-0.005403,-0.011505,-0.004788,-0.004431,-0.004684,-0.009338,-0.000887,-0.006738,-0.003358,-0.003499,-0.00351,-0.003007,-0.008598,-0.01109
3,0.322465,0.329779,-0.021418,0.003077,-0.014404,0.132918,-0.007269,0.211319,0.076676,0.041789,0.083314,0.050014,0.294482,0.432406,0.008306,-0.012449,-0.011701,0.347906,0.17359,0.141102,-0.010647,0.005781,-0.031404,-0.00731,-0.010363,-0.000915,0.044059,-0.007321,0.039604,-0.032525,-0.026377,-0.03411,0.071252,0.002135,0.082455,-0.022058,-0.028423,-0.034067,-0.004349,0.210268,...,-0.023657,-0.018953,-0.030416,-0.002959,-0.007461,-0.020302,-0.017979,-0.010545,0.000122,-0.014805,-0.010744,-0.010197,-0.031572,-0.025964,-0.015701,-0.020028,-0.015865,-0.02386,-0.014046,-0.034813,-0.035493,-0.02262,-0.01991,-0.023497,-0.000131,-0.003846,-0.019967,-0.024151,-0.024789,-0.019044,-0.025722,-0.029234,-0.020569,-0.025918,-0.020156,-0.006273,-0.017744,-0.022354,-0.021367,-0.028619
4,0.022775,0.039611,-0.008026,-0.001075,-0.013713,-0.024259,-0.001469,-0.010599,0.014532,0.002225,0.006668,-0.006352,0.090166,0.205661,0.001378,-0.003805,-0.004376,-0.215761,-0.007935,-0.021615,-0.002844,-0.002102,-0.007521,-0.004876,-0.003861,0.005343,-0.001643,-0.000617,0.003714,-0.000354,-0.001117,-0.006329,0.010901,0.004544,0.01414,0.000654,-0.004676,-0.003382,-0.003294,-0.034679,...,-0.004363,-0.001634,-0.010711,-0.004054,0.001071,-0.006829,-0.003825,0.002377,-0.000451,-0.003399,-0.003976,-0.001691,-0.009728,-0.003778,-0.000382,-0.003269,-0.007276,-0.00201,-0.003083,-0.005234,-0.006337,-0.005393,-0.005265,-0.006619,0.00555,-0.005779,-0.004182,-0.009701,-0.003605,-0.001767,-0.008777,-0.013725,-0.006835,-0.005135,-0.005382,-0.003505,-0.000617,-0.008359,-0.006382,-0.005406


In [158]:
features.shape

(100, 4655)

In [0]:
features_sum = features.sum(axis = 0)

In [172]:
features_sum.head()

charli    0.730782
hebdo     0.844173
becam    -0.099315
well      0.073414
known    -0.095276
dtype: float64

In [0]:
features_final = pd.DataFrame(0, index = np.arange(100), columns = keys)

In [216]:
features_final.shape

(100, 4655)

In [217]:
len(features_final)

100

In [226]:
sentences = 0
counter = 0
for sentence in tokens:    
  for word in sentence:
    for column in features_final.columns:
      if word == column:
        features_final.loc[counter, word] = features_sum[word]
  counter = counter + 1
  print("counter ", counter)

counter  1
counter  2
counter  3
counter  4
counter  5
counter  6
counter  7
counter  8
counter  9
counter  10
counter  11
counter  12
counter  13
counter  14
counter  15
counter  16
counter  17
counter  18
counter  19
counter  20
counter  21
counter  22
counter  23
counter  24
counter  25
counter  26
counter  27
counter  28
counter  29
counter  30
counter  31
counter  32
counter  33
counter  34
counter  35
counter  36
counter  37
counter  38
counter  39
counter  40
counter  41
counter  42
counter  43
counter  44
counter  45
counter  46
counter  47
counter  48
counter  49
counter  50
counter  51
counter  52
counter  53
counter  54
counter  55
counter  56
counter  57
counter  58
counter  59
counter  60
counter  61
counter  62
counter  63
counter  64
counter  65
counter  66
counter  67
counter  68
counter  69
counter  70
counter  71
counter  72
counter  73
counter  74
counter  75
counter  76
counter  77
counter  78
counter  79
counter  80
counter  81
counter  82
counter  83
counter  84
c

In [227]:
len(features_final)

2079

In [228]:
features_final.head()

Unnamed: 0,charli,hebdo,becam,well,known,publish,muham,cartoon,year,last,tweet,befor,shoot,http,xaqocm,skjhneqcn,predict,#charliehebdo,massacr,will,dent,polit,class,complac,iota,account,mock,isi,leader,wish,good,health,best,uyxaykla,your,faith,strong,enough,cope,with,...,zhck,auidg,ypilwckuuw,smxqddlrx,ryegvipalu,accus,wxdk,icthapo,safe,pwmz,jzwm,ksgeeu,joel,saget,keuzi,coordin,firefight,wqrogo,paradis,jyhdgnp,umff,lvvjl,tatucfi,dbie,kfpu,jfcrkaz,awtf,sryk,familiar,cbsbhu,mond,aastvzd,#portedevincenn,#paristerrorattack,ltgmt,kwhptfi,hjnezdkavp,neutralis,zsjbjzem,kklxxg
0,0.730782,0.844173,-0.099315,0.073414,-0.095276,0.271755,-0.022437,0.691775,0.398086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.730782,0.844173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0786,0.452297,0.191105,1.643838,2.710899,0.105434,-0.067735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.077063,0.306857,0.587938,0.352961,-0.056401,-0.023326,-0.144163,-0.048807,-0.054251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.730782,0.844173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.710899,0.105434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030594,0.077454,-0.021169,0.207692,-0.046356,-0.13916,-0.167462,0.333534,-0.006999,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43404,-0.095769,-0.104588,-0.130119,-0.055831,0.627443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
features_final['labels'] = rumour_df.rumour

In [232]:
features_final.head()

Unnamed: 0,charli,hebdo,becam,well,known,publish,muham,cartoon,year,last,tweet,befor,shoot,http,xaqocm,skjhneqcn,predict,#charliehebdo,massacr,will,dent,polit,class,complac,iota,account,mock,isi,leader,wish,good,health,best,uyxaykla,your,faith,strong,enough,cope,with,...,auidg,ypilwckuuw,smxqddlrx,ryegvipalu,accus,wxdk,icthapo,safe,pwmz,jzwm,ksgeeu,joel,saget,keuzi,coordin,firefight,wqrogo,paradis,jyhdgnp,umff,lvvjl,tatucfi,dbie,kfpu,jfcrkaz,awtf,sryk,familiar,cbsbhu,mond,aastvzd,#portedevincenn,#paristerrorattack,ltgmt,kwhptfi,hjnezdkavp,neutralis,zsjbjzem,kklxxg,labels
0,0.730782,0.844173,-0.099315,0.073414,-0.095276,0.271755,-0.022437,0.691775,0.398086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.730782,0.844173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0786,0.452297,0.191105,1.643838,2.710899,0.105434,-0.067735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.077063,0.306857,0.587938,0.352961,-0.056401,-0.023326,-0.144163,-0.048807,-0.054251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.730782,0.844173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.710899,0.105434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030594,0.077454,-0.021169,0.207692,-0.046356,-0.13916,-0.167462,0.333534,-0.006999,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43404,-0.095769,-0.104588,-0.130119,-0.055831,0.627443,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Split Data into training and test

In [0]:
features_final = features_final.sample(frac = 1)

In [235]:
features_final.head()

Unnamed: 0,charli,hebdo,becam,well,known,publish,muham,cartoon,year,last,tweet,befor,shoot,http,xaqocm,skjhneqcn,predict,#charliehebdo,massacr,will,dent,polit,class,complac,iota,account,mock,isi,leader,wish,good,health,best,uyxaykla,your,faith,strong,enough,cope,with,...,auidg,ypilwckuuw,smxqddlrx,ryegvipalu,accus,wxdk,icthapo,safe,pwmz,jzwm,ksgeeu,joel,saget,keuzi,coordin,firefight,wqrogo,paradis,jyhdgnp,umff,lvvjl,tatucfi,dbie,kfpu,jfcrkaz,awtf,sryk,familiar,cbsbhu,mond,aastvzd,#portedevincenn,#paristerrorattack,ltgmt,kwhptfi,hjnezdkavp,neutralis,zsjbjzem,kklxxg,labels
143,,,,,,,,,,,,,,,,,,0.306857,0.587938,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
1431,,,,,,,,,,,,,,2.710899,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
878,0.730782,0.844173,,,,,,,,,,,,,,,,,,0.352961,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
2004,,,,,,,,,,,,,,2.710899,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
1320,,,,,,,,,,,,,,2.710899,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0


In [0]:
features_final = features_final.fillna(0)

In [239]:
features_final.head()

Unnamed: 0,charli,hebdo,becam,well,known,publish,muham,cartoon,year,last,tweet,befor,shoot,http,xaqocm,skjhneqcn,predict,#charliehebdo,massacr,will,dent,polit,class,complac,iota,account,mock,isi,leader,wish,good,health,best,uyxaykla,your,faith,strong,enough,cope,with,...,auidg,ypilwckuuw,smxqddlrx,ryegvipalu,accus,wxdk,icthapo,safe,pwmz,jzwm,ksgeeu,joel,saget,keuzi,coordin,firefight,wqrogo,paradis,jyhdgnp,umff,lvvjl,tatucfi,dbie,kfpu,jfcrkaz,awtf,sryk,familiar,cbsbhu,mond,aastvzd,#portedevincenn,#paristerrorattack,ltgmt,kwhptfi,hjnezdkavp,neutralis,zsjbjzem,kklxxg,labels
143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306857,0.587938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.710899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
878,0.730782,0.844173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.710899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.710899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [0]:
X = features_final.iloc[:,:-1]

In [0]:
y = features_final.iloc[:,-1]

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [244]:
y_train.head()

854     0
1454    0
1856    1
421     0
940     0
Name: labels, dtype: int64

In [0]:
## Classifier - Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

In [247]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)



In [0]:
y_pred = clf.predict(X_test)

In [0]:
## Accuracy

In [250]:
clf.score(X_test, y_test)

0.8533653846153846

In [0]:
## Evaluating Results

In [252]:
from sklearn.metrics import precision_recall_fscore_support
recall_logistic = precision_recall_fscore_support(y_test, y_pred)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix
confusion_mat = pd.DataFrame(confusion_matrix(y_test, y_pred))

print('Confusion matrix \n',confusion_mat)

              precision    recall  f1-score   support

           0       0.90      0.91      0.91       332
           1       0.64      0.62      0.63        84

    accuracy                           0.85       416
   macro avg       0.77      0.77      0.77       416
weighted avg       0.85      0.85      0.85       416

Confusion matrix 
      0   1
0  303  29
1   32  52
