In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_lg")

In [5]:
doc = nlp("laptop mobile care skhduddc")

for token in doc:
    print(token.text,"| Vector:",token.has_vector,"| OOV:",token.is_oov)

laptop | Vector: True | OOV: False
mobile | Vector: True | OOV: False
care | Vector: True | OOV: False
skhduddc | Vector: False | OOV: True


In [6]:
doc[0].vector

array([ 0.41813  ,  3.3476   , -2.5517   ,  0.99622  , -1.9858   ,
        0.57456  ,  0.12014  ,  3.4147   , -3.1582   , -2.0395   ,
        2.8591   ,  3.4292   , -4.9617   ,  3.4714   ,  0.095142 ,
        0.086547 ,  2.6534   , -1.6012   ,  0.18721  ,  0.35514  ,
        0.99368  , -0.16749  ,  2.0128   , -2.7552   , -3.8319   ,
       -1.8095   , -3.021    , -4.3578   , -1.0525   ,  1.4433   ,
        0.61139  , -1.0121   , -2.0487   , -0.38338  ,  2.5742   ,
        2.203    ,  0.70816  ,  0.20291  ,  2.635    ,  2.6158   ,
        2.753    ,  0.88197  ,  1.2284   ,  1.989    ,  1.9973   ,
       -1.4213   , -0.59834  , -1.2853   ,  0.39549  , -5.8387   ,
        2.8815   , -1.0618   ,  1.0956   , -1.8544   ,  1.0944   ,
       -0.89498  ,  0.91791  ,  4.1216   , -1.9256   ,  0.30054  ,
        1.4138   ,  3.581    , -3.3205   , -2.1967   , -2.2743   ,
       -0.71967  , -1.5466   ,  0.30052  , -0.53033  ,  2.7642   ,
        0.85377  , -1.6371   ,  1.161    , -0.72424  ,  2.0824

In [7]:
doc[0].vector.shape

(300,)

In [8]:
# Lets compare words and see how similar vector they have:


In [17]:
base_token = nlp("bread")

doc = nlp("bread sandwich burger car tiger human wheat")

In [18]:
for token in doc:
    print(f"{token.text} <-> {base_token.text}:",round(token.similarity(base_token)*100,2),"%" )

bread <-> bread: 100.0 %
sandwich <-> bread: 63.41 %
burger <-> bread: 47.52 %
car <-> bread: 6.45 %
tiger <-> bread: 4.76 %
human <-> bread: 21.51 %
wheat <-> bread: 61.5 %


In [19]:
# Lets make a function of it:

In [20]:
def print_similarity(base_word , text_to_compare):
    base_token = nlp(base_word)
    doc = nlp(text_to_compare)
    for token in doc:
       print(f"{token.text} <-> {base_token.text}:",round(token.similarity(base_token)*100,2),"%" ) 

In [21]:
print_similarity("school","education is the key to success")

education <-> school: 59.91 %
is <-> school: 3.89 %
the <-> school: 24.02 %
key <-> school: 6.65 %
to <-> school: 20.51 %
success <-> school: 26.26 %


# Text Classification using word to vector(word2vec)

In [24]:
import pandas as pd

df = pd.read_csv("WELFake_Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [25]:
df.shape

(72134, 4)

In [27]:
#there is a lot of data , its wrong to loose data but the laptop will not support so lets remove some rows
df = df.head(6000)

In [28]:
df.shape

(6000, 4)

In [29]:
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [33]:
df.drop(columns=["Unnamed: 0","title"],inplace = True)

In [34]:
df.isnull().sum()

text     4
label    0
dtype: int64

In [35]:
df.dropna(inplace=True)

In [36]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [37]:
#lets see if data is still balanced or not

In [38]:
df["label"].value_counts()

label
1    3163
0    2833
Name: count, dtype: int64

In [39]:
# not a big difference so we can proceed

In [41]:
# now lets store the vector of text in a seperate column for each row:

In [42]:
df["vector"] = df["text"].apply(lambda x:nlp(x).vector)

In [43]:
df.head()

Unnamed: 0,text,label,vector
0,No comment is expected from Barack Obama Membe...,1,"[-1.5793014, 0.7894388, -2.5035744, 0.26397508..."
1,Did they post their votes for Hillary already?,1,"[-2.2616668, 1.367042, -3.698373, 0.49990097, ..."
2,"Now, most of the demonstrators gathered last ...",1,"[-2.576863, 0.8110364, -2.5591288, 0.40244135,..."
3,A dozen politically active pastors came here f...,0,"[-2.0481982, 0.7980047, -1.7601681, 0.11690843..."
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,"[-2.2817757, -0.15093851, -1.1325141, 0.823695..."


In [44]:
#splitting train test:

In [96]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df.vector.values,df.label,test_size=0.2,random_state=40)

In [97]:
x_train[:4]

array([array([-2.5380716e+00,  3.5843906e-01, -2.1721539e+00,  1.4499247e-01,
               4.4948459e+00,  6.3108897e-01,  4.7681481e-01,  4.4533920e+00,
               2.1116997e-01, -5.7447135e-01,  4.9453077e+00,  1.5436196e+00,
              -3.4138451e+00,  1.2017454e+00,  9.1348344e-01,  1.5230542e+00,
               4.6008009e-01, -1.0280683e+00, -1.5869446e+00, -1.9186021e+00,
               1.1801000e+00, -9.5020652e-01, -1.0111076e+00, -4.5993611e-01,
               2.5015053e-01, -1.5262598e+00, -2.9681451e+00, -4.5928067e-01,
               6.7833033e-03,  6.2573904e-01,  1.1081455e+00, -3.6136711e-01,
              -9.1467416e-01, -1.8257841e+00, -2.5450792e+00, -8.1957471e-01,
              -8.8443500e-01,  6.4205623e-01,  8.2983959e-01,  3.5554629e-02,
               3.2357943e-01,  2.8641826e-01, -3.3029708e-01,  3.4535849e-01,
              -1.3863713e+00,  5.1829332e-01,  3.9380465e-02, -2.3178363e+00,
              -8.7586397e-01,  2.3333659e+00, -8.3882964e-01,  2

In [98]:
# we can see the train and test data is array of array as it is vectorized , so we have to convert it into 2d array

In [99]:
import numpy as np

x_train_2d = np.stack(x_train)
x_train_2d

array([[-2.5380716 ,  0.35843906, -2.172154  , ..., -1.1476046 ,
        -2.3391514 ,  1.3202718 ],
       [-2.1466196 ,  0.74471235, -2.658473  , ..., -1.2888505 ,
        -2.4259644 ,  1.170604  ],
       [-1.4244448 ,  0.5045124 , -1.5333096 , ..., -1.3918691 ,
        -1.4125754 ,  0.733306  ],
       ...,
       [-2.3427768 ,  0.27507865, -2.1616166 , ..., -1.6276929 ,
        -1.8335084 ,  0.86968166],
       [-3.1234353 , -0.36974844, -1.4645665 , ..., -2.5824864 ,
        -0.66499263, -0.08303722],
       [-2.0425143 , -0.39562783, -1.6898443 , ..., -1.3160071 ,
        -1.7210765 ,  0.8515958 ]], dtype=float32)

In [100]:
# same for test:
x_test_2d = np.stack(x_test)

# scaling

In [101]:
# as the array contains negetive values we have to scale the array:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
x_train_scaled = sc.fit_transform(x_train_2d)
x_test_scaled = sc.fit_transform(x_test_2d)

# Model Building:


In [102]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_scaled,y_train)

In [103]:
# evaluation:

In [104]:
y_pred = model.predict(x_test_scaled)

In [105]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.73      0.77       553
           1       0.79      0.85      0.82       647

    accuracy                           0.80      1200
   macro avg       0.80      0.79      0.79      1200
weighted avg       0.80      0.80      0.79      1200



In [108]:
# Testing on real world news:

In [152]:
news = "pakistan became the first country to land on mars says BBC"

In [153]:
news_vector = ([np.stack(nlp(news).vector)])

prediction = model.predict(news_vector)
if prediction==1:
    print("Real")
else:
    print("Fake")

Fake


In [154]:
news2 = '''
    conspired that if  cops started losing people,  then  there will be a state of emergency. He speculated that one of two things would happen,  a big-ass [R s?????] war,  or  ni**ers, they are going to start backin  up. We are already getting killed out here so what the f**k we got to lose? Sunshine could be heard saying,  Yep, that s true. That s so f**king true. He said,  We need to turn the tables on them. Our kids are getting shot out here. Somebody needs to become a sacrifice on their side.He said,  Everybody ain t down for that s**t, or whatever, but like I say, everybody has a different position of war.  He continued,  Because they don t give a f**k anyway.  He said again,  We might as well utilized them for that s**t and turn the tables on these n**ers. He said, that way  we can start lookin  like we ain t havin  that many casualties, and there can be more causalities on their side instead of ours. They are out their killing black people, black lives don t matter, that s what those mother f**kers   so we got to make it matter to them. Find a mother f**ker that is alone. Snap his ass, and then f***in hang him from a damn tree. Take a picture of it and then send it to the mother f**kers. We  just need one example,  and  then people will start watchin .  This will turn the tables on s**t, he said. He said this will start  a trickle-down effect.  He said that when one white person is hung and then they are just  flat-hanging,  that will start the  trickle-down effect.  He continued,  Black people are good at starting trends. He said that was how  to get the upper-hand. Another black man spoke up saying they needed to kill  cops that are killing us. The first black male said,  That will be the best method right there. Breitbart Texas previously reported how Sunshine was upset when  racist white people  infiltrated and disrupted one of her conference calls. She subsequently released the phone number of one of the infiltrators. The veteran immediately started receiving threatening calls.One of the #F***YoFlag movement supporters allegedly told a veteran who infiltrated their publicly posted conference call,  We are going to rape and gut your pregnant wife, and your f***ing piece of sh*t unborn creature will be hung from a tree. Breitbart Texas previously encountered Sunshine at a Sandra Bland protest at the Waller County Jail in Texas, where she said all white people should be killed. She told journalists and photographers,  You see this nappy-ass hair on my head?   That means I am one of those more militant Negroes.  She said she was at the protest because  these redneck mother-f**kers murdered Sandra Bland because she had nappy hair like me. #FYF911 black radicals say they will be holding the  imperial powers  that are actually responsible for the terrorist attacks on September 11th accountable on that day, as reported by Breitbart Texas. There are several websites and Twitter handles for the movement. Palmetto Star  describes himself as one of the head organizers. He said in a YouTube video that supporters will be burning their symbols of  the illusion of their superiority,  their  false white supremacy,  like the American flag, the British flag, police uniforms, and Ku Klux Klan hoods.Sierra McGrone or  Nocturnus Libertus  posted,  you too can help a young Afrikan clean their a** with the rag of oppression.  She posted two photos, one that appears to be herself, and a photo of a black man, wiping their naked butts with the American flag.For entire story: Breitbart News
'''

In [158]:
news_vector = ([np.stack(nlp(news2).vector)])

prediction = model.predict(sc.transform(news_vector))
if prediction==1:
    print("Real")
else:
    print("Fake")

Real
