<a href="https://colab.research.google.com/github/saddarudin/google_colab/blob/main/nlp_word_vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy

# Word vectors occupy a lot of space, so the small en_core_web_sm model does not
# include them. Use the large (or medium) English model instead.
try:
  nlp = spacy.load('en_core_web_lg')
except OSError:
  # spacy.load raises OSError when the model package is not installed.
  # Download it once (roughly 400 MB) instead of re-downloading on every run.
  from spacy.cli import download
  download('en_core_web_lg')
  # NOTE: if loading still fails right after the download, restart the
  # runtime/kernel and run this cell again.
  nlp = spacy.load('en_core_web_lg')

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# The last word is made up, to show how an unknown word is reported.
doc = nlp('dog cat banana saddarp')

# For every token: does it have a vector, and is it out-of-vocabulary?
for tok in doc:
  print(tok.text, 'Vector:', tok.has_vector, "OOV:", tok.is_oov)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
saddarp Vector: False OOV: True


In [3]:
# Shape of the word vector attached to the first token of `doc`.
doc[0].vector.shape

(300,)

In [4]:
# Run the pipeline on the reference word that the next cell compares against.
base_token = nlp('bread')
# Shape of the vector exposed by that result.
base_token.vector.shape

(300,)

In [5]:
doc = nlp('sandwich burger car tiger human wheat')

# Print how similar each word is to the reference word held in `base_token`.
for word in doc:
  print(f"{word.text} <-> {base_token.text}: ",word.similarity(base_token))

sandwich <-> bread:  0.6874560117721558
burger <-> bread:  0.544037401676178
car <-> bread:  0.16441145539283752
tiger <-> bread:  0.14492353796958923
human <-> bread:  0.21103660762310028
wheat <-> bread:  0.6572456359863281


In [6]:
def get_similarity(base_word, words_to_compare, nlp_pipeline=None):
  """Print the similarity of every token in `words_to_compare` to `base_word`.

  Args:
    base_word: Text used as the reference for the comparison.
    words_to_compare: Text whose tokens are each compared with the reference.
    nlp_pipeline: Optional callable that turns text into an object exposing
      `.text` and iterating over tokens that expose `.text` and
      `.similarity(...)`. Defaults to the notebook-level `nlp` object, so
      existing two-argument calls behave exactly as before.

  Returns:
    None. One line per token is written to standard output.
  """
  if nlp_pipeline is None:
    # Fall back to the model loaded at notebook level.
    nlp_pipeline = nlp
  base_token = nlp_pipeline(base_word)
  doc = nlp_pipeline(words_to_compare)
  for token in doc:
    print(f"{token.text} <-> {base_token.text}", token.similarity(base_token))

In [7]:
# Compare several words against "iphone"; the list includes "iphone" itself.
get_similarity("iphone","apple samsung iphone dog tiger")

apple <-> iphone 0.6339781284332275
samsung <-> iphone 0.6678677797317505
iphone <-> iphone 0.9999999403953552
dog <-> iphone 0.1743103712797165
tiger <-> iphone 0.22750601172447205


In [8]:
# Fetch the stored vector of each word involved in the analogy.
king, man, woman, queen = (
  nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic: take "man" away from "king", then add "woman".
result = (king - man) + woman

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare king - man + woman with the vector for "queen".
# Each vector is wrapped in a list so it is passed as a single-row 2D input.
cosine_similarity([result],[queen])

array([[0.7880844]], dtype=float32)

## News Classification using Word Vectors

In [10]:
import pandas as pd

# Load the news dataset; the path is relative to the current working directory.
df = pd.read_csv('Fake_Real_Data.csv')
print(df.shape)  # (number of rows, number of columns)
df.head()

(9900, 2)


Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [11]:
# Number of rows per label value, to check the class balance.
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4900


In [12]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB


In [13]:
# Binary target: 1 where the label is 'Real', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda.
df['is_real'] = (df.label == 'Real').astype(int)
df.head()

Unnamed: 0,Text,label,is_real
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [14]:
import spacy

# Same model name as loaded at the top of the notebook; presumably repeated so
# this section can be run independently -- TODO confirm.
nlp = spacy.load('en_core_web_lg')

# Sanity check: a whole document also exposes a single vector.
doc = nlp('Top Trump Surrogate BRUTALLY Stabs')
doc.vector.shape

(300,)

In [15]:
# Represent every article by its document vector.
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp(text) once per row through .apply.
df['vector'] = [article_doc.vector for article_doc in nlp.pipe(df.Text)]
df.shape

(9900, 4)

In [16]:
# Preview the frame, now including the `vector` column.
df.head()

Unnamed: 0,Text,label,is_real,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.103623025, 0.17802684, -0.11873861, -0.034..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-0.0063406364, 0.16712041, -0.06661373, 0.017..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-0.122753024, 0.17192385, -0.024732638, -0.06..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-0.027337318, 0.12501417, -0.0073965387, -0.0..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-0.032708026, 0.093958504, -0.03287002, -0.00..."


In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target and with a
# fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
  df.vector.values,
  df.is_real,
  test_size=0.2,
  random_state=2022,
  stratify=df.is_real,
)
x_train.shape, x_test.shape

((7920,), (1980,))

In [20]:
# Peek at the first five training entries.
x_train[:5]

array([array([-4.11531478e-02,  1.95886418e-01, -5.17092720e-02, -6.36897907e-02,
               2.30457615e-02,  3.17761563e-02, -9.37498268e-03, -5.65892719e-02,
              -2.26258710e-02,  2.10209370e+00, -1.86347514e-01,  6.81234822e-02,
               2.77848691e-02, -7.88347498e-02, -1.29845247e-01, -7.90607184e-02,
              -1.30447268e-01,  8.41746509e-01, -1.50108680e-01, -3.83838899e-02,
               5.12799360e-02, -3.08340546e-02, -3.10126673e-02, -2.61848886e-02,
               2.58135237e-02, -2.44493037e-02, -6.20129220e-02, -5.38262650e-02,
              -3.33169736e-02, -1.00060478e-01, -3.79262790e-02,  8.39029625e-02,
              -3.70464996e-02,  7.99085945e-02,  1.09126031e-01, -7.98731819e-02,
              -4.30494808e-02,  8.72264430e-03, -6.04222976e-02, -9.00040288e-03,
               3.18366736e-02,  2.56469827e-02,  2.20669042e-02, -9.74397138e-02,
               4.74717952e-02,  7.52550811e-02, -1.54887304e-01, -1.07479738e-02,
               5

## `x_train` is a 1-D array whose elements are themselves arrays, so we convert it into a 2-D array

In [21]:
import numpy as np

# Stack the per-row vectors of each split into a single 2-D array.
x_train_2d, x_test_2d = (np.stack(split) for split in (x_train, x_test))

In [22]:
# Display the stacked training array.
x_train_2d

array([[-0.04115315,  0.19588642, -0.05170927, ..., -0.03505493,
         0.01995586,  0.09486677],
       [-0.02449099,  0.16324393, -0.14633366, ..., -0.06739594,
         0.02454064,  0.12155674],
       [-0.06639803,  0.16070557, -0.12050964, ..., -0.06479696,
         0.04207559,  0.0515073 ],
       ...,
       [-0.02249848,  0.11863155, -0.04971174, ..., -0.02883008,
        -0.00270765,  0.08681635],
       [-0.07665006,  0.16303538, -0.06732539, ..., -0.07292987,
         0.05398569,  0.07514985],
       [-0.03592037,  0.18351656,  0.03137523, ..., -0.03847933,
         0.01902771,  0.09889441]], dtype=float32)

## The vectors contain negative values, which MultinomialNB does not accept, so we rescale the features with MinMaxScaler

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training rows only, then apply that
# same mapping to both splits.
scaler = MinMaxScaler().fit(x_train_2d)
x_train_2d, x_test_2d = scaler.transform(x_train_2d), scaler.transform(x_test_2d)

x_train_2d[:5]

array([[0.59762204, 0.6636677 , 0.37445432, ..., 0.68241256, 0.61148655,
        0.580585  ],
       [0.65553033, 0.5305733 , 0.13645762, ..., 0.5653125 , 0.6278052 ,
        0.6697886 ],
       [0.5098851 , 0.52022356, 0.20140952, ..., 0.5747228 , 0.6902174 ,
        0.4356683 ],
       [0.5643139 , 0.3488251 , 0.4368839 , ..., 0.62658596, 0.625759  ,
        0.5188637 ],
       [0.56687915, 0.5117494 , 0.3420871 , ..., 0.7731495 , 0.44057328,
        0.4400471 ]], dtype=float32)

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the scaled training features.
model = MultinomialNB()
model.fit(x_train_2d,y_train)

In [25]:
from sklearn.metrics import classification_report
# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = model.predict(x_test_2d)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1000
           1       0.95      0.93      0.94       980

    accuracy                           0.94      1980
   macro avg       0.94      0.94      0.94      1980
weighted avg       0.94      0.94      0.94      1980



In [29]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier: 5 neighbours, Euclidean distance.
# Rebinds `model`, replacing the Naive Bayes classifier fitted earlier.
model = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
model.fit(x_train_2d,y_train)

In [30]:
# Evaluate the KNN model on the same held-out split.
# `classification_report` is imported in the earlier evaluation cell.
y_pred = model.predict(x_test_2d)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1000
           1       0.96      0.99      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

