Text Classification Using Spacy Word Vectors

In [15]:
import pandas as pd

df = pd.read_csv('Fake_Real_Data.csv')
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [16]:
df.shape

(9900, 2)

In [17]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4900


In [18]:
df['label_num'] = df['label'].map({'Fake': 0, 'Real': 1})
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [19]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [20]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [21]:
doc = nlp('Top Trump Surrogate BRUTALLY Stabs Him In The')
doc.vector
doc.vector.shape

(300,)

In [22]:
df['vector'] = df['Text'].apply(lambda x : nlp(x).vector)

In [23]:
df.head()

Unnamed: 0,Text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.103623025, 0.17802684, -0.11873861, -0.034..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-0.0063406364, 0.16712041, -0.06661373, 0.017..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-0.122753024, 0.17192385, -0.024732638, -0.06..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-0.027337318, 0.12501417, -0.0073965387, -0.0..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-0.032708026, 0.093958504, -0.03287002, -0.00..."


In [24]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(
    df.vector,
    df.label_num,
    test_size=0.2,
    random_state=2000
)

In [25]:
x_train.values

array([array([-7.65876994e-02,  1.77428588e-01, -4.88357656e-02, -7.22645372e-02,
               6.04429133e-02, -3.59369181e-02, -2.05820259e-02, -5.53955548e-02,
              -5.56434393e-02,  2.08915901e+00, -2.18332916e-01, -1.12974849e-02,
               8.58743861e-02, -3.98640968e-02, -9.12394673e-02, -9.84727889e-02,
              -4.07447107e-02,  1.02202189e+00, -1.24945156e-01, -5.58293797e-03,
               2.97407638e-02,  6.22152258e-03,  3.19153257e-02, -8.47521648e-02,
              -3.23614776e-02,  2.23647663e-03, -9.07551497e-02, -2.33152043e-02,
               9.21829324e-03, -4.59568202e-02, -2.27805022e-02,  6.60583302e-02,
              -2.23477110e-02,  4.55297679e-02,  8.17892775e-02, -1.60317961e-02,
              -7.08940811e-03,  4.12391946e-02, -3.30837891e-02, -6.11906461e-02,
              -2.58280300e-02,  5.28618917e-02,  6.56345859e-02, -8.06140825e-02,
               3.00921034e-02, -1.87858436e-02, -9.54923034e-02, -8.39286000e-02,
               2

In [26]:
x_train.shape

(7920,)

In [27]:
x_test.shape

(1980,)

In [28]:
x_train

Unnamed: 0,vector
3736,"[-0.0765877, 0.17742859, -0.048835766, -0.0722..."
6033,"[-0.030470975, 0.08497983, -0.076572016, -0.02..."
4208,"[-0.04189865, 0.19362208, -0.10271055, -0.0556..."
9753,"[-0.06950331, 0.11599843, -0.09796946, -0.0303..."
8689,"[-0.0718149, 0.1437282, -0.0886335, -0.0352442..."
...,...
9628,"[-0.054916013, 0.11436224, -0.1336647, -0.0420..."
4380,"[-0.03877807, 0.14102478, -0.015707511, -0.011..."
1590,"[-0.07872418, 0.153053, -0.07217284, -0.074968..."
4045,"[-0.09021759, 0.14582816, -0.13328002, -0.0104..."


In [29]:
#2D Numpy array conversion
import numpy as np
x_train_2d = np.stack(x_train)
x_test_2d = np.stack(x_test)

In [30]:
x_train_2d

array([[-0.0765877 ,  0.17742859, -0.04883577, ..., -0.06219687,
        -0.000501  ,  0.10848814],
       [-0.03047097,  0.08497983, -0.07657202, ..., -0.06150036,
        -0.01688255,  0.07575583],
       [-0.04189865,  0.19362208, -0.10271055, ..., -0.04151374,
        -0.00104723,  0.06791484],
       ...,
       [-0.07872418,  0.153053  , -0.07217284, ..., -0.02312736,
        -0.03052368,  0.03034106],
       [-0.09021759,  0.14582816, -0.13328002, ..., -0.06786332,
         0.04008128,  0.09408262],
       [-0.03163753,  0.16489455, -0.13447315, ..., -0.04724785,
         0.0320837 ,  0.02996648]], dtype=float32)

In [31]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(x_train_2d,y_train)

#Error shows -ve values, to overcome it use MINMAXSCALER

ValueError: Negative values in data passed to MultinomialNB (input X).

In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_vectors = scaler.fit_transform(x_train_2d)
scaled_test_vectors = scaler.transform(x_test_2d)

clf = MultinomialNB()
clf.fit(scaled_train_vectors,y_train)


In [33]:
from sklearn.metrics import classification_report

y_pred = clf.predict(scaled_test_vectors)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94       956
           1       0.96      0.93      0.94      1024

    accuracy                           0.94      1980
   macro avg       0.94      0.94      0.94      1980
weighted avg       0.94      0.94      0.94      1980



In [34]:
from sklearn.neighbors import KNeighborsClassifier

#1. Creating a KNN Model Object

clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')

#Fit with all_train_embeddings and y_train
clf.fit(x_train_2d, y_train)

y_pred = clf.predict(scaled_test_vectors)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89       956
           1       0.90      0.89      0.90      1024

    accuracy                           0.89      1980
   macro avg       0.89      0.89      0.89      1980
weighted avg       0.89      0.89      0.89      1980

