In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import spacy
import pickle

In [2]:
# Read the labelled sentences (Text / Category columns) into a DataFrame.
df = pd.read_csv('complete_dataset.csv')

In [3]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Text,Category
0,This is ridiculous,1
1,or take them with us,1
2,Use ground attacks with the army... Higher cos...,1
3,"They are a part of our coalition, send troops ...",1
4,find another solution,1


In [4]:
# Size of the loaded frame as (rows, columns).
df.shape

(65052, 2)

In [5]:
# Count rows per label to check how balanced the two classes are.
df['Category'].value_counts()

0    32538
1    32514
Name: Category, dtype: int64

In [6]:
# Load the large English spaCy pipeline; its document vectors are the features used below.
nlp = spacy.load("en_core_web_lg")

In [7]:
# List the component names of the loaded pipeline, in processing order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
# Embed every sentence as its spaCy document vector.
# nlp.pipe streams the texts through the pipeline in batches and yields one Doc
# per input, in order, which avoids the per-call overhead of invoking nlp(text)
# separately for each row inside Series.apply.
# Wrapping the list in a Series aligned on df.index keeps one vector (ndarray)
# per cell, exactly like the row-wise version.
df['vector'] = pd.Series(
    [doc.vector for doc in nlp.pipe(df['Text'], batch_size=256)],
    index=df.index,
)

In [9]:
# Persist the loaded spaCy pipeline to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle to the garbage collector.
with open('nlp.pkl', 'wb') as nlp_file:
    pickle.dump(nlp, nlp_file)

In [10]:
# Round-trip check: read the pickled pipeline back from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source (here, the previous cell).
with open('nlp.pkl', 'rb') as nlp_file:
    loaded_nlp = pickle.load(nlp_file)

In [11]:
# Confirm the raw text column is a pandas Series.
type(df['Text'])

pandas.core.series.Series

In [12]:
# Confirm the new embedding column is also a pandas Series.
type(df['vector'])

pandas.core.series.Series

In [13]:
# First five rows again, now including the embedding column.
df.head(n=5)

Unnamed: 0,Text,Category,vector
0,This is ridiculous,1,"[1.12264, 2.5018601, -0.21636336, -3.7151334, ..."
1,or take them with us,1,"[-1.2643799, 0.7410859, -7.6058884, 0.41779795..."
2,Use ground attacks with the army... Higher cos...,1,"[-1.3343307, -0.05363458, -2.6162407, 1.452974..."
3,"They are a part of our coalition, send troops ...",1,"[-3.3885992, 0.2531046, -3.6640375, 1.9524308,..."
4,find another solution,1,"[1.1914134, 1.7413999, -2.0207133, 0.7276667, ..."


In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
# Features are the per-sentence embedding arrays, targets are the Category labels.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values,
    df['Category'],
    test_size=0.2,
    random_state=2022,
)

In [32]:
# Preview a single training example instead of echoing the whole object array:
# every element is itself a full embedding vector, so the bare repr is very long.
X_train[:1]

array([array([-1.2168788e+00, -8.6045992e-01, -6.8528599e-01, -6.6909999e-01,
               4.9404197e+00,  4.4304007e-01,  3.8795599e-01,  5.8449399e-01,
              -1.8247080e+00, -7.3320615e-01,  6.9135199e+00, -1.3738081e+00,
              -3.1837797e+00,  2.0647779e+00,  1.0709560e+00,  2.4060779e+00,
              -3.3615762e-01,  8.5167992e-01, -7.8976321e-01, -3.2210040e+00,
               4.2455801e-01, -1.2451686e+00, -3.3446300e+00, -9.6654207e-01,
              -3.1781318e+00,  3.0578160e-01,  7.8434002e-01, -2.5863805e+00,
              -3.5231996e-01,  1.7134335e+00,  2.0300198e-01, -7.1936792e-01,
              -2.4807200e+00, -1.0818081e+00, -1.7563000e+00,  3.8921280e+00,
              -9.1756999e-01,  2.7569757e+00,  3.9237480e+00,  2.5449159e+00,
              -1.7760119e-01,  7.6070398e-01, -1.2017620e+00, -1.4514040e+00,
              -3.9943397e+00,  4.3353801e+00,  4.1867418e+00, -4.6196380e+00,
               6.7263210e-01,  4.9385601e-01,  1.1036839e+00,  1

In [16]:
#X_test

In [17]:
# The training features are still a 1-D array (one embedding array per element),
# which is why they are stacked into a 2-D matrix further down.
X_train.shape

(52041,)

In [18]:
# Number of held-out examples (also a 1-D array of embedding arrays).
X_test.shape

(13011,)

In [19]:
# Stack the per-sentence vectors of each split into a dense 2-D matrix
# (one row per sentence) that the scaler and classifiers can consume.
X_train_2d, X_test_2d = (np.stack(split) for split in (X_train, X_test))

In [20]:
# Inspect the stacked training matrix (numpy abbreviates the middle with "...").
X_train_2d

array([[-1.2168788 , -0.8604599 , -0.685286  , ..., -0.66996604,
        -3.265548  ,  0.90116197],
       [-3.1610148 , -0.76352566, -1.395447  , ..., -0.8310771 ,
         0.46316713, -0.08288299],
       [-0.41141912,  1.0408256 , -1.7842009 , ...,  0.01384284,
        -2.0442202 ,  0.9948657 ],
       ...,
       [ 0.23951142, -0.92262745, -2.0379424 , ..., -1.6925001 ,
        -1.2248925 ,  2.2218273 ],
       [-0.9130081 , -0.86750877, -3.7864664 , ..., -0.18380515,
        -3.2174647 , -0.45928752],
       [-1.6393657 , -0.49172163, -3.7761602 , ...,  0.35973835,
        -2.8442519 ,  1.5646001 ]], dtype=float32)

In [21]:
# Baseline model: multinomial Naive Bayes with default hyper-parameters.
# It expects non-negative feature values, hence the min-max scaling in the next cell.
clf = MultinomialNB()

In [22]:
# Rescale each embedding dimension to [0, 1] using statistics learned from the
# training split only, then apply that same transform to the test split.
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit(X_train_2d).transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

# Fit the Naive Bayes baseline on the scaled training embeddings.
clf.fit(scaled_train_embed, y_train)

MultinomialNB()

In [23]:
# Naive Bayes predictions for the held-out split.
y_pred = clf.predict(scaled_test_embed)

In [24]:
# Per-class precision / recall / F1 for the Naive Bayes baseline.
nb_report = classification_report(y_test, y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.77      0.76      0.77      6451
           1       0.77      0.78      0.77      6560

    accuracy                           0.77     13011
   macro avg       0.77      0.77      0.77     13011
weighted avg       0.77      0.77      0.77     13011



In [25]:
# Confusion matrix for Naive Bayes (rows: true label, columns: predicted label).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[4890 1561]
 [1440 5120]]


In [26]:
# Overall Naive Bayes accuracy on the test split, reported as a percentage.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc * 100)

76.9349012374145


In [27]:
# Second model: 5-nearest-neighbours with Euclidean distance, trained on the
# same min-max-scaled embeddings as the Naive Bayes baseline.
dlf = KNeighborsClassifier(metric='euclidean', n_neighbors=5)
dlf.fit(scaled_train_embed, y_train)

KNeighborsClassifier(metric='euclidean')

In [28]:
# KNN predictions for the held-out split.
# Note: this rebinds y_pred, replacing the Naive Bayes predictions from earlier.
y_pred = dlf.predict(scaled_test_embed)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [29]:
# Per-class precision / recall / F1 for the KNN model.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      6451
           1       0.96      0.92      0.94      6560

    accuracy                           0.94     13011
   macro avg       0.94      0.94      0.94     13011
weighted avg       0.94      0.94      0.94     13011



In [30]:
# Confusion matrix for KNN (rows: true label, columns: predicted label).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[6224  227]
 [ 500 6060]]


In [31]:
# Overall KNN accuracy on the test split, reported as a percentage.
knn_accuracy = accuracy_score(y_test, y_pred)
print(knn_accuracy * 100)

94.41242025978018


In [33]:
import pickle

In [33]:
# Persist the trained KNN classifier.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle to the garbage collector.
with open('gm_bhv_thy.pkl', 'wb') as model_file:
    pickle.dump(dlf, model_file)

In [34]:
# Reload the persisted KNN classifier, as a serving process would.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source (here, the previous cell).
with open('gm_bhv_thy.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [35]:
# Re-create the min-max scaler on the same training matrix used earlier, so the
# object persisted below carries the same per-dimension ranges the classifiers
# were trained with. (This repeats the earlier scaling step on identical inputs.)
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit(X_train_2d).transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

In [36]:
# Persist the fitted scaler next to the model: inference must apply the same
# scaling as training.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle to the garbage collector.
with open('min_max_gm.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [37]:
# Reload the persisted scaler, as a serving process would.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by a trusted source (here, the previous cell).
with open('min_max_gm.pkl', 'rb') as scaler_file:
    load_min_max = pickle.load(scaler_file)

In [38]:
# Push one raw sentence through the same preprocessing as the training data:
# spaCy document vector -> 2-D array with a single row -> reloaded min-max scaler.
text = "attack belgium"
sr = pd.Series([text])
vec = sr.apply(lambda sentence: nlp(sentence).vector)
vec_2d = np.stack(vec)
vec_scaled = load_min_max.transform(vec_2d)

In [39]:
# Classify the single scaled example with the reloaded KNN model and display the label.
pred = loaded_model.predict(vec_scaled)
pred

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([0], dtype=int64)