In [1]:
import os

In [2]:
os.chdir("/content/drive/MyDrive/nlp")

In [3]:
import pandas as pd
import numpy as np
import re
import spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_csv("Tweets.csv")

In [6]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [7]:
df1 = df[["text","sentiment"]]

In [8]:
df1 = pd.DataFrame(df1)

In [9]:
df1.isnull().sum()

Unnamed: 0,0
text,1
sentiment,0


In [10]:
df1.dropna(inplace=True)

In [11]:
df1.isnull().sum()

Unnamed: 0,0
text,0
sentiment,0


In [12]:
def remove_s_chr_lcase(text):
  text = str(text)
  text = text.lower()
  return re.sub(r'[^a-zA-Z0-9\s]','',text)

In [13]:
df1["text"]= df1["text"].apply(remove_s_chr_lcase)


In [14]:
df1

Unnamed: 0,text,sentiment
0,id have responded if i were going,neutral
1,sooo sad i will miss you here in san diego,negative
2,my boss is bullying me,negative
3,what interview leave me alone,negative
4,sons of why couldnt they put them on the rel...,negative
...,...,...
27476,wish we could come see u on denver husband l...,negative
27477,ive wondered about rake to the client has ma...,negative
27478,yay good for both of you enjoy the break you...,positive
27479,but it was worth it,positive


In [15]:
def tokenization(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [16]:
df1["text"] = df1["text"].apply(tokenization)

In [17]:
df1

Unnamed: 0,text,sentiment
0,"[ , i, d, have, responded, if, i, were, going]",neutral
1,"[ , sooo, sad, i, will, miss, you, here, in, s...",negative
2,"[my, boss, is, bullying, me]",negative
3,"[ , what, interview, leave, me, alone]",negative
4,"[ , sons, of, , why, could, nt, they, put, th...",negative
...,...,...
27476,"[ , wish, we, could, come, see, u, on, denver,...",negative
27477,"[ , i, ve, wondered, about, rake, to, , the, ...",negative
27478,"[ , yay, good, for, both, of, you, enjoy, the,...",positive
27479,"[ , but, it, was, worth, it, ]",positive


In [18]:
np.unique(df1["sentiment"])

array(['negative', 'neutral', 'positive'], dtype=object)

In [19]:
df1["labels"] = df1.sentiment.map({
    'negative' : 0,
    'positive' : 1,
    'neutral' : 2
})

In [20]:
from gensim.models import Word2Vec
from sklearn.neural_network import BernoulliRBM
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, recall_score, precision_score

In [21]:

corpus = df1["text"].tolist()
word2vec = Word2Vec(sentences=corpus, vector_size=150, window=5, min_count=2, workers=4)

In [22]:
def get_document_embedding(document, model, vector_size):
    word_vectors = [model.wv[word] for word in document if word in model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(vector_size)

In [23]:
vector_size = word2vec.vector_size
df1['embedding'] = df1["text"].apply(lambda x: get_document_embedding(x, word2vec, vector_size))

In [24]:
embeddings = np.array(df1['embedding'].tolist())

In [25]:
df1

Unnamed: 0,text,sentiment,labels,embedding
0,"[ , i, d, have, responded, if, i, were, going]",neutral,2,"[0.34887418, -0.18766564, -0.089167856, -0.205..."
1,"[ , sooo, sad, i, will, miss, you, here, in, s...",negative,0,"[0.16367228, -0.26197803, -0.08938636, -0.0007..."
2,"[my, boss, is, bullying, me]",negative,0,"[-0.022151, -0.21027364, -0.00802239, -0.29809..."
3,"[ , what, interview, leave, me, alone]",negative,0,"[-0.0068587265, -0.0029722154, 0.049235445, 0...."
4,"[ , sons, of, , why, could, nt, they, put, th...",negative,0,"[-0.09647633, -0.15030925, -0.02046429, -0.079..."
...,...,...,...,...
27476,"[ , wish, we, could, come, see, u, on, denver,...",negative,0,"[0.058767714, -0.12476827, 0.017202621, -0.176..."
27477,"[ , i, ve, wondered, about, rake, to, , the, ...",negative,0,"[0.08052339, -0.12864532, -0.0036456948, -0.16..."
27478,"[ , yay, good, for, both, of, you, enjoy, the,...",positive,1,"[0.17066072, -0.15835725, -0.034231633, -0.019..."
27479,"[ , but, it, was, worth, it, ]",positive,1,"[0.36979705, -0.26786336, -0.025910811, -0.052..."


In [26]:
labels = df1['labels'].values

In [47]:

rbm_layers = []


embeddings = np.array(embeddings)

n_layers = 5
n_components_list = [150, 128, 100, 84, 64]

#Training of the first RBM on the original embeddings
rbm = BernoulliRBM(n_components=n_components_list[0], learning_rate=0.01, n_iter=10, random_state=42)
rbm.fit(embeddings)
rbm_layers.append(rbm)


transformed_data = rbm.transform(embeddings)


In [48]:
for i in range(1, n_layers):
    rbm = BernoulliRBM(n_components=n_components_list[i], learning_rate=0.01, n_iter=10, random_state=42)
    rbm.fit(transformed_data)
    rbm_layers.append(rbm)
    transformed_data = rbm.transform(transformed_data)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(transformed_data,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=labels)

In [50]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)


y_pred = sgd.predict(X_test)
print("accuracy:", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred, average='weighted'))
print("recall:", recall_score(y_test, y_pred, average = 'weighted'))


accuracy: 0.4044759825327511
precision: 0.16360082044583435
recall: 0.4044759825327511


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
