### Install Dependencies


In [None]:
!pip install gensim
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Read the dataset; header=None because the first CSV row is already data
csv_path = 'twitter_training.csv'
data = pd.read_csv(csv_path, header=None)

In [None]:
data.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
data.columns = ['v1', 'v2', 'label', 'tweet']

In [None]:
data.head()

Unnamed: 0,v1,v2,label,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
df = data[['tweet', 'label']]

In [None]:
df.head()

Unnamed: 0,tweet,label
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


## Exploratory Data Analysis

In [None]:
df['label'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [None]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Negative,22542
Positive,20832
Neutral,18318
Irrelevant,12990


In [None]:
df.shape

(74682, 2)

In [None]:
## checking for missing values
df.isnull().sum()


Unnamed: 0,0
tweet,686
label,0


In [None]:
## dropping missing values
df = df.dropna()
print("tweet shape: ", df['tweet'].shape)
print("label shape: ", df['label'].shape)

tweet shape:  (73996,)
label shape:  (73996,)


In [None]:
## checking for duplicated values
df.duplicated().sum()

4227

In [None]:
## dropping duplicate values
df = df.drop_duplicates()
print("tweet entries: ", df['tweet'].shape)
print("label entries: ", df['label'].shape)

tweet entries:  (69769,)
label entries:  (69769,)


## Preprocessing and Cleaning

In [None]:
## Preprocessing the words along with lemmatization
from nltk import word_tokenize
stop_words = set(stopwords.words('english'))
lm = WordNetLemmatizer()

def preprocess_text(text):
  """Normalize one raw tweet into a space-joined string of lemmatized tokens.

  Steps, in order: strip URLs, delete every character that is not an ASCII
  letter, digit or space, lowercase, tokenize, drop English stop words and
  lemmatize the remaining tokens.

  Args:
    text: The raw tweet as a string.

  Returns:
    The cleaned tokens joined by single spaces (may be an empty string).
  """
  # URLs must be removed first: the alphanumeric filter below deletes "://",
  # "." and "/", after which the URL pattern can no longer match and the link
  # would survive as one glued-together junk token.
  text = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', text, flags=re.IGNORECASE) ## removes any url present
  text = re.sub('[^a-z A-Z 0-9]+', '', text) ## removes anything other than alphanumeric
  text = text.lower()
  words = word_tokenize(text)
  # Stop words are filtered after lowercasing so that e.g. "The" is dropped too.
  lemmatized_words = [lm.lemmatize(word) for word in words if word not in stop_words]
  return ' '.join(lemmatized_words)


In [None]:
## clean the tweets
# assign() returns a new frame rather than writing into `df` in place; `df` is the
# filtered result of dropna()/drop_duplicates(), where an in-place column
# assignment can raise pandas' SettingWithCopyWarning.
df = df.assign(tweet=df['tweet'].apply(preprocess_text))

In [None]:
print("tweet entries: ", df['tweet'].shape)
print("label entries: ", df['label'].shape)

tweet entries:  (69769,)
label entries:  (69769,)


## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the cleaned tweets, targets are the sentiment labels.
X = df['tweet']
y = df['label']
# A fixed random_state makes the split, and every score derived from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.dtype)

object


In [None]:
X_train.shape

(55815,)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# LabelEncoder learns its classes from the data at fit time (in sorted order), so a
# hand-assigned classes_ would simply be overwritten. Fit on the training labels only
# and reuse that mapping for the test labels so both splits share one encoding.
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

## Word Embeddings

In [None]:
from gensim.models import Word2Vec, KeyedVectors


In [None]:
# Whitespace-split every cleaned tweet into its list of tokens.
X_train_tokenized = list(map(str.split, X_train))
X_test_tokenized = list(map(str.split, X_test))

In [None]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



In [None]:
# model = Word2Vec(X_train_tokenized, vector_size=100, window=7, epochs=10)  # unused alternative: train embeddings on the tweets instead of loading pretrained vectors

In [None]:
## creating the avg word2vec model
def avg_word2vec(words, vectors=None):
    """Average the embedding vectors of the in-vocabulary tokens of one tweet.

    Args:
        words: Iterable of string tokens.
        vectors: Optional embedding lookup exposing ``key_to_index``,
            ``vector_size`` and ``vectors[word]``. Defaults to the notebook's
            global ``wv``.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        vocabulary, or a zero vector of length ``vector_size`` when no token
        is in the vocabulary (including when ``words`` is empty).
    """
    if vectors is None:
        vectors = wv
    # key_to_index is a dict, so each membership test is a hash lookup.
    # index_to_key is a list; testing against it scans the whole vocabulary
    # once per token.
    vocab = vectors.key_to_index
    word_vectors = [vectors[word] for word in words if word in vocab]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    return np.zeros(vectors.vector_size)

In [None]:
!pip install tqdm
from tqdm import tqdm



In [None]:
# Embed every training tweet as the mean of its word vectors; tqdm reports progress.
X_train_new = [avg_word2vec(tokens) for tokens in tqdm(X_train_tokenized)]

100%|██████████| 55815/55815 [1:29:48<00:00, 10.36it/s]


In [None]:
# Embed every test tweet as the mean of its word vectors; tqdm reports progress.
X_test_new = [avg_word2vec(tokens) for tokens in tqdm(X_test_tokenized)]

100%|██████████| 13954/13954 [22:09<00:00, 10.49it/s]


In [None]:
X_train_vec = np.array(X_train_new)

In [None]:
X_test_vec = np.array(X_test_new)

## Training the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [None]:
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_train_vec, y_train_encoded)

In [None]:
y_pred_encoded = clf.predict(X_test_vec)

## Results


In [None]:
from sklearn.metrics import accuracy_score, classification_report


In [None]:
print("Accuracy: ", accuracy_score(y_pred_encoded, y_test_encoded))

Accuracy:  0.5402035258707181


In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped, the
# precision and recall columns trade places and "support" counts predictions
# instead of true labels.
print(classification_report(y_test_encoded, y_pred_encoded))

              precision    recall  f1-score   support

           0       0.19      0.47      0.27       980
           1       0.72      0.56      0.63      5381
           2       0.47      0.51      0.49      3214
           3       0.64      0.56      0.59      4379

    accuracy                           0.54     13954
   macro avg       0.50      0.52      0.49     13954
weighted avg       0.60      0.54      0.56     13954



In [None]:
from sklearn.ensemble import RandomForestClassifier


classifier = RandomForestClassifier(n_estimators=500, random_state=42)
classifier.fit(X_train_vec, y_train_encoded)
y_pred_encoded_rf = classifier.predict(X_test_vec)

In [None]:
accuracy_score(y_pred_encoded_rf, y_test_encoded)

0.7668052171420381

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped, the
# precision and recall columns trade places and "support" counts predictions
# instead of true labels.
print(classification_report(y_test_encoded, y_pred_encoded_rf))

              precision    recall  f1-score   support

           0       0.49      0.96      0.65      1259
           1       0.90      0.72      0.80      5210
           2       0.75      0.77      0.76      3340
           3       0.82      0.76      0.79      4145

    accuracy                           0.77     13954
   macro avg       0.74      0.80      0.75     13954
weighted avg       0.80      0.77      0.77     13954



In [None]:
import xgboost as xgb

In [None]:
# use_label_encoder is not passed: the installed xgboost ignores it and only emits
# the warning 'Parameters: { "use_label_encoder" } are not used.'
xgb_classifier = xgb.XGBClassifier(
    max_depth=6,
    n_estimators=500,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=4,
    eval_metric='mlogloss',
)
xgb_classifier.fit(X_train_vec, y_train_encoded)

Parameters: { "use_label_encoder" } are not used.



In [None]:
y_pred_encoded_xgb = xgb_classifier.predict(X_test_vec)

In [None]:
accuracy_score(y_pred_encoded_xgb, y_test_encoded)

0.8239214562132722

In [None]:
# Score the XGBoost predictions against the true test labels, passed as
# (y_true, y_pred). Comparing against y_pred_encoded would measure agreement with
# the logistic-regression predictions rather than XGBoost's actual quality.
print(classification_report(y_test_encoded, y_pred_encoded_xgb))

              precision    recall  f1-score   support

           0       0.47      0.23      0.31      1980
           1       0.66      0.78      0.72      4566
           2       0.62      0.58      0.60      3441
           3       0.65      0.72      0.68      3967

    accuracy                           0.64     13954
   macro avg       0.60      0.58      0.58     13954
weighted avg       0.62      0.64      0.62     13954

