In [1]:
# Install the third-party packages for this notebook into the kernel's
# environment (%pip targets the running kernel). Versions are not pinned, so
# each run resolves whatever release is current; "-U" upgrades scikit-learn only.
%pip install pandas
%pip install numpy
%pip install spacy
%pip install -U scikit-learn
%pip install seaborn
%pip install transformers
%pip install fasttext
%pip install lazypredict
%pip install gensim

Collecting fasttext
  Using cached fasttext-0.9.2.tar.gz (68 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... done
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4393309 sha256=51e48ee4e7e5cbb0935144b5d8f1637ce1e3cfd4ef82718a674e9e2930c5ebf3
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
import fasttext
import fasttext.util
from lazypredict.Supervised import LazyClassifier
from transformers import BertTokenizer
import gensim
import gensim.downloader

In [2]:
# Load the tweet dataset from a CSV in the working directory. The feature
# builders defined later read the 'tweet', 'cleaned_tweets', 'positive',
# 'neutral' and 'negative' columns of this module-level frame.
davidson = pd.read_csv("davidson_sentiment_score.csv")
# Bare last expression: render the frame as the cell output.
davidson

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,care.virtue,care.vice,fairness.virtue,...,loyalty.virtue,loyalty.vice,authority.virtue,authority.vice,sanctity.virtue,sanctity.vice,cleaned_tweets,positive,neutral,negative
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,0,0,...,0,0,0,0,1,1,rt mayasolovely woman complain clean house am...,0.07,0.41,0.52
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0,0,0,...,0,0,0,0,0,0,rt mleew boy dat cold tyga dwn bad cuffin d...,0.03,0.43,0.54
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0,0,0,...,0,0,0,0,0,2,rt urkindofbrand dawg rt sbaby life fuck b...,0.00,0.03,0.97
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0,0,0,...,0,0,0,0,0,0,rt c g anderson viva based look like tranny,0.06,0.73,0.21
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0,0,0,...,0,0,0,0,0,1,rt shenikaroberts shit hear true faker bitch ...,0.01,0.19,0.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,0,0,0,...,0,0,0,0,1,1,muthaf in lie lifeasking pearls c...,0.01,0.12,0.87
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an...",0,0,0,...,0,0,0,0,0,0,go break wrong heart baby drive redneck crazy,0.01,0.10,0.89
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...,0,0,0,...,0,0,0,0,0,0,young buck wanna eat dat nigguh like be nt fuc...,0.04,0.39,0.57
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies,0,0,0,...,0,0,0,0,0,0,youu got wild bitch tellin lie,0.01,0.11,0.88


In [9]:
from pathlib import Path

def getTrainSetFastText(model_path='cc.en.300.bin'):
    """Build a feature table of fastText sentence vectors for the cleaned tweets.

    Parameters
    ----------
    model_path : str, optional
        Location of the pretrained fastText binary to load. The default,
        'cc.en.300.bin', is the file name this function has always used; the
        file must already exist on disk (nothing is downloaded here).

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``davidson['cleaned_tweets']``. The sentence
        vector components come first in string-named columns ("0", "1", ...),
        followed by the "positive", "neutral" and "negative" columns copied
        from ``davidson``.
    """
    # Only report the steps that actually run: the model is loaded as-is,
    # it is neither downloaded nor dimension-reduced in this function.
    print("Starting loading model")
    ft_model = fasttext.load_model(model_path)
    print("loaded model")

    sentence_vectors = []
    # Missing tweets are embedded as empty text instead of failing on .replace().
    for tweet in davidson['cleaned_tweets'].fillna(""):
        # get_sentence_vector refuses text containing a newline; swap it for a
        # space so the words on either side remain separate tokens.
        sentence_vectors.append(
            ft_model.get_sentence_vector(tweet.replace("\n", " "))
        )

    traindata = pd.DataFrame(sentence_vectors)
    # String column names keep the frame uniform once the named sentiment
    # columns are appended next to the positional ones.
    traindata.columns = traindata.columns.astype(str)
    for sentiment in ("positive", "neutral", "negative"):
        traindata[sentiment] = davidson[sentiment].values.tolist()
    return traindata

def getTrainSetTFIDF():
    """Build a TF-IDF feature table from the raw tweets.

    Term counts are taken over a vocabulary capped at 100 features
    (``CountVectorizer(max_features=100)``), re-weighted with a
    ``TfidfTransformer``, and returned as a dense frame.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``davidson['tweet']`` with string-named TF-IDF
        columns, followed by the "positive", "neutral" and "negative" columns
        copied from ``davidson``.
    """
    # Stage 1: bag-of-words counts, densified before re-weighting.
    term_counts = CountVectorizer(max_features=100).fit_transform(
        davidson['tweet']
    ).toarray()

    # Stage 2: TF-IDF weights computed from those counts.
    tfidf_weights = TfidfTransformer().fit_transform(term_counts).toarray()

    feature_frame = pd.DataFrame(tfidf_weights)
    feature_frame.columns = feature_frame.columns.astype(str)

    # Append the sentiment scores in a fixed order.
    for sentiment in ("positive", "neutral", "negative"):
        feature_frame[sentiment] = davidson[sentiment].values.tolist()
    return feature_frame

def getTrainSetBERT():
    """Build a feature table of BERT token ids for the cleaned tweets.

    The features are the tokenizer's ``input_ids`` themselves (vocabulary
    indices), not model embeddings; no BERT model is run here.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``davidson['cleaned_tweets']`` with string-named
        token-id columns, followed by the "positive", "neutral" and "negative"
        columns copied from ``davidson``.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the whole corpus in one call, padding rows to a common length
    # and truncating anything beyond 100 tokens.
    encoded = bert_tokenizer(
        davidson['cleaned_tweets'].tolist(),
        padding=True,
        truncation=True,
        max_length=100,
    )

    token_rows = [ids for ids in encoded['input_ids']]
    feature_frame = pd.DataFrame(token_rows)
    feature_frame.columns = feature_frame.columns.astype(str)

    for sentiment in ("positive", "neutral", "negative"):
        feature_frame[sentiment] = davidson[sentiment].values.tolist()
    return feature_frame

def getTrainSetWord2Vec():
    """Build a feature table of averaged word2vec vectors for the raw tweets.

    Each tweet is split on whitespace; every token present in the model
    contributes the first 100 components of its vector, and the tweet is
    represented by the mean of those slices. A tweet with no known token
    becomes a row of 100 zeros.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``davidson['tweet']`` with string-named vector
        columns, followed by the "positive", "neutral" and "negative" columns
        copied from ``davidson``.
    """
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        'word2vec-google-news-300.bin', binary=True
    )

    def average_embedding(text):
        # NOTE(review): only the leading 100 components of each vector are
        # kept; presumably to match the other 100-wide feature sets - confirm.
        known = [
            w2v_model[token][:100]
            for token in text.split()
            if token in w2v_model
        ]
        return sum(known) / len(known) if known else [0] * 100

    feature_frame = pd.DataFrame(
        [average_embedding(msg) for msg in davidson['tweet']]
    )
    feature_frame.columns = feature_frame.columns.astype(str)

    for sentiment in ("positive", "neutral", "negative"):
        feature_frame[sentiment] = davidson[sentiment].values.tolist()
    return feature_frame

def getTrainSetGlove():
    """Build a feature table of averaged GloVe vectors for the cleaned tweets.

    Vectors come from gensim's 'glove-wiki-gigaword-100' download. Each tweet
    is split on whitespace and represented by the mean vector of the tokens
    the model knows; a tweet with no known token becomes a row of 100 zeros.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``davidson['cleaned_tweets']`` with string-named
        vector columns, followed by the "positive", "neutral" and "negative"
        columns copied from ``davidson``.
    """
    glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

    tweet_vectors = []
    for msg in davidson['cleaned_tweets']:
        known = [glove_vectors[token] for token in msg.split() if token in glove_vectors]
        if not known:
            # Nothing to average: fall back to an all-zero row.
            tweet_vectors.append([0] * 100)
            continue
        tweet_vectors.append(sum(known) / len(known))

    feature_frame = pd.DataFrame(tweet_vectors)
    feature_frame.columns = feature_frame.columns.astype(str)

    for sentiment in ("positive", "neutral", "negative"):
        feature_frame[sentiment] = davidson[sentiment].values.tolist()
    return feature_frame

In [16]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
! gunzip cc.en.300.bin.gz

--2023-07-09 19:33:02--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.249.39.6, 13.249.39.89, 13.249.39.25, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.249.39.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2023-07-09 19:33:27 (175 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [None]:
# Build the fastText feature table. The function loads 'cc.en.300.bin' from the
# working directory, so that file has to exist before this cell runs.
# Note: X is rebound by each feature-building cell; only the last one run survives.
X = getTrainSetFastText()
X

In [8]:
# Build the BERT token-id feature table (fetches the 'bert-base-uncased'
# tokenizer files on first use) and display it.
# Note: X is rebound by each feature-building cell; only the last one run survives.
X = getTrainSetBERT()
X

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,positive,neutral,negative
0,101,19387,9815,19454,21818,2135,2450,17612,4550,2160,...,0,0,0,0,0,0,0,0.07,0.41,0.52
1,101,19387,19875,4402,2860,2879,23755,3147,5939,3654,...,0,0,0,0,0,0,0,0.03,0.43,0.54
2,101,19387,24471,18824,11253,23544,4830,27767,19387,24829,...,0,0,0,0,0,0,0,0.00,0.03,0.97
3,101,19387,1039,1043,5143,20022,2241,2298,2066,25283,...,0,0,0,0,0,0,0,0.06,0.73,0.21
4,101,19387,21882,7556,3217,8296,2015,4485,2963,2995,...,0,0,0,0,0,0,0,0.01,0.19,0.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24778,101,14163,8322,2546,1999,4682,2166,19895,2075,21944,...,0,0,0,0,0,0,0,0.01,0.12,0.87
24779,101,2175,3338,3308,2540,3336,3298,2417,18278,4689,...,0,0,0,0,0,0,0,0.01,0.10,0.89
24780,101,2402,10131,10587,4521,23755,9152,13871,27225,2066,...,0,0,0,0,0,0,0,0.04,0.39,0.57
24781,101,2017,2226,2288,3748,7743,2425,2378,4682,102,...,0,0,0,0,0,0,0,0.01,0.11,0.88


In [10]:
# Build the averaged-GloVe feature table (100 vector columns plus the three
# sentiment columns) and display it.
# Note: X is rebound by each feature-building cell; only the last one run survives.
X = getTrainSetGlove()
X



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,positive,neutral,negative
0,-0.11,0.31,0.35,-0.49,-0.07,0.55,-0.28,0.41,-0.01,0.26,...,-0.04,-0.28,0.10,-0.17,-0.13,0.49,-0.17,0.07,0.41,0.52
1,0.09,-0.02,0.22,-0.41,-0.36,0.09,-0.06,0.10,0.06,-0.08,...,0.25,0.12,-0.22,-0.19,-0.05,0.19,-0.33,0.03,0.43,0.54
2,0.04,0.08,0.33,-0.47,-0.28,0.48,-0.20,-0.01,0.03,-0.19,...,0.19,0.13,-0.27,-0.11,0.01,0.13,-0.03,0.00,0.03,0.97
3,-0.08,0.32,0.11,-0.39,-0.00,0.05,0.07,0.11,-0.70,-0.00,...,0.03,0.06,0.03,-0.10,-0.28,0.17,-0.08,0.06,0.73,0.21
4,0.00,0.06,0.47,-0.53,-0.34,0.24,-0.06,-0.07,0.26,-0.06,...,-0.09,0.03,-0.31,-0.37,-0.24,0.09,-0.02,0.01,0.19,0.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24778,0.00,0.40,0.20,-0.38,0.02,0.43,0.08,0.04,-0.09,0.06,...,0.05,-0.21,-0.09,-0.31,-0.25,0.20,0.38,0.01,0.12,0.87
24779,0.08,0.12,0.50,-0.44,-0.31,0.41,-0.06,0.27,0.14,0.02,...,-0.04,-0.45,-0.27,-0.01,-0.03,0.22,0.08,0.01,0.10,0.89
24780,-0.08,0.28,0.22,-0.41,-0.41,0.17,-0.02,0.02,0.32,-0.20,...,-0.17,-0.10,-0.39,-0.12,-0.22,0.34,0.10,0.04,0.39,0.57
24781,0.04,0.13,0.80,-0.28,-0.26,0.26,0.08,0.06,0.10,-0.46,...,-0.12,-0.01,-0.20,-0.41,-0.02,0.05,0.12,0.01,0.11,0.88
