In [123]:
import pandas as pd
from pycontractions  import Contractions
import string
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from flair.models import SequenceTagger, MultiTagger
from flair.data import Sentence
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords

# from stop_words import get_stop_words

In [124]:
# stop = stopwords.words('english')
# print(stop)

In [125]:
# stop_words = get_stop_words('en')
# print(stop_words)

# Read Tweets Data

In [126]:
df = pd.read_csv('./data/cleandata.csv', parse_dates=['Date'],encoding = "utf-8")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2668 entries, 0 to 2667
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Tweets          2668 non-null   object        
 1   Retweets        2668 non-null   int64         
 2   Likes           2668 non-null   int64         
 3   Date            2668 non-null   datetime64[ns]
 4   Cleaned_Tweets  2668 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 104.3+ KB


In [127]:
def get_basic_info(dataframe):
    cols_list = dataframe.columns.tolist()
    data_types = df.dtypes    
    
    for col in cols_list:        
        # get the number of unique entries for each column
        no_unique_value = dataframe[col].nunique()
        print(f'The number of unique values for column {col} is {no_unique_value}')

        # Check for the data type and get the min and max value
        if data_types[col] != 'object':
            min_value = dataframe[col].min()
            max_value = dataframe[col].max()
            print(f'The min value for column {col} is {min_value}')
            print(f'The max value for column {col} is {max_value}\n')
        else:
            print('\n')       

In [128]:
get_basic_info(df)

The number of unique values for column Tweets is 2642


The number of unique values for column Retweets is 1834
The min value for column Retweets is 41
The max value for column Retweets is 681707

The number of unique values for column Likes is 2598
The min value for column Likes is 933
The max value for column Likes is 4780787

The number of unique values for column Date is 2668
The min value for column Date is 2022-01-27 21:00:09
The max value for column Date is 2022-10-27 16:17:39

The number of unique values for column Cleaned_Tweets is 2382




# Data Cleaning

In [129]:
cont = Contractions('GoogleNews-vectors-negative300.bin')
cont.load_models()

In [130]:
import re

def clean(text):
    text = text.strip()
    # Fix quotes
    text = text.replace("’", "'") \
        .replace("‘", "'") \
        .replace("”", '"') \
        .replace("“", '"')

    # Replace &amp; with and
    text = text.replace('&amp;', 'and')

    # Fix sentences which does not have space after full stop
    text = text.replace('.', '. ')
    
    # Fix contractions
    text = list(cont.expand_texts([text], precise=True))[0]

    # Remove punctuations
    text = "".join([i for i in text if i not in string.punctuation])
    return text

df['Cleaned_Tweets'] = df['Cleaned_Tweets'].apply(clean)
df['Cleaned_Tweets'].head()

0                                           thanks
1                                       Absolutely
2                         Dear Twitter Advertisers
3    Meeting a lot of cool people at Twitter today
4           Entering Twitter HQ – let that sink in
Name: Cleaned_Tweets, dtype: object

In [131]:
df['Token_Counts'] = df['Cleaned_Tweets'].apply(lambda x: len(x.split(' ')))

In [132]:
# Remove tweets with less than 3 tokens
df = df[df['Token_Counts'] > 2].reset_index(drop=True).copy()

In [133]:
df.shape

(2094, 6)

# Emotions Classification

In [134]:
# Instantiate model pipeline
model = AutoModelForSequenceClassification.from_pretrained(
    "Emanuel/bertweet-emotion-base"
)
tokenizer = AutoTokenizer.from_pretrained(
    "Emanuel/bertweet-emotion-base"
)
device = -1 #torch.cuda.current_device() if torch.cuda.is_available else -1
model_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device
)

In [135]:
df['Emotion_Scores'] = model_pipeline(df['Cleaned_Tweets'].to_list(), top_k=None)
df['Emotion_Scores'].head()

0    [{'label': 'anger', 'score': 0.704805850982666...
1    [{'label': 'joy', 'score': 0.9869217872619629}...
2    [{'label': 'joy', 'score': 0.8734441995620728}...
3    [{'label': 'sadness', 'score': 0.3829310834407...
4    [{'label': 'anger', 'score': 0.415039986371994...
Name: Emotion_Scores, dtype: object

In [136]:
# Assign top 2 emotions
df['Emotion1'] = df['Emotion_Scores'].apply(lambda x: x[0]['label'])
df['Emotion1'].head()

Unnamed: 0,Emotion1,Emotion2
0,anger,joy
1,joy,love
2,joy,anger
3,sadness,anger
4,anger,sadness


# Topic Extraction

In [137]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT

In [138]:
kw_model = KeyBERT()
vectorizer = KeyphraseCountVectorizer(pos_pattern='<NNP.*>+')
df['Noun_Keyphrases_Score'] = kw_model.extract_keywords(docs=df['Cleaned_Tweets'].to_list(), vectorizer=vectorizer, stop_words='english', top_n=5)
df['Noun_Keyphrases'] = df['Noun_Keyphrases_Score'].apply(lambda record: [x[0] for x in record])

In [139]:
df['Noun_Keyphrases_Score'].loc[0]

[('twitter', 0.6289)]

In [140]:
df['Noun_Keyphrases']

0                         [twitter]
1                    [twitter, lot]
2       [twitter hq, twitter, sink]
3                         [twitter]
4                                []
                   ...             
2089                    [manganese]
2090              [manganese, iron]
2091                             []
2092                       [office]
2093                             []
Name: Noun_Keyphrases, Length: 2094, dtype: object

In [141]:
# Converting score types for serialization
def fix_float_type(input):
    return [(x[0], str(x[1])) for x in input]
df['Noun_Keyphrases_Score'] = df['Noun_Keyphrases_Score'].apply(fix_float_type)

In [142]:
from collections import Counter

keyphrases = []
for entry in df['Noun_Keyphrases'].values:
    keyphrases += entry
Counter(keyphrases)


Counter({'twitter': 86,
         'lot': 33,
         'twitter hq': 1,
         'sink': 5,
         'wish': 6,
         'new york times': 1,
         'vitalik': 1,
         'booster': 16,
         'silicon valley': 1,
         'polytopia': 3,
         'war': 22,
         'life': 28,
         'kasparov': 1,
         'chess': 4,
         'iphone': 2,
         'blue': 2,
         'time': 59,
         'starlink': 72,
         'dod': 4,
         'spacex': 48,
         'thingreal': 1,
         'wapo': 4,
         'neuralink': 4,
         'nov': 1,
         'long': 25,
         'change': 9,
         'belgium': 1,
         'switzerland': 1,
         'beta': 26,
         'next': 41,
         'randd': 2,
         'tbh': 8,
         'gps': 4,
         'headline': 1,
         'signal': 12,
         'un ass': 1,
         'un': 5,
         'giga berlin': 4,
         'giga': 13,
         'tesla': 138,
         'kremlin': 1,
         'putin': 2,
         'peace': 7,
         'nice': 11,
         'ukrai

In [143]:
# load tagger
tagger = SequenceTagger.load("flair/pos-english")

def flair_pos_tagging(sentence):
    # print(sentence)
    verbs = set()
    adjectives = set()
    sen = Sentence(sentence)
    tagger.predict(sen)

    for label in sen.get_labels('pos'):
        
        if label.value[0:2] == 'VB' and label.score > 0.75:
            verbs.add(label.data_point.text)
            # print(verbs)
        if label.value[0:2] == 'JJ' and label.score > 0.75:
            adjectives.add(label.data_point.text)
            # print(adjectives)

    return list(verbs), list(adjectives)

df['verbs'], df['adjectives'] = zip(*df['Cleaned_Tweets'].str.lower().apply(flair_pos_tagging))

2022-11-29 13:36:20,321 loading file C:\Users\baira\.flair\models\pos-english\a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-11-29 13:36:21,134 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [144]:
df[['verbs','adjectives']]

Unnamed: 0,verbs,adjectives
0,[],[dear]
1,[meeting],[cool]
2,"[let, sink, entering]",[]
3,"[get, are]","[underappreciated, local]"
4,[bats],[]
...,...,...
2089,"[requires, operates]","[less, higher]"
2090,"[scaling, is]",[several]
2091,"[paid, are]","[high, responsible]"
2092,"[be, voted]",[]


In [174]:
verbs_to_remove = ['get','are','is','am','have','has','been','seen','had','do','took','be',
                    'make','does','like','did','see','was','go','got','get','want','getting','gets', 'exist',
                    'done','doing','went','uses','says','known','let','given' ,'gave','makes','goes',
                    'gone','going','saw','being','were']

def remove_words(row):
    verbs_list = []

    if len(row) > 0:
        for i in row:
            if i not in verbs_to_remove:
                verbs_list.append(i)
    return verbs_list                  

df['verbs'] = df['verbs'].apply(remove_words)

In [175]:
# from collections import Counter
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# keyphrases = df1['verbs'].explode().dropna().to_list()
# word_could_dict = Counter(keyphrases)
# wordcloud = WordCloud(width = 1000, height = 500, background_color = "white").generate_from_frequencies(word_could_dict)
# # Word Cloud Visual
# plt.figure(figsize=(10,14))
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show()

In [176]:
adj_to_remove = ['many','most','much','such']

def remove_words(row):
    adj_list = []

    if len(row) > 0:
        for i in row:
            if i not in adj_to_remove:
                adj_list.append(i)
    return adj_list                  

df['adjectives'] = df['adjectives'].apply(remove_words)

In [177]:
# from collections import Counter
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# keyphrases = df1['adjectives'].explode().dropna().to_list()
# word_could_dict = Counter(keyphrases)
# wordcloud = WordCloud(width = 1000, height = 500, background_color = "white").generate_from_frequencies(word_could_dict)
# # Word Cloud Visual
# plt.figure(figsize=(10,14))
# plt.imshow(wordcloud)
# plt.axis("off")
# plt.show()

In [184]:
noun_to_remove = ['ft', 'tbc', 'gt50' , '50year' , 'bro' , 'lt5', 'україна', 'thursday', '110k', 'friday' , 'tbh', 'qa', 'haha' ,
                 'ac', 'november', 'sept' , 'lhd', 'rhd', 'вы', 'bf' , 'fsd' , 'us', 'fud', 'gwhdaykm2', 'irs', 'lib', 'libs', 'mgmt',
                 'умом без сердца', 'fps', 'venn', 'cnc', 'hz' , 'def', '8k' , 'nighti', 'encanta el mariachi', 'forward10', 'ich',
                 'isp', 'comme ci comme', 'la', 'ludendorffs', 'july', 'ðoge', 'siegel', 'zaporizhzhia', 'h2', 'ron barron', 'sjm', 'doj',
                 'talulah', 'starlink eg league', 'areof pastpresentand future', 'mr president', 'bs', 'el camino jack', 'jb', 'nyt',
                 'честью', 'das bootdas baby', 'zukunft', 'gnus', 'yang', 'august', 'reps', 'btw', 'wsj', 'covid19 anymorei',
                 'ozempicrybelsus', 'дура', 'charlie ergen', 'dec', 'october', '1k', '100mw', 'mdma', 'un ass', 'donbas', 'gwynne',
                 'ppmgt1000', 'же несчастная дура', 'awe', 'mf', 'kbg', 'nn', 'ch4', 'kg', 'bf16', 'thingreal', 'uaw', 'pshaw',
                 'на этот', 'faa', 'row iv', 'ev', 'gm', 'kph', 'haha', 'b7', 'may', 'zatko', 'eu', 'un', 'nv', 'tw', 'gt95', 'mps',
                 'q2', 'ur', 'путин', 'agoford', 'leavei', 'vw', 'ferdinand piëch', 'giga berlinbrandenburg', 'satan', 'das baby',
                 'cape canaveralhumans', 'chad', 'hijinks']

def remove_words(row):
    noun_list = []

    if len(row) > 0:
        for i in row:
            if i not in noun_to_remove:
                noun_list.append(i)
    return noun_list                  

df['Noun_Keyphrases'] = df['Noun_Keyphrases'].apply(remove_words)                 

In [189]:
df.to_parquet('./data/processed_data.parquet', index=False)