In [30]:
import pandas as pd
from pycontractions  import Contractions
import string
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from flair.models import SequenceTagger
from flair.data import Sentence
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read Tweets Data

In [31]:
df = pd.read_csv('./data/cleandata.csv', parse_dates=['Date'],encoding = "utf-8")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2668 entries, 0 to 2667
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Tweets          2668 non-null   object        
 1   Retweets        2668 non-null   int64         
 2   Likes           2668 non-null   int64         
 3   Date            2668 non-null   datetime64[ns]
 4   Cleaned_Tweets  2668 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 104.3+ KB


In [32]:
def get_basic_info(dataframe):
    cols_list = dataframe.columns.tolist()
    data_types = df.dtypes    
    
    for col in cols_list:        
        # get the number of unique entries for each column
        no_unique_value = dataframe[col].nunique()
        print(f'The number of unique values for column {col} is {no_unique_value}')

        # Check for the data type and get the min and max value
        if data_types[col] != 'object':
            min_value = dataframe[col].min()
            max_value = dataframe[col].max()
            print(f'The min value for column {col} is {min_value}')
            print(f'The max value for column {col} is {max_value}\n')
        else:
            print('\n')       

In [33]:
get_basic_info(df)

The number of unique values for column Tweets is 2642


The number of unique values for column Retweets is 1834
The min value for column Retweets is 41
The max value for column Retweets is 681707

The number of unique values for column Likes is 2598
The min value for column Likes is 933
The max value for column Likes is 4780787

The number of unique values for column Date is 2668
The min value for column Date is 2022-01-27 21:00:09
The max value for column Date is 2022-10-27 16:17:39

The number of unique values for column Cleaned_Tweets is 2382




# Data Cleaning

In [34]:
cont = Contractions('GoogleNews-vectors-negative300.bin')
cont.load_models()

In [35]:
import re

df['Cleaned_Tweets'] = df['Cleaned_Tweets'].replace({r'[^\x00-\x7F]+':''}, regex=True)

def clean(text):
    text = text.strip()
    # Fix quotes
    text = text.replace("’", "'") \
        .replace("‘", "'") \
        .replace("”", '"') \
        .replace("“", '"')

    # Replace &amp; with and
    text = text.replace('&amp;','and')

    text = text.replace('&gt;', 'greater than ')

    # Fix sentences which does not have space after full stop
    text = text.replace('.', '. ')

    text = text.replace('-', '')
    
    # Fix contractions
    text = list(cont.expand_texts([text], precise=True))[0]

    # Remove punctuations
    # text = "".join([i for i in text if i not in string.punctuation]) 
    no_spaces = len(string.punctuation)
       
    text = text.translate(str.maketrans(string.punctuation,' ' * no_spaces))  
    return text

df['Cleaned_Tweets'] = df['Cleaned_Tweets'].apply(clean)
# Remove stopwords
stop = stopwords.words('english')
stop += ['im','ie','ete', 'dont', 'cant', 'would','wont','doesnt','must','might','also','almost','so', 'haha']
df['Cleaned_Tweets'] = df['Cleaned_Tweets'].str.lower().apply(lambda x : " ".join([word for word in x.split() if word not in stop]))

In [36]:
df['Cleaned_Tweets'].head()

0                                   thanks
1                               absolutely
2                 dear twitter advertisers
3    meeting lot cool people twitter today
4             entering twitter hq let sink
Name: Cleaned_Tweets, dtype: object

In [37]:
df.to_csv('./data/after_clean.csv',encoding = "utf-8")

In [38]:
df['Token_Counts'] = df['Cleaned_Tweets'].apply(lambda x: len(x.split(' ')))

In [39]:
# Remove tweets with less than 3 tokens
df = df[df['Token_Counts'] > 3].reset_index(drop=True).copy()

In [40]:
df.shape

(1551, 6)

# Emotions Classification

In [41]:
# Instantiate model pipeline
model = AutoModelForSequenceClassification.from_pretrained(
    "Emanuel/bertweet-emotion-base"
)
tokenizer = AutoTokenizer.from_pretrained(
    "Emanuel/bertweet-emotion-base"
)
device = -1 #torch.cuda.current_device() if torch.cuda.is_available else -1
model_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device
)

In [42]:
df['Emotion_Scores'] = model_pipeline(df['Cleaned_Tweets'].to_list(), top_k=None)
df['Emotion_Scores'].head()

0    [{'label': 'joy', 'score': 0.9898011088371277}...
1    [{'label': 'joy', 'score': 0.7461026310920715}...
2    [{'label': 'anger', 'score': 0.441839307546615...
3    [{'label': 'joy', 'score': 0.7306174635887146}...
4    [{'label': 'anger', 'score': 0.632644712924957...
Name: Emotion_Scores, dtype: object

In [43]:
# Assign top 2 emotions
df['Emotion1'] = df['Emotion_Scores'].apply(lambda x: x[0]['label'])
df['Emotion1'].head()

0      joy
1      joy
2    anger
3      joy
4    anger
Name: Emotion1, dtype: object

# Topic Extraction

In [44]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT

In [45]:
kw_model = KeyBERT()
vectorizer = KeyphraseCountVectorizer(pos_pattern='<NNP.*>+')
df['Noun_Keyphrases_Score'] = kw_model.extract_keywords(docs=df['Cleaned_Tweets'].to_list(), vectorizer=vectorizer, stop_words='english', top_n=5)
df['Noun_Keyphrases'] = df['Noun_Keyphrases_Score'].apply(lambda record: [x[0] for x in record])

In [46]:
df['Noun_Keyphrases_Score'].loc[0]

[('twitter', 0.5599), ('cool', 0.2777)]

In [47]:
df['Noun_Keyphrases']

0         [twitter, cool]
1               [twitter]
2               [twitter]
3               [twitter]
4                   [fan]
              ...        
1546                   []
1547          [manganese]
1548    [manganese, iron]
1549                   []
1550          [cb radios]
Name: Noun_Keyphrases, Length: 1551, dtype: object

In [48]:
# Converting score types for serialization
def fix_float_type(input):
    return [(x[0], str(x[1])) for x in input]
df['Noun_Keyphrases_Score'] = df['Noun_Keyphrases_Score'].apply(fix_float_type)

In [49]:
from collections import Counter

keyphrases = []
for entry in df['Noun_Keyphrases'].values:
    keyphrases += entry
Counter(keyphrases)


Counter({'twitter': 84,
         'cool': 10,
         'fan': 7,
         'new york times': 1,
         'nonfake vitalik tweet': 1,
         'doubletake': 1,
         'fair': 3,
         'silicon valley': 1,
         'fog war': 2,
         'war': 19,
         'douche': 1,
         'house': 5,
         'kasparov': 1,
         'good': 57,
         'dod': 4,
         'algorithm': 9,
         'neuralink': 5,
         'nov': 1,
         'switzerland belgium': 1,
         'beta': 26,
         'release': 21,
         'tbh': 7,
         'gps': 4,
         'headline': 1,
         'world': 30,
         'un ass': 1,
         'un': 5,
         'giga berlin': 4,
         'many': 63,
         'tesla': 138,
         'putin': 2,
         'kremlin nice': 1,
         'russia overrunning ukraine': 1,
         'ukraine': 26,
         'russia': 22,
         'crimea': 10,
         'cuba': 1,
         'falcon': 16,
         'vox populi vox dei': 1,
         'hope': 7,
         'wrong analogy america humiliati

In [50]:
# load tagger
tagger = SequenceTagger.load("flair/pos-english")

def flair_pos_tagging(sentence):
    # print(sentence)
    verbs = set()
    adjectives = set()
    sen = Sentence(sentence)
    tagger.predict(sen)

    for label in sen.get_labels('pos'):
        
        if label.value[0:2] == 'VB' and label.score > 0.75:
            verbs.add(label.data_point.text)
            # print(verbs)
        if label.value[0:2] == 'JJ' and label.score > 0.75:
            adjectives.add(label.data_point.text)
            # print(adjectives)

    return list(verbs), list(adjectives)

df['verbs'], df['adjectives'] = zip(*df['Cleaned_Tweets'].str.lower().apply(flair_pos_tagging))

2022-12-04 19:11:40,504 loading file C:\Users\baira\.flair\models\pos-english\a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-12-04 19:11:41,334 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [51]:
df[['verbs','adjectives']]

Unnamed: 0,verbs,adjectives
0,[meeting],[]
1,"[sink, entering]",[]
2,"[get, underappreciated]","[closer, local]"
3,[empowers],"[able, beautiful]"
4,[],[big]
...,...,...
1546,"[gone, stripped]",[]
1547,"[requires, operates]","[higher, less]"
1548,[],"[several, alternative, manganese]"
1549,[paid],"[high, responsible]"


In [52]:
verbs_to_remove = ['get','are','is','am','have','has','been','seen','had','do','took','be',
                    'make','does','like','did','see','was','go','got','get','want','getting','gets', 'exist',
                    'done','doing','went','uses','says','known','let','given' ,'gave','makes','goes',
                    'gone','going','saw','being','were']

def remove_words(row):
    verbs_list = []

    if len(row) > 0:
        for i in row:
            if i not in verbs_to_remove:
                verbs_list.append(i)
    return verbs_list                  

df['verbs'] = df['verbs'].apply(remove_words)

In [54]:
adj_to_remove = ['many','most','much','such']

def remove_words(row):
    adj_list = []

    if len(row) > 0:
        for i in row:
            if i not in adj_to_remove:
                adj_list.append(i)
    return adj_list                  

df['adjectives'] = df['adjectives'].apply(remove_words)

In [56]:
noun_to_remove = ['ft', 'tbc', 'gt50' , '50year' , 'bro' , 'lt5', 'україна', 'thursday', '110k', 'friday' , 'tbh', 'qa', 'haha' ,
                 'ac', 'november', 'sept' , 'lhd', 'rhd', 'вы', 'bf' , 'fsd' , 'us', 'fud', 'gwhdaykm2', 'irs', 'lib', 'libs', 'mgmt',
                 'умом без сердца', 'fps', 'venn', 'cnc', 'hz' , 'def', '8k' , 'nighti', 'encanta el mariachi', 'forward10', 'ich',
                 'isp', 'comme ci comme', 'la', 'ludendorffs', 'july', 'ðoge', 'siegel', 'zaporizhzhia', 'h2', 'ron barron', 'sjm', 'doj',
                 'talulah', 'starlink eg league', 'areof pastpresentand future', 'mr president', 'bs', 'el camino jack', 'jb', 'nyt',
                 'честью', 'das bootdas baby', 'zukunft', 'gnus', 'yang', 'august', 'reps', 'btw', 'wsj', 'covid19 anymorei',
                 'ozempicrybelsus', 'дура', 'charlie ergen', 'dec', 'october', '1k', '100mw', 'mdma', 'un ass', 'donbas', 'gwynne',
                 'ppmgt1000', 'же несчастная дура', 'awe', 'mf', 'kbg', 'nn', 'ch4', 'kg', 'bf16', 'thingreal', 'uaw', 'pshaw',
                 'на этот', 'faa', 'row iv', 'ev', 'gm', 'kph', 'haha', 'b7', 'may', 'zatko', 'eu', 'un', 'nv', 'tw', 'gt95', 'mps',
                 'q2', 'ur', 'путин', 'agoford', 'leavei', 'vw', 'ferdinand piëch', 'giga berlinbrandenburg', 'satan', 'das baby',
                 'cape canaveralhumans', 'chad', 'hijinks']

def remove_words(row):
    noun_list = []

    if len(row) > 0:
        for i in row:
            if i not in noun_to_remove:
                noun_list.append(i)
    return noun_list                  

df['Noun_Keyphrases'] = df['Noun_Keyphrases'].apply(remove_words)                 

In [57]:
df.to_parquet('./data/processed_data.parquet', index=False)

In [58]:
from deep_translator import GoogleTranslator
to_translate = 'Ich mÃchte mich recht herzlich bedanken  Die Zukunft ist sehr spannend'
translated = GoogleTranslator(source='auto', target='en').translate(to_translate)
translated

'Thank you very much. The future is very exciting'