In [None]:
import pandas as pd
import dateutil
import warnings
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Allow up to 1000 characters per cell when frames are displayed, so long tweet text is not cut off.
pd.set_option('display.max_colwidth', 1000)
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Download the 'vader_lexicon' resource; presumably required by the VADER
# SentimentIntensityAnalyzer imported above — confirm against the NLTK docs.
from nltk.downloader import download as nltkDownload
nltkDownload('vader_lexicon')

In [None]:
# Load the raw tweet export, then keep a view restricted to rows whose 'lang' is 'en'.
dataset_df = pd.read_csv('../data/chatgpt_daily_tweets.csv')
dataset_en_df = dataset_df.loc[dataset_df['lang'].eq('en')]
dataset_en_df

In [None]:
def _normalize_tweet_text(text: str) -> str:
    """Lower-case one tweet and strip handles, hashtags, URLs, punctuation and lone letters.

    Returns the remaining word tokens joined by single spaces (possibly an
    empty string).
    """
    lowered = text.lower()
    # Handles, hashtags and URLs must be removed while their marker characters
    # (@, #, ://) are still present, i.e. before punctuation is discarded.
    without_handles = re.sub(r'@\S+', '', lowered)
    without_hashtags = re.sub(r'\B#\S+', '', without_handles)
    without_urls = re.sub(r'http\S+', '', without_hashtags)
    # Keeping only \w+ runs drops every special character.
    tokens = re.findall(r'\w+', without_urls)
    # Filter lone ASCII letters token by token: substituting '' for
    # r'\s+[a-zA-Z]\s+' would glue the neighbouring words together
    # ("is a great" -> "isgreat") and miss back-to-back single letters.
    kept = [token for token in tokens if not re.fullmatch(r'[a-zA-Z]', token)]
    return ' '.join(kept)


def clean_data(dataset_df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw tweet frame.

    The input frame is not modified. In the returned copy:

    * ``tweet_created`` is parsed to timezone-naive datetimes and renamed
      to ``datetime``;
    * ``text`` is lower-cased, stripped of Twitter handles, hashtags, URLs,
      special characters and stand-alone single letters, and renamed to
      ``original_tweet``. Missing text becomes an empty string.

    All other columns are passed through unchanged.

    Args:
        dataset_df: Frame with at least ``tweet_created`` (date strings) and
            ``text`` columns.

    Returns:
        The cleaned frame.

    Raises:
        Exception: Any error raised while cleaning is printed and then
            re-raised, so a failure can never be mistaken for a result.
    """
    # Import the submodule explicitly: a bare ``import dateutil`` does not by
    # itself guarantee that ``dateutil.parser`` has been loaded.
    from dateutil import parser as date_parser

    try:
        data = dataset_df.copy()

        # Parse each timestamp individually, then drop the timezone information.
        parsed = data['tweet_created'].apply(date_parser.parse)
        data['tweet_created'] = pd.to_datetime(parsed).dt.tz_localize(None)

        # Text cleaning; missing tweets are treated as empty text instead of
        # crashing the regex helpers with a float NaN.
        data['text'] = data['text'].fillna('').astype(str).apply(_normalize_tweet_text)

        return data.rename(columns={
            'text': 'original_tweet',
            'tweet_created': 'datetime'
        })
    except Exception as e:
        # Report the problem, then propagate it: silently returning None would
        # only move the failure to the first use of the result.
        print(f'Caught Error: {e}')
        raise


In [None]:
# Clean the raw tweets with clean_data and preview the first rows of the result.
# NOTE(review): this passes the unfiltered `dataset_df`; the English-only
# `dataset_en_df` built in the loading cell is not used here — confirm which
# frame is intended.
data = clean_data(dataset_df)
data.head()

In [None]:
def label_sentiment(x: float, negative_below: float = -0.05, positive_above: float = 0.35) -> str:
    """Map a sentiment score to a coarse label.

    Args:
        x: Sentiment score to classify.
        negative_below: Scores strictly below this value are 'negative'.
        positive_above: Scores strictly above this value are 'positive'.

    Returns:
        'negative', 'positive' or 'neutral'. Scores equal to a threshold, and
        NaN (for which both comparisons are false), are labelled 'neutral'.
    """
    if x < negative_below:
        return 'negative'
    if x > positive_above:
        return 'positive'
    return 'neutral'

In [None]:
def perform_feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """Attach sentiment and simple text/time features to a cleaned tweet frame.

    Reads the ``original_tweet`` and ``datetime`` columns. The new columns
    (``sentiment``, ``overall_sentiment``, ``words``, ``words_count``,
    ``sentence_length``, ``hour``, ``date``, ``month``, ``year``) are assigned
    onto ``data`` itself, and that same object is returned.
    """
    analyzer = SIA()
    tweets = data['original_tweet']

    # Sentiment: compound polarity score per tweet, then its coarse label.
    compound_scores = []
    for tweet in tweets:
        compound_scores.append(analyzer.polarity_scores(tweet)['compound'])
    data['sentiment'] = compound_scores
    data['overall_sentiment'] = data['sentiment'].apply(label_sentiment)

    # Text features: word tokens, token count and raw character length.
    word_pattern = re.compile(r'\w+')
    data['words'] = tweets.apply(word_pattern.findall)
    data['words_count'] = data['words'].apply(len)
    data['sentence_length'] = tweets.apply(len)

    # Calendar features, added in this column order.
    timestamps = data['datetime']
    time_parts = {
        'hour': lambda ts: ts.hour,
        'date': lambda ts: ts.date(),
        'month': lambda ts: ts.month,
        'year': lambda ts: ts.year,
    }
    for column, extract in time_parts.items():
        data[column] = timestamps.apply(extract)

    return data
    

In [None]:
# Add sentiment scores and text/time features to the cleaned tweets and preview the result.
# Note: perform_feature_engineering assigns the new columns onto the frame it
# receives and returns that same object, so `data` gains these columns as well.
data_with_features = perform_feature_engineering(data)
data_with_features.head()