In [None]:
# Mount Google Drive so the project files are reachable from this Colab session
from google.colab import drive
drive.mount('/content/drive')

# Make the project folder the working directory, so the relative 'Data/...'
# paths used by the later cells resolve against it
import os
os.chdir('/content/drive/MyDrive/ChatGpt-Sentiment-Analysis/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Import Libraries**

In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **Loading the dataframe**

In [None]:
# Read the raw tweet dataset (path is relative to the current working directory)
df = pd.read_csv("Data/chatgpt.csv")
# Peek at the first rows to see which columns were read
print(df.head())

   Unnamed: 0                                             tweets   labels
0           0  ChatGPT: Optimizing Language Models for Dialog...  neutral
1           1  Try talking with ChatGPT, our new AI system wh...     good
2           2  ChatGPT: Optimizing Language Models for Dialog...  neutral
3           3  THRILLED to share that ChatGPT, our new model ...     good
4           4  As of 2 minutes ago, @OpenAI released their ne...      bad


In [None]:
# Drop the 'Unnamed: 0' column, which only duplicates the row index
# (see the head() output above).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Confirm that the 'Unnamed: 0' column has been dropped
print(df)

                                                   tweets   labels
0       ChatGPT: Optimizing Language Models for Dialog...  neutral
1       Try talking with ChatGPT, our new AI system wh...     good
2       ChatGPT: Optimizing Language Models for Dialog...  neutral
3       THRILLED to share that ChatGPT, our new model ...     good
4       As of 2 minutes ago, @OpenAI released their ne...      bad
...                                                   ...      ...
219289  Other Software Projects Are Now Trying to Repl...      bad
219290  I asked #ChatGPT to write a #NYE Joke for SEOs...     good
219291  chatgpt is being disassembled until it can onl...      bad
219292  2023 predictions by #chatGPT. Nothing really s...      bad
219293   From ChatGPT, neat stuff https://t.co/qjjUF2Z2m0  neutral

[219294 rows x 2 columns]


In [None]:
# Print the first few entries of a text column, one per line
def preview_tweets(df, n=5, column='tweets'):
    """Print the first ``n`` values of ``df[column]``, each on its own line.

    Args:
        df: DataFrame holding the column to preview.
        n: Number of leading rows to show (passed to ``Series.head``).
        column: Name of the column to print; defaults to ``'tweets'``.

    Returns:
        None. The values are written to standard output only.
    """
    leading_rows = df[column].head(n)
    for _, entry in leading_rows.items():
        print(entry)

In [None]:
preview_tweets(df)

chatgpt: optimizing language models for dialogue
try talking with chatgpt, our new ai system which is optimized for dialogue. your feedback will help us improve it.
chatgpt: optimizing language models for dialogue \n\ntrending ai/ml article identified &amp; digested via granola; a machine-driven rss bot by ramsey elbasheer
thrilled to share that chatgpt, our new model optimized for dialog, is now public, free, and accessible to everyone.
as of 2 minutes ago, released their new chatgpt. \n\nand you can use it right now 👇


# **Removing URLs, Mentions and Hashtags**

In [None]:
import re

In [None]:
import html


def clear_social_text(text: str) -> str:
    """Normalise a raw tweet before further text cleaning.

    Steps, in order:
      1. Coerce to ``str``, decode HTML entities (``&amp;`` -> ``&``) and lowercase.
      2. Turn literal escape sequences (a backslash followed by ``n``, ``r`` or
         ``t``) into spaces.
      3. Remove URLs (anything starting with ``http`` or ``www``).
      4. Remove ``@mentions`` and ``#hashtags`` (the tag word is removed too).
      5. Collapse runs of whitespace and trim both ends.

    Args:
        text: Raw tweet. Any value is accepted; it is converted with ``str()``.

    Returns:
        The cleaned, lowercased text.
    """
    text = html.unescape(str(text)).lower()

    # The dataset stores line breaks as the two characters backslash + "n".
    # They must become spaces *before* URL removal: they are not whitespace, so
    # `http\S+` would otherwise swallow the word following a URL, and the later
    # punctuation stripping would leave a stray "nn" glued to the next word.
    text = re.sub(r'\\[nrt]', ' ', text)

    text = re.sub(r'http\S+|www\S+', '', text)   # Remove URLs (http, https, www)

    text = re.sub(r'@\w+', '', text)  # Remove mentions (@username)

    text = re.sub(r'#\w+', '', text)  # Remove hashtags (#hashtag)

    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace

    return text



In [None]:
# Run the social-media cleanup on every tweet (overwrites the 'tweets' column)
df['tweets'] = df['tweets'].apply(clear_social_text)

In [None]:
preview_tweets(df)

chatgpt: optimizing language models for dialogue
try talking with chatgpt, our new ai system which is optimized for dialogue. your feedback will help us improve it.
chatgpt: optimizing language models for dialogue \n\ntrending ai/ml article identified &amp; digested via granola; a machine-driven rss bot by ramsey elbasheer
thrilled to share that chatgpt, our new model optimized for dialog, is now public, free, and accessible to everyone.
as of 2 minutes ago, released their new chatgpt. \n\nand you can use it right now 👇


# **Remove non-ASCII (non-`string.printable`) characters from the text**

In [None]:
import string
import re
from collections import Counter

Check whether the tweets contain any non-printing characters

In [None]:
# Check whether a text contains any non-printing characters
def has_non_printing(text: str) -> bool:
    """Return True when ``text`` has a character outside ``string.printable``.

    ``string.printable`` only contains ASCII digits, letters, punctuation and
    whitespace, so curly quotes, ellipses, emoji and other non-ASCII characters
    all count as "non-printing" here.

    The input is coerced with ``str()``, the same way ``extract_non_printing``
    does, so non-string cells (numbers, NaN, None) no longer raise ``TypeError``.

    Args:
        text: Value to inspect.

    Returns:
        True if at least one character is not in ``string.printable``.
    """
    return any(ch not in string.printable for ch in str(text))

In [None]:
# Flag every tweet that contains at least one character outside string.printable.
# NOTE(review): this column is computed once, before any characters are removed,
# and is not recomputed afterwards, so its values describe the text as it was at
# this point (the column is dropped again before saving).
df['has_non_printing'] = df['tweets'].apply(has_non_printing)

# Show a few of the flagged tweets
df[df['has_non_printing']].head()

Unnamed: 0,tweets,labels,has_non_printing
4,"as of 2 minutes ago, released their new chatgp...",bad,True
17,models are set to become the search engines of...,bad,True
28,chatgpt: optimizing language models for dialog...,neutral,True
34,openai's new chatgpt is very honest 😀,good,True
36,ok so 's new can basically just generate promp...,neutral,True


List the non-printing characters found in the text

In [None]:
# Collect the non-printing characters found in a text
def extract_non_printing(text: str):
    """Return every character of ``text`` that is not in ``string.printable``.

    The input is converted with ``str()`` first. Characters are returned in
    order of appearance and duplicates are kept, so the result can be fed
    straight into a ``Counter``.
    """
    offenders = []
    for symbol in str(text):
        if symbol in string.printable:
            continue
        offenders.append(symbol)
    return offenders

In [None]:
# Tally every non-printing character across the whole 'tweets' column
non_printing_chars = Counter(
    ch
    for tweet in df['tweets']
    for ch in extract_non_printing(tweet)
)

In [None]:
#Viewing the top common non printing chars
non_printing_chars.most_common(10)

[('’', 30896),
 ('…', 7643),
 ('“', 6406),
 ('”', 6178),
 ('😂', 3590),
 ('\ufe0f', 3497),
 ('🤯', 3494),
 ('—', 3087),
 ('‘', 2307),
 ('👇', 2289)]

In [None]:
# Print the code point (U+XXXX), the repr and the count of the ten most
# frequent characters collected in non_printing_chars
for ch, count in non_printing_chars.most_common(10):
    print(f"U+{ord(ch):04X} | {repr(ch)} | count={count}")

U+2019 | '’' | count=30896
U+2026 | '…' | count=7643
U+201C | '“' | count=6406
U+201D | '”' | count=6178
U+1F602 | '😂' | count=3590
U+FE0F | '\ufe0f' | count=3497
U+1F92F | '🤯' | count=3494
U+2014 | '—' | count=3087
U+2018 | '‘' | count=2307
U+1F447 | '👇' | count=2289


Targeted cleaning (only remove what exists)

In [None]:
def remove_known_non_printing(text: str):
    """Delete from ``text`` every character recorded in ``non_printing_chars``.

    ``non_printing_chars`` is the module-level ``Counter`` whose keys are the
    single characters collected by ``extract_non_printing``. The text is
    filtered in one pass using the Counter's constant-time membership test,
    instead of running ``str.replace`` once per recorded character for every
    tweet. The result is the same, without the per-tweet cost growing with the
    number of distinct characters.

    Args:
        text: Text to clean; converted with ``str()`` first.

    Returns:
        ``text`` without any of the recorded characters.
    """
    # `in` on a Counter is a plain dict lookup and never inserts missing keys.
    return ''.join(ch for ch in str(text) if ch not in non_printing_chars)

In [None]:
# Cast to str so the helper always receives text, then strip the recorded characters
df['tweets'] = df['tweets'].astype(str).apply(remove_known_non_printing)

In [None]:
# Sanity check: count how many tweets are still flagged after the cleanup
df['tweets'].apply(has_non_printing).value_counts()

Unnamed: 0_level_0,count
tweets,Unnamed: 1_level_1
False,219294


# **Removing punctuation, stopwords, whitespace and numbers**

Importing Libraries

In [None]:
import nltk
from nltk.corpus import stopwords
import re

In [None]:
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Set of English stopwords from the NLTK corpus (a set gives fast membership tests)
STOP_WORDS = set(stopwords.words('english'))
PUNCT_REGEX = re.compile(f"[{re.escape(string.punctuation)}]") # matches any single character from string.punctuation
WHITESPACE_REGEX = re.compile(r'\s+') # matches a run of one or more whitespace characters
NUMBER_REGEX = re.compile(r'\d+') # matches a run of one or more digits

In [None]:
def clean_tweet(text: str) -> str:
    """Lowercase a tweet and strip punctuation, digits and English stopwords.

    Each whitespace-separated token is tested against ``STOP_WORDS`` twice:

    * once with only its leading/trailing punctuation stripped, so stopwords
      that contain an apostrophe (``"don't"``, ``"you're"``, ...) are matched
      while the apostrophe is still there. Testing only after punctuation
      removal would turn them into ``"dont"`` / ``"youre"``, which are not in
      the stopword set and would slip through.
    * once after all punctuation and digits have been removed, which keeps the
      previous behaviour for every other token.

    Args:
        text: Tweet text; converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces (no leading/trailing
        whitespace, so no extra whitespace normalisation is required).
    """
    kept = []
    for raw_token in str(text).lower().split():
        # Match apostrophe stopwords before the apostrophe is deleted.
        if raw_token.strip(string.punctuation) in STOP_WORDS:
            continue
        token = NUMBER_REGEX.sub("", PUNCT_REGEX.sub("", raw_token))
        # Skip tokens that were pure punctuation/digits or plain stopwords.
        if token and token not in STOP_WORDS:
            kept.append(token)
    return " ".join(kept)

In [None]:
# Overwrite the 'tweets' column with the output of clean_tweet
df['tweets'] = df['tweets'].apply(clean_tweet)

In [None]:
preview_tweets(df)

chatgpt optimizing language models dialogue
try talking chatgpt new ai system optimized dialogue feedback help us improve
chatgpt optimizing language models dialogue nntrending aiml article identified amp digested via granola machinedriven rss bot ramsey elbasheer
thrilled share chatgpt new model optimized dialog public free accessible everyone
minutes ago released new chatgpt nnand use right


# **Word Lemmatizer**

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_tweet(text: str) -> str:
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is passed to the shared ``lemmatizer`` and the results are joined
    back with single spaces. Anything that is not a ``str`` yields ``""``.
    """
    if isinstance(text, str):
        return " ".join(lemmatizer.lemmatize(token) for token in text.split())
    return ""


In [None]:
# Overwrite the 'tweets' column with the lemmatized text
df['tweets'] = df['tweets'].apply(lemmatize_tweet)

# **Tokenization**

In [None]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
def tokenize_tweet(text: str) -> list:
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Non-string input produces an empty list instead of an error.
    """
    return word_tokenize(text) if isinstance(text, str) else []

In [None]:
# Store the token lists in a separate column, next to the cleaned text
df['tweet_tokens'] = df['tweets'].apply(tokenize_tweet)

# **Saving to new csv**

In [None]:
print(df.head())

                                              tweets   labels  \
0         chatgpt optimizing language model dialogue  neutral   
1  try talking chatgpt new ai system optimized di...     good   
2  chatgpt optimizing language model dialogue nnt...  neutral   
3  thrilled share chatgpt new model optimized dia...     good   
4    minute ago released new chatgpt nnand use right      bad   

   has_non_printing                                       tweet_tokens  
0             False   [chatgpt, optimizing, language, model, dialogue]  
1             False  [try, talking, chatgpt, new, ai, system, optim...  
2             False  [chatgpt, optimizing, language, model, dialogu...  
3             False  [thrilled, share, chatgpt, new, model, optimiz...  
4              True  [minute, ago, released, new, chatgpt, nnand, u...  


In [None]:
print(df.columns.to_list())

['tweets', 'labels', 'has_non_printing', 'tweet_tokens']


In [None]:
# Leave the helper flag column out of the frame that gets written to disk
df_to_save = df.drop(columns=['has_non_printing'])

In [None]:
# Write the cleaned data; index=False keeps the row index out of the file.
# NOTE(review): 'tweet_tokens' holds Python lists, which presumably end up as
# plain text in the CSV - confirm that downstream code parses them back.
df_to_save.to_csv('Data/chatgpt_v1.csv', index=False)