# Data Cleaning

In [None]:
import ast
import re

import pandas as pd

# Load the raw tweet export
df = pd.read_csv('output_tweets.csv')

# Step 1: Parse the ISO-8601 'created_at' strings (e.g. 2023-01-01T12:00:00.000Z)
# into datetimes so they are written back without the 'T' and 'Z' markers,
# a form that Excel can recognize.
df['created_at'] = pd.to_datetime(df['created_at'], format='%Y-%m-%dT%H:%M:%S.%fZ')

# Step 2: Collapse 'edit_history_tweet_ids' into a flag:
# 0 when the stored list holds exactly one id, 1 otherwise.
# SECURITY: the cell is the string form of a list. It used to be parsed with
# eval(), which executes arbitrary expressions found in the CSV;
# ast.literal_eval only accepts Python literals.
df['edit_history_tweet_ids'] = df['edit_history_tweet_ids'].apply(
    lambda ids: 0 if len(ast.literal_eval(ids)) == 1 else 1
)

# Step 3: Modify 'referenced_tweet_types' column
# Convert based on the conditions provided
def convert_referenced_tweet_types(val):
    """Encode a tweet's referenced-tweet types as a small integer code.

    Args:
        val: String form of a Python list of reference types as stored in
            the CSV, e.g. "['replied_to']" or "[]".

    Returns:
        0 for an empty list or any unrecognised combination,
        1 for exactly ['replied_to'],
        2 for exactly ['quoted'],
        3 when the list contains both 'quoted' and 'replied_to' (any order).

    Raises:
        ValueError, SyntaxError: if ``val`` is not a valid Python literal.
    """
    # SECURITY: this used to call eval(), which would execute any expression
    # present in the CSV cell. ast.literal_eval parses literals only and
    # yields the same list for well-formed input.
    types = ast.literal_eval(val)

    if not types:  # Empty list: the tweet references nothing
        return 0
    if types == ['replied_to']:
        return 1
    if types == ['quoted']:
        return 2
    if set(types) == {'quoted', 'replied_to'}:
        return 3
    return 0  # Default case, if unmatched

# Step 3 (continued): replace each list string with its integer code
df['referenced_tweet_types'] = df['referenced_tweet_types'].apply(convert_referenced_tweet_types)

# Empty tweets are read back from CSV as NaN (a float). Work on a string view
# of the column so the filters below treat them as empty text instead of
# crashing; such rows are kept, matching the previous `na=False` behaviour of
# the phrase filters.
tweet_text = df['text'].fillna('').astype(str)

# Step 4: Flag rows whose text contains more than 5 hashtags (#)
too_many_hashtags = tweet_text.str.count('#') > 5

# Step 4.5: Flag rows containing any known spam phrase (case-insensitive).
# Example of the first phrase in context:
# Tesla stocks are skyrocketing 🚀📈! Investors are already seeing big gains, what are you waiting for? Time to jump in and ride the wave!
spam_phrases = [
    "Tesla stocks are skyrocketing 🚀📈!",
    "memecoin",
    "bitcoin",
    "Buy https",
    "To place your order",
]
# The phrases are literal text, so escape them before joining into one regex.
spam_pattern = '|'.join(re.escape(phrase) for phrase in spam_phrases)
is_spam = tweet_text.str.contains(spam_pattern, case=False)

df = df[~too_many_hashtags & ~is_spam]

# Step 5: Save the modified dataframe to a new CSV file
df.to_csv('processed_output_tweets.csv', index=False, encoding='utf-8-sig')

print("CSV processing complete.")

In [None]:
import pandas as pd

# Read the raw user export
df = pd.read_csv('output_users.csv')

# Re-render 'created_at' as plain 'YYYY-MM-DD HH:MM:SS' text, which Excel reads as a date
parsed_created_at = pd.to_datetime(df['created_at'])
df['created_at'] = parsed_created_at.dt.strftime('%Y-%m-%d %H:%M:%S')

# Encode 'verified' numerically: True becomes 1, everything else becomes 0
df['verified'] = df['verified'].map(lambda flag: 1 if flag == True else 0)

# Keep only the first row seen for each user id
df = df.drop_duplicates(subset=['id'])

# Write the cleaned users out (utf-8-sig so Excel detects the encoding)
df.to_csv('processed_output_users.csv', encoding='utf-8-sig', index=False)

print("Data processing complete.")

## Combination

In [None]:
import pandas as pd

# Load both cleaned tables; users are de-duplicated on 'id' so the join
# cannot multiply tweet rows
tweets_df = pd.read_csv('processed_output_tweets.csv')
users_df = pd.read_csv('processed_output_users.csv').drop_duplicates(subset='id')

# Report the input sizes
print(f"Tweets DataFrame Size: {tweets_df.shape}")
print(f"Users DataFrame Size: {users_df.shape}")

# Left-join users onto tweets (tweets.author_id == users.id), drop the
# user-side id column produced by the join, then drop exact duplicate rows
merged_df = (
    tweets_df
    .merge(users_df, how='left', left_on='author_id', right_on='id')
    .drop(columns=['id_y'])
    .drop_duplicates()
)

# Report the merged size
print(f"Merged DataFrame Size: {merged_df.shape}")

# Persist the merged table with utf-8-sig encoding
merged_df.to_csv('Cleaned_merged_data.csv', encoding='utf-8-sig', index=False)
print("Data processing complete.")

In [None]:
import pandas as pd
import re

# Load the merged tweet/user table
df = pd.read_csv(filepath_or_buffer='Cleaned_merged_data.csv')

# String view of the tweet text, used only for the debug output below.
# NOTE(review): `.iloc[1:]` leaves the first row out of this preview —
# confirm that is intended.
text = df.loc[:, 'text'].iloc[1:].astype(str)

# Debug output: container type and the first few tweets
preview = text.head()
print(f"Data type: {type(text)}")
print(preview)

# Define function for text preprocessing
# HTML entities and their replacements. '&amp;' is handled last so that text
# such as '&amp;quot;' is unescaped exactly once.
_HTML_ENTITIES = (
    ("&nbsp;", " "),
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&quot;", ""),
    ("&apos;", ""),
    ("&times;", "×"),
    ("&divide;", "÷"),
    ("&amp;", "&"),
)

# A URL runs to the next whitespace but does not swallow trailing punctuation.
_URL_RE = re.compile(r'(?:https?|ftp)://\S*[^\s.,;:!?…)\]}\'"]')

# Emoticons, tried in this order. Every pattern requires the "eyes" so that a
# bare ')' , '(' or 'D' in ordinary text is never treated as an emoticon.
_EMOTICON_PATTERNS = (
    (re.compile(r':-?\)|\(-?:'), 'EMOT_SMILEY'),                # :) :-) (: (-:
    (re.compile(r':-?D(?!\w)|\b[xX]-?D\b'), 'EMOT_LAUGH'),      # :D :-D XD xD X-D
    (re.compile(r'<3|:\*'), 'EMOT_LOVE'),                       # <3 :*
    (re.compile(r';-?\)|;-?D(?!\w)|\(-;|;\('), 'EMOT_WINK'),    # ;) ;-) ;D (-; ;(
    (re.compile(r':-?\('), 'EMOT_FROWN'),                       # :( :-(
    (re.compile(r',\(|:\'\(?|:"|\(\('), 'EMOT_CRY'),            # ,( :' :'( :" ((
)


def preprocess_text(t):
    """Replace tweet artefacts with placeholder tokens.

    Steps, in order: flatten newlines, unescape HTML entities, replace URLs
    with ``URL``, hashtags with ``HASH_<tag>``, handles with ``HNDL_<name>``,
    emoticons with ``EMOT_*``, punctuation with space-padded ``PUNC_*``
    tokens, squeeze runs of three or more identical non-digit characters down
    to two, and remove quote characters.

    Args:
        t: Tweet text. ``None`` and NaN (how pandas represents an empty CSV
            cell) are treated as an empty tweet; other non-strings are
            converted with ``str``.

    Returns:
        The processed text with leading/trailing whitespace stripped.
    """
    if t is None or (isinstance(t, float) and t != t):  # t != t is True only for NaN
        return ''
    t = str(t)

    # Replace newline characters with a space to ensure each tweet occupies one line
    t = re.sub(r'[\r\n]+', ' ', t)

    # Replace HTML entities
    for entity, replacement in _HTML_ENTITIES:
        t = t.replace(entity, replacement)

    # Replace URLs first, so '#fragment' or '@' inside a URL is not mistaken
    # for a hashtag or handle
    t = _URL_RE.sub('URL', t)

    # Replace hashtags
    t = re.sub(r'#(\w+)', r'HASH_\1', t)

    # Replace handles
    t = re.sub(r'@(\w+)', r'HNDL_\1', t)

    # Replace emoticons
    for pattern, replacement in _EMOTICON_PATTERNS:
        t = pattern.sub(replacement, t)

    # Replace punctuation. Ellipses go first; otherwise their dots would be
    # consumed one by one by the period rule.
    t = re.sub(r'\.{3,}|…', ' PUNC_ELLP ', t)   # Replace ellipses
    t = re.sub(r'[!¡]', ' PUNC_EXCL ', t)       # Replace exclamation marks
    t = re.sub(r'[?¿]', ' PUNC_QUES ', t)       # Replace question marks
    t = re.sub(r'\.', ' PUNC_DOT ', t)          # Replace periods

    # Squeeze elongated runs ("soooo" -> "soo"). Digits are excluded so that
    # numbers such as 1000 are not altered.
    t = re.sub(r'(\D)\1{2,}', r'\1\1', t)

    # Remove extra single quotes and other unnecessary characters
    t = re.sub(r'[\'“”]', '', t)
    t = t.strip()

    return t

# Apply text preprocessing
cleaned_output_file = 'processed_cleaned_data.csv'

try:
    # Empty tweets are read back from CSV as NaN (a float); coerce them to ''
    # so a single missing value cannot abort the whole run.
    processed_texts = df['text'].fillna('').astype(str).apply(preprocess_text)
except Exception as e:
    # Best effort: report the failure and skip saving rather than crash.
    print(f"Error occurred during preprocessing: {e}")
    processed_texts = None  # Ensure the variable is defined
else:
    df['text'] = processed_texts  # Update the text column in the original DataFrame
    print(df.head())

# Check if processed_texts is successfully defined
if processed_texts is not None:
    # Save the results to a new CSV file
    df.to_csv(cleaned_output_file, index=False, encoding='utf-8-sig')

    # Report the file that was actually written
    print(f"Processing completed, results saved to '{cleaned_output_file}'")
else:
    print("Processing failed, no results saved.")

# Classification (Language)

In [None]:
import pandas as pd
from langid.langid import LanguageIdentifier, model
from collections import defaultdict

# Input file path
input_file = "processed_cleaned_data.csv"  # Provide your own

# Minimum langid confidence for a tweet to be trusted
confidence_threshold = 0.9  # In our setting, the default is 0.9

# Read from `input_file` (previously the path was hard-coded a second time,
# so changing `input_file` had no effect).
tweets = pd.read_csv(input_file)
# Empty tweets are read back as NaN (a float), which langid cannot classify;
# drop them and make sure every remaining value is a string.
tweets = tweets['text'].dropna().astype(str)

# Store trusted tweets
trusted_tweets = []
# Dictionary for counting languages
language_count = defaultdict(int)

# Initialize LanguageIdentifier; norm_probs=True is requested so the
# confidence can be compared against the threshold above
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

# Iterate over each tweet
for text in tweets:
    # Use langid.py for language identification
    lang, confidence = identifier.classify(text)
    # Count languages
    language_count[lang] += 1

    # Keep the tweet only if it is English with sufficient confidence
    if lang == 'en' and confidence >= confidence_threshold:
        trusted_tweets.append({'text': text, 'language': lang})  # Store tweet and language information

# NOTE(review): `trusted_tweets` is only counted here; it is never written to
# disk — confirm whether later stages are meant to use the filtered tweets.

# Print counts of different languages
print("Language counts:")
for lang, count in language_count.items():
    print(f"{lang}: {count}")
print(f"We got {len(trusted_tweets)} trusted.")

# Tokenization

In [None]:
import pandas as pd
from textblob import TextBlob

# Load the preprocessed tweets that will be tokenized
df = pd.read_csv(filepath_or_buffer="processed_cleaned_data.csv")

# Define a function for tokenization
def tokenize_text(text):
    """Split a tweet into word tokens with TextBlob.

    Args:
        text: The tweet text. Non-string values (e.g. the NaN that pandas
            yields for an empty CSV cell) are treated as an empty tweet.

    Returns:
        The ``words`` of ``TextBlob(text)``, or an empty list when ``text``
        is not a string.
    """
    # A tweet that became empty during preprocessing is re-read from CSV as
    # NaN (a float); TextBlob only accepts strings, so short-circuit here.
    if not isinstance(text, str):
        return []

    # Use TextBlob for tokenization
    return TextBlob(text).words

# Destination for the tokenized table
output_file = "tokenized_data.csv"

# Tokenize every tweet and attach the result as a new 'token' column
df = df.assign(token=df['text'].apply(tokenize_text))

# Show the first few rows as a sanity check
print(df.loc[:, ['text', 'token']].head())

# Write the table, including the new column, to disk
df.to_csv(output_file, encoding='utf-8-sig', index=False)

print(f"Tokenization completed and saved to '{output_file}'")

# Normalization

In [None]:
import pandas as pd
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Paths for input and output files.
# input_file is the tokenized CSV read further down in this cell; output_file
# is where the table with the added 'normalization' column is written.
input_file = "tokenized_data.csv"  # Provide your own
output_file = "normalized_data.csv"

# Normalization lexicon: maps informal spellings, abbreviations and English
# contractions to their expanded form.
# - Several values are multi-word phrases (e.g. "lol" -> "laugh out loud").
# - Contraction keys are written with the typographic apostrophe ’ (U+2019),
#   not the ASCII ', and some keys (e.g. "I’m") contain an uppercase letter,
#   so a lookup only succeeds when the token is presented in that exact form.
normalization_lexicon = {
    # Abbreviations and informal spellings
    "u": "you",
    "r": "are",
    "gr8": "great",
    "b4": "before",
    "l8r": "later",
    "cuz": "because",
    "pls": "please",
    "thx": "thanks",
    "smh": "shaking my head",
    "shuld": "should",
    "4": "for",
    "lol": "laugh out loud",
    "sucha": "such a",
    # Contractions
    "ain’t": "are not",
    "’s": "is",
    "aren’t": "are not",
    "can’t": "cannot",
    "can’t’ve": "cannot have",
    "’cause": "because",
    "could’ve": "could have",
    "couldn’t": "could not",
    "couldn’t’ve": "could not have",
    "didn’t": "did not",
    "doesn’t": "does not",
    "don’t": "do not",
    "hadn’t": "had not",
    "hadn’t’ve": "had not have",
    "hasn’t": "has not",
    "haven’t": "have not",
    "he’d": "he would",
    "he’d’ve": "he would have",
    "he’ll": "he will",
    "he’ll’ve": "he will have",
    "how’d": "how did",
    "how’d’y": "how do you",
    "how’ll": "how will",
    "I’d": "I would",
    "I’d’ve": "I would have",
    "I’ll": "I will",
    "I’ll’ve": "I will have",
    "I’m": "I am",
    "I’ve": "I have",
    "isn’t": "is not",
    "it’d": "it would",
    "it’d’ve": "it would have",
    "it’ll": "it will",
    "it’ll’ve": "it will have",
    "let’s": "let us",
    "ma’am": "madam",
    "mayn’t": "may not",
    "might’ve": "might have",
    "mightn’t": "might not",
    "mightn’t’ve": "might not have",
    "must’ve": "must have",
    "mustn’t": "must not",
    "mustn’t’ve": "must not have",
    "needn’t": "need not",
    "needn’t’ve": "need not have",
    "o’clock": "of the clock",
    "oughtn’t": "ought not",
    "oughtn’t’ve": "ought not have",
    "shan’t": "shall not",
    "sha’n’t": "shall not",
    "shan’t’ve": "shall not have",
    "she’d": "she would",
    "she’d’ve": "she would have",
    "she’ll": "she will",
    "she’ll’ve": "she will have",
    "should’ve": "should have",
    "shouldn’t": "should not",
    "shouldn’t’ve": "should not have",
    "so’ve": "so have",
    "that’d": "that would",
    "that’d’ve": "that would have",
    "there’d": "there would",
    "there’d’ve": "there would have",
    "they’d": "they would",
    "they’d’ve": "they would have",
    "they’ll": "they will",
    "they’ll’ve": "they will have",
    "they’re": "they are",
    "they’ve": "they have",
    "to’ve": "to have",
    "wasn’t": "was not",
    "we’d": "we would",
    "we’d’ve": "we would have",
    "we’ll": "we will",
    "we’ll’ve": "we will have",
    "we’re": "we are",
    "we’ve": "we have",
    "weren’t": "were not",
    "what’ll": "what will",
    "what’ll’ve": "what will have",
    "what’re": "what are",
    "what’ve": "what have",
    "when’ve": "when have",
    "where’d": "where did",
    "where’ve": "where have",
    "who’ll": "who will",
    "who’ll’ve": "who will have",
    "who’ve": "who have",
    "why’ve": "why have",
    "will’ve": "will have",
    "won’t": "will not",
    "won’t’ve": "will not have",
    "would’ve": "would have",
    "wouldn’t": "would not",
    "wouldn’t’ve": "would not have",
    "y’all": "you all",
    "y’all’d": "you all would",
    "y’all’d’ve": "you all would have",
    "y’all’re": "you all are",
    "y’all’ve": "you all have",
    "you’d": "you would",
    "you’d’ve": "you would have",
    "you’ll": "you will",
    "you’ll’ve": "you will have",
    "you’re": "you are",
    "you’ve": "you have"
}

# Load English stop words into a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been
# downloaded beforehand (nltk.download('stopwords')) — confirm.
stop_words = set(stopwords.words('english'))

def normalize_and_stem_tokens(tokens, lexicon):
    """Normalize, filter and stem the tokens of one tweet.

    For each token: placeholder tokens are dropped, the token is expanded via
    ``lexicon`` when a key matches, every resulting word is lowercased and
    stripped of non-word characters, stop words (module-level ``stop_words``)
    are removed, and the remainder is stemmed with ``PorterStemmer``.

    Args:
        tokens: Iterable of token strings.
        lexicon: Mapping from informal form / contraction to its expansion.
            Values may contain several space-separated words.

    Returns:
        List of stemmed tokens, in input order.
    """
    stemmer = PorterStemmer()  # Initialize PorterStemmer
    placeholder_markers = ("HASH_", "PUNC_", "EMOT_")

    normalized_tokens = []
    for token in tokens:
        # Filter out 'URL' tokens and tokens containing 'HASH_', 'PUNC_' or
        # 'EMOT_' (matched before lowercasing). Tokens containing 'HNDL_' are
        # not filtered, as before.
        if token == "URL" or any(marker in token for marker in placeholder_markers):
            continue

        # Look the token up BEFORE stripping punctuation: contraction keys
        # contain the typographic apostrophe (’) and some start with a capital
        # 'I', so lowercasing and stripping first could never match them.
        with_apostrophe = token.replace("'", "’")
        lowered = with_apostrophe.lower()
        stripped = re.sub(r'[^\w]', '', lowered)  # form used when no key matches

        expansion = stripped
        for key in (with_apostrophe, lowered, lowered.capitalize(), stripped):
            if key in lexicon:
                expansion = lexicon[key]
                break

        # An expansion may be several words ("laugh out loud"); handle each
        # word on its own so stop-word removal and stemming apply per word.
        for word in expansion.split():
            word = re.sub(r'[^\w]', '', word.lower())  # Remove non-word characters
            if word and word not in stop_words:
                normalized_tokens.append(stemmer.stem(word))  # Apply stemming

    return normalized_tokens

# Local import: ast is needed only to parse the stored token lists
import ast

# Read CSV file
df = pd.read_csv(input_file)

# Process each tweet
normalized_tokens_list = []

for tokens in df['token']:
    if isinstance(tokens, str):
        # The column holds the string form of a list. SECURITY: this used to
        # be parsed with eval(), which executes arbitrary expressions found in
        # the CSV; ast.literal_eval only accepts Python literals.
        tokens = ast.literal_eval(tokens)
    elif not isinstance(tokens, list):
        # Missing cell (read back as NaN): treat as a tweet with no tokens
        tokens = []

    # Normalize and stem tokens, then store the result
    normalized_tokens_list.append(
        normalize_and_stem_tokens(tokens, normalization_lexicon)
    )

# Add the normalized tokens as a new column in the DataFrame
df['normalization'] = normalized_tokens_list

# Save the updated DataFrame to a new CSV file
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Normalization completed and saved to '{output_file}'")