In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# Fetch the NLTK resources used by the preprocessing cell below.
# Newer NLTK releases make word_tokenize() look up 'punkt_tab' rather than
# 'punkt'; requesting both keeps a fresh Restart & Run All working on old and
# new installs alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
# Kept last so the cell's displayed value is still this call's return value.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saurabh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
# Sample data: two hand-written tweets about the same topic, one per sentiment
data = dict(
    id=[1, 2],
    topic=['Borderlands'] * 2,
    sentiment=['Positive', 'Negative'],
    tweet=[
        "I'm getting on Borderlands and I will murder you all!",
        "This game is so bad, I can't believe I bought it.",
    ],
)

# Column order follows the insertion order of the dict keys above.
df = pd.DataFrame(data)
print(df.head())

   id        topic sentiment  \
0   1  Borderlands  Positive   
1   2  Borderlands  Negative   

                                               tweet  
0  I'm getting on Borderlands and I will murder y...  
1  This game is so bad, I can't believe I bought it.  


In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared resources built once and reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks while filtering tokens.
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Normalize one tweet into a space-separated string of lemmatized tokens.

    Steps: lowercase, delete URLs / @mentions / #hashtags and every character
    that is not a-z or whitespace, tokenize, drop English stopwords, lemmatize.

    Args:
        text: Raw tweet text. Non-string values (e.g. None, or the float NaN
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        str: The surviving tokens joined by single spaces; "" when nothing
        survives cleaning or when ``text`` is not a string.
    """
    # Missing tweets reach Series.apply() as NaN/None; without this guard
    # text.lower() raises AttributeError and aborts the whole column.
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove URLs, mentions, hashtags, non-alphabetic characters.
    # Matches are deleted (not replaced by a space), so "i'm" becomes "im"
    # and is then no longer caught by the stopword filter below.
    text = re.sub(r"http\S+|@\w+|#\w+|[^a-z\s]", "", text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords & lemmatize (lemmatize() is called without a POS tag)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Derive the normalized text column, then show it next to the raw tweet
# so the effect of preprocess() can be checked by eye.
df['clean_text'] = df['tweet'].map(preprocess)
print(df.loc[:, ['tweet', 'clean_text']])

                                               tweet  \
0  I'm getting on Borderlands and I will murder y...   
1  This game is so bad, I can't believe I bought it.   

                     clean_text  
0  im getting borderland murder  
1  game bad cant believe bought  


In [4]:
# Turn the sentiment strings into integer class ids.
# The fitted encoder is kept so the ids can be mapped back to labels later.
label_encoder = LabelEncoder().fit(df['sentiment'])
df['sentiment_encoded'] = label_encoder.transform(df['sentiment'])
print(df.loc[:, ['sentiment', 'sentiment_encoded']])

  sentiment  sentiment_encoded
0  Positive                  1
1  Negative                  0


In [5]:
# Fit a default-configured TF-IDF vectorizer on the cleaned tweets and
# transform them in the same pass.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['clean_text'])

# Dense, labelled copy of the matrix for inspection: one row per tweet,
# one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)
print(tfidf_df.head())

        bad   believe  borderland    bought      cant      game  getting   im  \
0  0.000000  0.000000         0.5  0.000000  0.000000  0.000000      0.5  0.5   
1  0.447214  0.447214         0.0  0.447214  0.447214  0.447214      0.0  0.0   

   murder  
0     0.5  
1     0.0  


In [6]:
# Persist both frames as CSV in the working directory, without the index column
for frame, filename in ((df, "cleaned_tweets.csv"), (tfidf_df, "tfidf_features.csv")):
    frame.to_csv(filename, index=False)
print("Files saved: 'cleaned_tweets.csv' and 'tfidf_features.csv'")

Files saved: 'cleaned_tweets.csv' and 'tfidf_features.csv'
