In [1]:
# Import necessary libraries
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#Load the Data
df = pd.read_csv('/content/tweets.csv')

In [4]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


Data Preprocessing

In [5]:
#Text Cleaning
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = text.strip()
    return text

In [7]:
# Apply the clean_text function to the text column
df['cleaned_tweet'] = df['tweet'].apply(clean_text)

In [9]:
# Display the cleaned text
df[['tweet', 'cleaned_tweet']].head()

Unnamed: 0,tweet,cleaned_tweet
0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beaut...
1,Finally a transparant silicon case ^^ Thanks t...,finally a transparant silicon case thanks to ...
2,We love this! Would you go? #talk #makememorie...,we love this would you go talk makememories un...
3,I'm wired I know I'm George I was made that wa...,im wired i know im george i was made that way ...
4,What amazing service! Apple won't even talk to...,what amazing service apple wont even talk to m...


In [11]:
#Tokenization
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
from nltk.tokenize import word_tokenize

In [15]:
def tokenize_text(text):
    # Tokenize the cleaned text
    tokens = word_tokenize(text)
    return tokens

In [16]:
# Apply the tokenize_text function to the cleaned_tweet column
df['tokenized_tweet'] = df['cleaned_tweet'].apply(tokenize_text)

In [17]:
# Display the tokenized text
df[['cleaned_tweet', 'tokenized_tweet']].head()

Unnamed: 0,cleaned_tweet,tokenized_tweet
0,fingerprint pregnancy test android apps beaut...,"[fingerprint, pregnancy, test, android, apps, ..."
1,finally a transparant silicon case thanks to ...,"[finally, a, transparant, silicon, case, thank..."
2,we love this would you go talk makememories un...,"[we, love, this, would, you, go, talk, makemem..."
3,im wired i know im george i was made that way ...,"[im, wired, i, know, im, george, i, was, made,..."
4,what amazing service apple wont even talk to m...,"[what, amazing, service, apple, wont, even, ta..."


In [19]:
#Removing Stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
from nltk.corpus import stopwords

In [22]:
stop_words = set(stopwords.words('english'))

In [23]:
def remove_stopwords(tokens):
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

In [24]:
# Apply the remove_stopwords function to the tokenized_tweet column
df['filtered_tweet'] = df['tokenized_tweet'].apply(remove_stopwords)

In [25]:
# Display the filtered text
df[['tokenized_tweet', 'filtered_tweet']].head()

Unnamed: 0,tokenized_tweet,filtered_tweet
0,"[fingerprint, pregnancy, test, android, apps, ...","[fingerprint, pregnancy, test, android, apps, ..."
1,"[finally, a, transparant, silicon, case, thank...","[finally, transparant, silicon, case, thanks, ..."
2,"[we, love, this, would, you, go, talk, makemem...","[love, would, go, talk, makememories, unplug, ..."
3,"[im, wired, i, know, im, george, i, was, made,...","[im, wired, know, im, george, made, way, iphon..."
4,"[what, amazing, service, apple, wont, even, ta...","[amazing, service, apple, wont, even, talk, qu..."


In [27]:
#stemming
from nltk.stem import PorterStemmer

In [28]:
stemmer = PorterStemmer()

In [29]:
def stem_tokens(tokens):
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return stemmed_tokens

In [30]:
# Apply the stem_tokens function to the filtered_tweet column
df['stemmed_tweet'] = df['filtered_tweet'].apply(stem_tokens)

In [31]:
# Display the stemmed text
df[['filtered_tweet', 'stemmed_tweet']].head()

Unnamed: 0,filtered_tweet,stemmed_tweet
0,"[fingerprint, pregnancy, test, android, apps, ...","[fingerprint, pregnanc, test, android, app, be..."
1,"[finally, transparant, silicon, case, thanks, ...","[final, transpar, silicon, case, thank, uncl, ..."
2,"[love, would, go, talk, makememories, unplug, ...","[love, would, go, talk, makememori, unplug, re..."
3,"[im, wired, know, im, george, made, way, iphon...","[im, wire, know, im, georg, made, way, iphon, ..."
4,"[amazing, service, apple, wont, even, talk, qu...","[amaz, servic, appl, wont, even, talk, questio..."


In [32]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [33]:
lemmatizer = WordNetLemmatizer()

In [34]:
def lemmatize_tokens(tokens):
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmatized_tokens

In [35]:
# Apply the lemmatize_tokens function to the stemmed_tweet column
df['lemmatized_tweet'] = df['stemmed_tweet'].apply(lemmatize_tokens)

In [36]:
# Display the lemmatized text
df[['stemmed_tweet', 'lemmatized_tweet']].head()

Unnamed: 0,stemmed_tweet,lemmatized_tweet
0,"[fingerprint, pregnanc, test, android, app, be...","[fingerprint, pregnanc, test, android, app, be..."
1,"[final, transpar, silicon, case, thank, uncl, ...","[final, transpar, silicon, case, thank, uncl, ..."
2,"[love, would, go, talk, makememori, unplug, re...","[love, would, go, talk, makememori, unplug, re..."
3,"[im, wire, know, im, georg, made, way, iphon, ...","[im, wire, know, im, georg, made, way, iphon, ..."
4,"[amaz, servic, appl, wont, even, talk, questio...","[amaz, servic, appl, wont, even, talk, questio..."


In [37]:
#Vectorization
# Since TF-IDF vectorizer expects a string, we need to join the tokens back into a string
df['lemmatized_tweet_str'] = df['lemmatized_tweet'].apply(lambda x: ' '.join(x))

In [39]:
# Split the data into training and testing sets
X = df['lemmatized_tweet_str']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [41]:
# Display the shape of the TF-IDF matrices
print(f'Training data shape: {X_train_tfidf.shape}')
print(f'Testing data shape: {X_test_tfidf.shape}')

Training data shape: (6336, 13095)
Testing data shape: (1584, 13095)


 Model Training

In [42]:
from sklearn.linear_model import LogisticRegression

In [43]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

Model Evaluation

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

In [46]:
print(f'Accuracy: {accuracy}')

Accuracy: 0.8705808080808081


Built a system that can accurately classify the new tweets
sentiments with Accuracy: 0.8705808080808081