In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load training data
training_data = pd.read_csv('training_data.csv')

# Separate data based on labels
positive_samples = training_data[training_data['label'] == 1]
negative_samples = training_data[training_data['label'] == 0]

# Randomly select the same number of negative samples as positive samples
balanced_negative_samples = negative_samples.sample(n=len(positive_samples), random_state=42)

# Combine positive and balanced negative samples
balanced_training_data = pd.concat([positive_samples, balanced_negative_samples])

# Shuffle the balanced training data
balanced_training_data = balanced_training_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatization
    return ' '.join(tokens)

balanced_training_data['clean_text'] = balanced_training_data['tweet'].apply(preprocess_text)

# Feature extraction
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(balanced_training_data['clean_text'])
y = balanced_training_data['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

# Model evaluation
y_pred = logistic_regression_model.predict(X_test)
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

           0       0.85      0.87      0.86       473
           1       0.85      0.83      0.84       424

    accuracy                           0.85       897
   macro avg       0.85      0.85      0.85       897
weighted avg       0.85      0.85      0.85       897



In [11]:
# Load testing data
testing_data = pd.read_csv('test_tweets.csv')

# Preprocessing for testing data
testing_data['clean_text'] = testing_data['tweet'].apply(preprocess_text)

# Transform testing data using the same TF-IDF vectorizer
X_test_new = tfidf_vectorizer.transform(testing_data['clean_text'])

# Predict sentiment for testing data
testing_data['sentiment'] = logistic_regression_model.predict(X_test_new)

# Print the testing data with predicted sentiments
print(testing_data)


          id                                              tweet  \
0      31963  #studiolife #aislife #requires #passion #dedic...   
1      31964   @user #white #supremacists want everyone to s...   
2      31965  safe ways to heal your #acne!!    #altwaystohe...   
3      31966  is the hp and the cursed child book up for res...   
4      31967    3rd #bihday to my amazing, hilarious #nephew...   
...      ...                                                ...   
17192  49155  thought factory: left-right polarisation! #tru...   
17193  49156  feeling like a mermaid ð #hairflip #neverre...   
17194  49157  #hillary #campaigned today in #ohio((omg)) &am...   
17195  49158  happy, at work conference: right mindset leads...   
17196  49159  my   song "so glad" free download!  #shoegaze ...   

                                              clean_text  sentiment  
0      studiolife aislife requires passion dedication...          0  
1      user white supremacist want everyone see new m..

In [9]:
num_unique_values = training_data.nunique()

# Print the number of unique values
print("Number of unique values in the dataset:")
print(num_unique_values)

# Get the frequency of each unique value
value_counts = training_data['label'].value_counts()

# Print the frequency of each unique value
print("\nFrequency of each unique value:")
print(value_counts)

Number of unique values in the dataset:
id            31962
label             2
tweet         29530
clean_text    29081
dtype: int64

Frequency of each unique value:
label
0    29720
1     2242
Name: count, dtype: int64
