In [1]:
import sys
!{sys.executable} -m pip install textblob
!{sys.executable} -m pip install keras
!{sys.executable} -m pip install tensorflow



In [3]:
import pandas as pd
import nltk
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, KeyedVectors
from keras.models import Sequential
from keras import regularizers
from keras.layers import Dense, Dropout
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import class_weight
import sqlite3

# Load Data
conn = sqlite3.connect('database.sqlite')
query = "SELECT * FROM May2015 LIMIT 500000;"  # Load a larger portion of your data
df = pd.read_sql(query, conn)
conn.close()

# Preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    words = nltk.word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and word.isalpha()]
    return ' '.join(words)

df = df.dropna(subset=['body'])
df['body'] = df['body'].apply(preprocess_text)
df = df[df['body'].str.len() > 10]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanks04/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sanks04/nltk_data...


In [4]:
# Sentiment Analysis
df['polarity'] = df['body'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['subjectivity'] = df['body'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

# Word2Vec Embeddings
tfidf = TfidfVectorizer()
tfidf.fit(df['body'])
tfidf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

def get_average_word2vec(tokens_list, vector, tfidf_weights, k=300):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [vector[word] * tfidf_weights.get(word, 1) if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

# Assuming you have the model path
model_path = "GoogleNews-vectors-negative300.bin"
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
df['word2vec'] = df['body'].apply(lambda x: get_average_word2vec(x.split(), word2vec_model, tfidf_dict))


In [7]:
df['target'] = df['score'].apply(lambda x: 1 if x > 0 else 0)
X = df[['polarity', 'subjectivity']]  # Include Word2Vec embeddings as needed
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compute class weights
class_weights = class_weight.compute_class_weight('balanced', classes=[0, 1], y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

# Neural Network Model
model = Sequential()
model.add(Dense(units=128, activation='relu', kernel_regularizer=regularizers.l2(0.01), input_dim=X.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights_dict)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x417bc10d0>

In [8]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred]

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.0804331200912854

Classification Report:
               precision    recall  f1-score   support

           0       0.08      1.00      0.15      6626
           1       0.00      0.00      0.00     75753

    accuracy                           0.08     82379
   macro avg       0.04      0.50      0.07     82379
weighted avg       0.01      0.08      0.01     82379



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def predict_sentiment(polarity, subjectivity):
    input_data = np.array([[polarity, subjectivity]])
    prediction = model.predict(input_data)
    result = "Positive" if prediction >= 0.5 else "Negative"
    return result

result = predict_sentiment(-0.5, 0.5)
print(result)
