In [12]:
import sys
!{sys.executable} -m pip install textblob
!{sys.executable} -m pip install keras
!{sys.executable} -m pip install tensorflow



DATA LOADING

In [13]:
import pandas as pd
import nltk
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, KeyedVectors
from keras.models import Sequential
from keras.layers import Dense

import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('database.sqlite')

# Load the data from the 'May2015' table
query = "SELECT * FROM May2015 LIMIT 100000;"  # Loading 10,000 records as an example
df = pd.read_sql(query, conn)

# Close the connection
conn.close()


Data Preprocessing

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download necessary nltk resources (only need to run once)
# nltk.download('stopwords')
# nltk.download('punkt')

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    words = nltk.word_tokenize(text.lower())
    words = [stemmer.stem(word) for word in words if word not in stop_words and word.isalpha()]
    return ' '.join(words)

# Preprocessing steps
df = df.dropna(subset=['body'])  # Drop rows with missing comment text
df['body'] = df['body'].apply(preprocess_text)  # Preprocess text
df = df[df['body'].str.len() > 10]  # Filter out comments that are too short


3. Feature Engineering

Sentiment Analysis:


In [15]:
df['polarity'] = df['body'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['subjectivity'] = df['body'].apply(lambda x: TextBlob(x).sentiment.subjectivity)



Word2Vec Embeddings:
Load the model and get embeddings:

In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf.fit(df['body'])
tfidf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

def get_average_word2vec(tokens_list, vector, tfidf_weights, k=300):
    if len(tokens_list) < 1:
        return np.zeros(k)
    vectorized = [vector[word] * tfidf_weights.get(word, 1) if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

# Load Word2Vec model
model_path = "GoogleNews-vectors-negative300.bin"
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

df['word2vec'] = df['body'].apply(lambda x: get_average_word2vec(x.split(), word2vec_model, tfidf_dict))


Model Training

In [17]:
df['target'] = df['score'].apply(lambda x: 1 if x > 0 else 0)
X = df[['polarity', 'subjectivity']]  # You can add other columns if needed
y = df['target'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



from sklearn.utils import class_weight

# Compute class weights
class_weights = class_weight.compute_class_weight('balanced', 
                                                  classes=[0, 1], 
                                                  y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
model = Sequential()
model.add(Dense(units=128, activation='relu', input_dim=X.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights_dict)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3ce789690>

In [18]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")


Loss: 0.6875523328781128, Accuracy: 0.6812565922737122


In [19]:
# Getting predictions
y_pred = model.predict(X_test)
y_pred = [1 if p >= 0.5 else 0 for p in y_pred]  # Converting probabilities to binary labels

# Computing evaluation metrics
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.6812565965108338

Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.33      0.14      1222
           1       0.93      0.71      0.80     14885

    accuracy                           0.68     16107
   macro avg       0.51      0.52      0.47     16107
weighted avg       0.86      0.68      0.75     16107



In [20]:
def predict_sentiment_fixed(polarity, subjectivity, ups, controversiality):
    # Prepare the input data
    input_data = np.array([[polarity, subjectivity, ups, controversiality]])
    
    # Get model prediction
    prediction = model.predict(input_data)
    
    # Convert probability to binary label
    result = "Positive" if prediction >= 0.5 else "Negative"
    
    return result

# Predict using the fixed function
result_fixed = predict_sentiment_fixed(-5, 5, 5, 1
                                    )
result_fixed


ValueError: in user code:

    File "/Users/sanks04/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 2341, in predict_function  *
        return step_function(self, iterator)
    File "/Users/sanks04/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 2327, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/sanks04/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 2315, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/sanks04/anaconda3/lib/python3.11/site-packages/keras/src/engine/training.py", line 2283, in predict_step
        return self(x, training=False)
    File "/Users/sanks04/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/sanks04/anaconda3/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 2), found shape=(None, 4)


In [None]:
print(model.input_shape)
