In [1]:
# Run inside Google Colab (a later cell then mounts Drive and changes directory)
use_google_colab = False

# True: run the cleaning steps again; False: load the pre-cleaned data
clean_data_again = True

# Debug mode: a later cell truncates the data to a few samples when this is on
debug = True

In [2]:
if debug and clean_data_again:
  #clean_data_again = False
  print("Warning: debug mode is on and clean_data_again has been reset to False.")

if use_google_colab :
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/ColabNotebooks/sentiMentaL_tweets



In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

import sys
import os

# Put the directory one level above the current working directory at the front
# of the module search path (presumably so the `utils` package imported below
# can be resolved).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)


import utils.vocab_manip as vm
import utils.hashtag_dealing_methods as hdm
import utils.pooling as po
import utils.submission as sub

In [4]:
### DATA LOADING
# Read the word embedding matrix from disk and report its dimensions
word_embeddings = np.load('../embeddings.npy')
print('Word embeddings shape:', word_embeddings.shape)

# DataFrame view of the matrix, used here only to preview the first row
df_word_embeddings = pd.DataFrame(word_embeddings)
df_word_embeddings.head(1)


Word embeddings shape: (101298, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.024581,-0.109968,0.023675,-0.001796,0.093022,-0.010099,-0.025911,-0.012369,0.002293,0.036365,0.031491,-0.009959,-0.043053,-0.032535,0.010818,-0.005218,0.056776,0.025959,-0.095626,0.007661


In [5]:
# Read the test set tweets, one list entry per line of the file
with open('../twitter-datasets/test_data.txt', 'r', encoding='utf-8') as test_file:
    test_tweets = test_file.readlines()

# Wrap the raw lines in a DataFrame to preview the first entry
df_test_tweets = pd.DataFrame(test_tweets)
print('Test tweets:', df_test_tweets.head(1))

Test tweets:                                                    0
0  1,sea doo pro sea scooter ( sports with the po...


In [6]:
# Read the vocabulary file: one word per line, line endings stripped
with open('../vocab_cut.txt', 'r', encoding='utf-8') as vocab_file:
    vocabulary = vocab_file.read().splitlines()

# Preview the first 20 entries
print('Vocabulary:', vocabulary[:20])

Vocabulary: ['<user>', '!', 'i', 'the', '.', ',', 'to', 'you', '(', '<url>', 'a', '...', 'and', 'my', 'me', 'of', '?', 'is', 'for', 'in']


In [7]:
# Pair every vocabulary word with the embedding row at the same position
word_to_embedding = {
    token: word_embeddings[row]
    for row, token in enumerate(vocabulary)
}

# DataFrame built from the dict (one column per word), shown for inspection
df_word_embeddings = pd.DataFrame(word_to_embedding)
print('Word to embedding shape:', df_word_embeddings.shape)
df_word_embeddings.head(1)



Word to embedding shape: (20, 101298)


Unnamed: 0,<user>,!,i,the,.,",",to,you,(,<url>,...,#137-1,#122,#118,#115,#10persons,#0804releases,#07,#03,#001,
0,-0.024581,-0.017011,-0.029339,-0.004716,-0.021129,-0.020288,-0.034306,-0.071769,0.007265,-0.009445,...,-0.835783,-0.490954,0.003427,-0.072939,-1.144369,-0.222911,1.606001,1.19943,-2.057995,0.621154


In [8]:
# Read the positive training tweets, one list entry per line of the file
with open('../twitter-datasets/train_pos_full.txt', 'r', encoding='utf-8') as pos_file:
    pos_tweets = pos_file.readlines()

# Every positive tweet gets the label 1
pos_labels = np.full(len(pos_tweets), 1, dtype=int)

df_pos_tweets = pd.DataFrame(pos_tweets)
print('Positive tweets shape:', df_pos_tweets.shape)

df_pos_tweets.head(1)

Positive tweets shape: (1250000, 1)


Unnamed: 0,0
0,<user> i dunno justin read my mention or not ....


In [9]:
# Read the negative training tweets, one list entry per line of the file
with open('../twitter-datasets/train_neg_full.txt', 'r', encoding='utf-8') as neg_file:
    neg_tweets = neg_file.readlines()

# Every negative tweet gets the label -1
neg_labels = np.full(len(neg_tweets), -1, dtype=int)

df_neg_tweets = pd.DataFrame(neg_tweets)
print('Negative tweets shape:', df_neg_tweets.shape)
df_neg_tweets.head(1)

Negative tweets shape: (1250000, 1)


Unnamed: 0,0
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...


In [10]:

# Rebuild the per-class DataFrames from the raw tweet lists and report their shapes.
# NOTE(review): nothing is combined in this cell; positive and negative tweets
# remain in separate frames.
df_pos_tweets, df_neg_tweets = pd.DataFrame(pos_tweets), pd.DataFrame(neg_tweets)

print("positive tweets shape:", df_pos_tweets.shape)
print("negative tweets shape:", df_neg_tweets.shape)
df_pos_tweets.head(1)

positive tweets shape: (1250000, 1)
negative tweets shape: (1250000, 1)


Unnamed: 0,0
0,<user> i dunno justin read my mention or not ....


#### End of data loading

___________________________

In [11]:
if debug:
    # Keep only the first 10 tweets (and matching labels) of each set,
    # and the first 100 vocabulary entries.
    pos_tweets, neg_tweets, test_tweets = pos_tweets[:10], neg_tweets[:10], test_tweets[:10]
    pos_labels, neg_labels = pos_labels[:10], neg_labels[:10]
    vocabulary = vocabulary[:100]

In [12]:
# The tweet total counts train AND test tweets, while only the training tweets
# have labels, so the two numbers differ by len(test_tweets).
print('All tweets size:', sum(len(tweets) for tweets in (pos_tweets, neg_tweets, test_tweets)))
print('All labels size:', sum(len(lbls) for lbls in (pos_labels, neg_labels)))

All tweets size: 30
All labels size: 20


In [13]:
# Reorder the vocabulary and the word embeddings so that the most frequent
# words come first (as described by the original note; the helper lives in
# utils.vocab_manip and is not visible here).
vocabulary, word_embeddings = vm.reorder_vocabulary(
    pos_tweets,
    neg_tweets,
    test_tweets,
    vocabulary,
    word_embeddings,
    clean_data_again,
    save_counts=True,
)

In [14]:
# Process the hashtags of all three tweet sets against the vocabulary
# (per the original note: hashtags not in the vocabulary are removed;
# the helper lives in utils.hashtag_dealing_methods and is not visible here).
pos_tweets, neg_tweets, test_tweets = hdm.process_tweets_hashtags(
    pos_tweets,
    neg_tweets,
    test_tweets,
    vocabulary,
    clean_data_again,
)

In [15]:
# Save the words that are not in the vocabulary (per the original note;
# the helper lives in utils.vocab_manip and is not visible here).
vm.out_of_vocab_file(
    pos_tweets,
    neg_tweets,
    test_tweets,
    vocabulary,
    clean_data_again,
)

In [16]:
# Report the number of entries currently held in the vocabulary list
print("Vocab shape:", len(vocabulary))

Vocab shape: 100


In [17]:
### TRAINING THE LINEAR CLASSIFIER

# Pooling option handed to po.get_features. The spelling "weigth" is kept
# because it is the option string this notebook passes to the helper.
pooling_method = "weigth" # "mean", "max", "tfidf", "weigth"

# Build the feature representations for the training and test tweets
train_features, test_features = po.get_features(pooling_method, pos_tweets, neg_tweets, test_tweets, word_to_embedding, vocabulary, clean_data_again)

# Labels for the training tweets: positives first, then negatives
# (assumes po.get_features returns training features in the same order — TODO confirm)
labels = np.concatenate((pos_labels, neg_labels), axis=0)

train_features = np.array(train_features)
assert len(train_features) == len(labels), "Features and labels must be of the same length"

# Split the data into training and validation sets.
# train_test_split shuffles by default and is seeded through random_state, so
# the previous manual (unseeded) permutation, whose result was never used, has
# been removed.
X_train, X_val, y_train, y_val = train_test_split(train_features, labels, test_size=0.1, random_state=42)

# Initialize and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Validate on the held-out 10%
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 0.5


In [18]:
# Report how many test feature entries were produced
print('Test features:', len(test_features))

Test features: 10


In [19]:
### LINEAR CLASSIFIER PREDICTIONS

# Predict a label for every test tweet from the test features built earlier
y_test_pred = model.predict(test_features)

# NOTE(review): the ids are read from the test file on disk; in debug mode the
# test tweets are truncated, so the id and prediction counts may differ —
# confirm how sub.create_csv_submission handles that.
test_data_path = "../twitter-datasets/test_data.txt"
ids_test = sub.get_test_ids(test_data_path)
print(y_test_pred)

# Map the predictions onto the {-1, 1} label set in a NEW array: values <= 0
# become -1 and values > 0 become 1. Building a new array (instead of aliasing
# and assigning in place) leaves y_test_pred unmodified.
y_pred = np.where(y_test_pred > 0, 1, -1).astype(y_test_pred.dtype)
sub.create_csv_submission(ids_test, y_pred, "../submissions/submission_"+pooling_method+"_pooling_and_regression.csv")

[ 1 -1 -1  1  1  1  1  1 -1 -1]
