In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
import regex as re
from nltk.tokenize import word_tokenize
!pip install gensim
import gensim.downloader as api
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.multioutput import MultiOutputClassifier
nltk.download('stopwords')  # Download stopwords list if not already present
from sklearn.decomposition import PCA

# English stop-word list from NLTK, wrapped in a set for cheap membership tests.
# NOTE(review): no later cell visible here reads `stop_words` - confirm it is still needed.
stop_words = set(stopwords.words('english'))  # Create a set for efficient lookup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




# **Word Embeddings with Twitter Data**

In [None]:
# The column names this cell used to rely on ('2401', 'Borderlands', 'Positive'
# and a complete tweet) are data values, i.e. the file has no header row.
# Reading it with header=None keeps that first record as data and lets the
# columns get stable, meaningful names instead.
# 'TweetId' and 'Entity' are placeholder labels for the two columns that are
# discarded immediately below.
raw_df = pd.read_csv(
    "twitter_training.csv",
    header=None,
    names=["TweetId", "Entity", "Sentiment", "Content"],
)

# Keep only the tweet text and its sentiment label; .copy() gives an
# independent frame so later column assignments do not warn about views.
df = raw_df[["Content", "Sentiment"]].copy()

In [None]:
# Preview the last 10 rows of the frame.
df.tail(10)

In [None]:
# Give the tweet-text column a short, readable name.
content_column_map = {"im getting on borderlands and i will murder you all ,": "Content"}
df = df.rename(columns=content_column_map)

# **Split Each Tweet into Word Tokens**

In [None]:
# Missing tweets must become empty strings *before* the string cast:
# astype(str) on its own turns NaN into the literal text "nan", which would
# then be tokenised to ['nan'] and counted as a real word.
# An empty string tokenises to an empty list instead.
df['Content'] = df['Content'].fillna('').astype(str).apply(word_tokenize)

# **Run Word2Vec Model on Data to place into Vectors**

In [None]:
# Skip-gram (sg=1) Word2Vec over the tokenised tweets: 100-dimensional vectors,
# context window of 5 tokens.
# seed pins the random initialisation so repeated runs start from the same state.
# NOTE(review): per the gensim docs, bit-for-bit reproducibility additionally
# needs workers=1 and a fixed PYTHONHASHSEED - add those if exact repeatability matters.
model = Word2Vec(df['Content'].tolist(), window=5, vector_size=100, sg=1, seed=42)

# **Show Each Word's 100-Dimensional Vector**

In [None]:
# Collect one embedding per token occurrence, walking the tweets in order and
# skipping any token that has no entry in the trained model.
vectors = []
for tokens in df['Content']:
    for word in tokens:
        if word in model.wv:
            # model.wv[word] looks up the learned vector for that word
            vectors.append(model.wv[word])

In [None]:
# Show only the first few entries: `vectors` holds one array per in-vocabulary
# token occurrence, so echoing the whole list would flood the cell output.
vectors[:5]

# **Calculate Average Vector Position across all Words in Review**

In [None]:
# Represent each tweet by the mean of its tokens' embeddings.
def _average_embedding(tokens):
    """Return the element-wise mean of the embeddings of `tokens`.

    Tokens without an entry in `model.wv` are ignored. If no token has an
    entry, a zero vector of length `model.vector_size` is returned instead.
    """
    known = [model.wv[word] for word in tokens if word in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)


document_vectors = [_average_embedding(tokens) for tokens in df['Content']]

df['Vectors'] = document_vectors

In [None]:
# Display the 'Vectors' column.
df['Vectors']

In [None]:
# Count how many rows carry each Sentiment label.
df['Sentiment'].value_counts()

# **One-Hot Encoder**

In [None]:
### One-hot encode the Sentiment labels ###

# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# the old spelling was removed in 1.4, so passing either name breaks on one side
# of the rename. Taking the default (sparse) result and densifying it with
# .toarray() yields the same dense array on every version.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df[['Sentiment']]).toarray()

In [None]:
# NOTE(review): the names are reversed relative to the usual convention -
# X is the one-hot sentiment label matrix and y is the embedding column.
sentiment_columns = encoder.get_feature_names_out(['Sentiment'])
X = pd.DataFrame(encoded_data, columns=sentiment_columns)

# One averaged embedding per row, taken straight from the frame.
y = df['Vectors']

X

# **Split Data into Train and Test**

In [None]:
# Hold out 30% of the rows for evaluation. random_state pins the shuffle so a
# re-run of the notebook produces the same partition (and comparable scores).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# **Run ML model**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# The split hands its halves back under swapped names: X_train / X_test hold the
# one-hot sentiment columns, while y_train / y_test hold the averaged embedding
# of each row. Previously the classifier was fitted on the one-hot labels to
# predict "first embedding coordinate > 0.5", which says nothing about
# sentiment. Here the roles are put right: embeddings in, sentiment label out.

# Stack the per-row embedding arrays into 2-D feature matrices.
train_embeddings = np.vstack(y_train.to_numpy())
test_embeddings = np.vstack(y_test.to_numpy())

# Collapse each one-hot row back to a single label: idxmax(axis=1) returns the
# name of the column that holds the 1 for that row.
y_train_updated = X_train.idxmax(axis=1).to_numpy()
y_test_updated = X_test.idxmax(axis=1).to_numpy()

clf = KNeighborsClassifier()
clf.fit(train_embeddings, y_train_updated)

# Keep the predictions instead of discarding them, so they can be inspected.
y_pred = clf.predict(test_embeddings)

# Mean accuracy on the held-out rows (last expression, so the notebook shows it).
clf.score(test_embeddings, y_test_updated)

# **Dimensionality Reduction using PCA**

In [None]:
### Project the document vectors down to three dimensions for plotting ###

# Stack the per-row arrays stored in df['Vectors'] into one 2-D matrix.
vector_data = np.vstack(df['Vectors'].tolist())

# Fit a 3-component PCA and project the matrix onto those components.
pca = PCA(n_components=3)
threedvec = pca.fit_transform(vector_data)

In [None]:
# Store each of the 3 PCA components as its own DataFrame column.
component_columns = ('threedvec_1', 'threedvec_2', 'threedvec_3')
for position, column in enumerate(component_columns):
    df[column] = threedvec[:, position]

# **Plot showing Vector position of Review**

In [None]:
# 3-D scatter of the PCA-reduced document vectors, coloured by sentiment.

# The .cat accessor only exists on categorical columns, and no cell visible
# upstream converts df['Sentiment'] to that dtype, so convert a local copy here.
# The categorical gives a stable integer code per label (for colouring) and the
# matching ordered label names (for the legend).
sentiment = df['Sentiment'].astype('category')
sentiment_names = sentiment.cat.categories

# One discrete colour per sentiment label. plt.get_cmap is used because
# plt.cm.get_cmap is no longer available in recent Matplotlib releases.
# (The seaborn palette built here before was never used, and seaborn is not
# imported by this notebook.)
cmap = plt.get_cmap('tab10', len(sentiment_names))

fig = plt.figure(figsize=(10, 8))  # Adjust figure size as needed
ax = fig.add_subplot(111, projection='3d')

# Create the 3D scatter plot with color based on 'Sentiment'
scatter = ax.scatter(df['threedvec_1'], df['threedvec_2'], df['threedvec_3'],
                     c=sentiment.cat.codes, cmap=cmap)

# Set labels and title
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_zlabel('PCA Component 3')
ax.set_title('Sentiment Visualization (3D)')

# Legend: take the colour handles from the scatter and label them with the
# sentiment names rather than the numeric codes legend_elements() generates.
handles, _numeric_labels = scatter.legend_elements()
legend = ax.legend(handles, sentiment_names, loc="upper right", title="Sentiment")

plt.show()