This notebook explores different preprocessing techniques and basic data cleansing.
Based on the cleaned data, we try various methods to extract features.
We also train multiple ML/ANN models and compare their accuracy on this data.

Next:
    We will look into this data again and try to clean it further so that we can achieve
     better accuracy, recall and precision.

     It is left to the reader to look into the data closely and create a better corpus.

## Data Preprocessing

In [None]:
## read data

# Load the raw comments, then keep only the rows that actually carry comment text
# (re-indexed from 0 so positional and label-based access agree).
utube_df_a = pd.read_csv('../Data/YoutubeCommentsDataSet.csv')
utube_df = utube_df_a.dropna(subset=['Comment']).reset_index(drop=True)
# utube_df.head()

In [None]:
## check for class imbalance
# Label counts after, and before, dropping the rows with a missing comment.
utube_df['Sentiment'].value_counts(), utube_df_a['Sentiment'].value_counts()

In [None]:
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# NOTE: despite the variable name, this is a Porter stemmer, not a Snowball one.
stemmer_snowball = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') for every
# token rebuilds the list each time and turns the loop into a major bottleneck.
english_stopwords = set(stopwords.words('english'))

# Stemmed version of every comment, plus the row positions that could not be processed.
# NOTE(review): a rejected row is skipped, so the corpus would become shorter than
# utube_df; anything pairing this corpus with utube_df columns assumes no rejects.
utube_corpus_snow = []
rejected_snow = []
for i, comment in enumerate(utube_df['Comment']):
    try:
        # Keep ASCII letters only. The previous class [^a-zA-z] used the range
        # A-z, which also let the characters [ \ ] ^ _ ` through.
        tokens = re.sub("[^a-zA-Z]", " ", comment).lower().split()
    except TypeError:
        # The comment is not a string (e.g. a numeric cell): record and move on.
        print(i)
        rejected_snow.append(i)
        continue
    utube_corpus_snow.append(
        ' '.join(stemmer_snowball.stem(word) for word in tokens if word not in english_stopwords)
    )

In [None]:
### use lemmatizer for standardizing the words across
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. Calling stopwords.words('english') for every
# token rebuilds the list each time and turns the loop into a major bottleneck.
english_stopwords = set(stopwords.words('english'))

# Lemmatized version of every comment, plus the row positions that could not be processed.
# NOTE(review): a rejected row is skipped, so utube_corpus would become shorter than
# utube_df; the feature cells pair this corpus with utube_df['Sentiment'] and
# therefore assume `rejected` stays empty.
utube_corpus = []
rejected = []
for i, comment in enumerate(utube_df['Comment']):
    try:
        # Keep ASCII letters only. The previous class [^a-zA-z] used the range
        # A-z, which also let the characters [ \ ] ^ _ ` through.
        tokens = re.sub("[^a-zA-Z]", " ", comment).lower().split()
    except TypeError:
        # The comment is not a string (e.g. a numeric cell): record and move on.
        print(i)
        rejected.append(i)
        continue
    utube_corpus.append(
        ' '.join(lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords)
    )

## feature creation Using BOW 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words over 1- to 4-grams, with the vocabulary capped at 10,000 features.
cv = CountVectorizer(max_features=10000, ngram_range=(1,4))
# Dense count matrix built from the lemmatized corpus (one row per comment).
X = cv.fit_transform(utube_corpus).toarray()
# Integer-encode the sentiment labels through pandas category codes.
Y = utube_df['Sentiment'].astype('category').cat.codes

In [None]:
# Class distribution of the target label.
utube_df['Sentiment'].value_counts()

In [None]:
#### train, test split
# 80/20 split, stratified on the label so both sides keep the class proportions;
# random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=0)
# Sanity-check the resulting shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# from imblearn.combine import SMOTEENN
# from collections import Counter

# # Applying SMOTE + ENN
# smote_enn = SMOTEENN(random_state=42)
# X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# print("Class Distribution After SMOTEENN:", Counter(y_resampled))


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Apply plain SMOTE (there is no ENN cleaning step here): oversample every class
# except the majority one, using the training split only so the test set stays untouched.
smote = SMOTE(random_state=42, sampling_strategy='not majority')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# NOTE(review): the visible model cells fit on X_train / y_train, not on
# X_resampled / y_resampled — confirm whether the resampled data should be used.
print("Class Distribution After SMOTE:", Counter(y_resampled))


### 1 - model multinomial logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Multinomial logistic regression fitted on the (non-resampled) training split.
model_logistic = LogisticRegression(max_iter=500, multi_class='multinomial', solver='lbfgs')
model_logistic.fit(X_train, y_train)

# Hard label predictions, then the matching class probabilities.
y_train_pred = model_logistic.predict(X_train)
y_test_pred = model_logistic.predict(X_test)
y_train_pred_prob = model_logistic.predict_proba(X_train)
y_test_pred_prob = model_logistic.predict_proba(X_test)

# Train / test accuracy first, then the per-class report for test followed by train.
print(accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred))
for true_labels, predicted_labels in ((y_test, y_test_pred), (y_train, y_train_pred)):
    print('\n')
    print(classification_report(true_labels, predicted_labels))

### 2 - model Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, class-balanced weights, log-loss split criterion,
# fitted on the training split with a fixed seed.
model_randomforest = (
    RandomForestClassifier(
        n_estimators=100,
        criterion='log_loss',
        class_weight='balanced',
        random_state=0,
    )
    .fit(X_train, y_train)
)

In [None]:
# Hard label predictions, then the matching class probabilities.
y_train_pred = model_randomforest.predict(X_train)
y_test_pred = model_randomforest.predict(X_test)
y_train_pred_prob = model_randomforest.predict_proba(X_train)
y_test_pred_prob = model_randomforest.predict_proba(X_test)

# Train / test accuracy first, then the per-class report for test followed by train.
print(accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred))
for true_labels, predicted_labels in ((y_test, y_test_pred), (y_train, y_train_pred)):
    print('\n')
    print(classification_report(true_labels, predicted_labels))

### 3 - model multinomial XGBoost

In [None]:
# Per-class sample counts and the overall number of labelled rows.
class_counts = np.bincount(Y)
total_samples = len(Y)

# Weight of a class = (rows of all other classes) / (rows of this class),
# so rarer classes receive larger weights.
class_weights = {
    label: (total_samples - count) / count
    for label, count in enumerate(class_counts)
}
max_weight = max(class_weights.values())
print("Class Weights:", class_weights)

# One weight per training row, looked up from the row's class.
sample_weights = np.array([class_weights[row_label] for row_label in y_train])

In [None]:
from xgboost import XGBClassifier

# Multi-class XGBoost with class-imbalance handling through per-sample weights.
# `scale_pos_weight` only applies to binary objectives, so the previous
# `scale_pos_weight=max_weight` gave no balancing for this 'multi:softmax' model.
# The per-class `sample_weights` computed in the preceding cell are passed to
# `fit` instead, which is what they were built for.
xgb_model = XGBClassifier(objective='multi:softmax', num_class=len(class_counts))
xgb_model.fit(X_train, y_train, sample_weight=sample_weights)


In [None]:
# Class predictions of the XGBoost model on both splits (used by the report cell).
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

In [None]:
# Test accuracy, followed by the per-class report for test and then for train.
print(accuracy_score(y_test, y_test_pred))
for true_labels, predicted_labels in ((y_test, y_test_pred), (y_train, y_train_pred)):
    print("\n\n")
    print(classification_report(true_labels, predicted_labels))

### 4 - using backpropagation with ANN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop

In [None]:
### update data for keras
# One-hot encode the integer labels (3 classes) for the categorical_crossentropy loss.
y_train_k = keras.utils.to_categorical(y_train, num_classes=3)
# NOTE(review): the resampled targets are prepared here, but the visible fit call
# trains on X_train / y_train_k — confirm whether the SMOTE data should be used.
y_train_k_resampled = keras.utils.to_categorical(y_resampled, num_classes=3)
y_test_k = keras.utils.to_categorical(y_test, num_classes=3)

In [None]:
# Feed-forward classifier: 1000 -> 200 hidden ReLU units with 10% dropout after
# each hidden layer, ending in a 3-way softmax. Input width follows X_train.
model_ann = Sequential([
    Dense(1000, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.1),
    Dense(200, activation='relu'),
    Dropout(0.1),
    Dense(3, activation='softmax'),
])
model_ann.summary()

In [None]:
# RMSprop with default settings; categorical cross-entropy expects one-hot targets.
model_ann.compile(optimizer=RMSprop(),
            loss='categorical_crossentropy',
            metrics=['accuracy'])

In [None]:
# Train for 10 epochs. The validation targets must be one-hot (y_test_k) to match
# the 3-unit softmax output and the categorical_crossentropy loss; the
# integer-coded y_test used previously does not have that shape.
history = model_ann.fit(X_train, y_train_k,
                    epochs=10, batch_size=32,
                    validation_data=(X_test, y_test_k),
                    verbose=1)

In [None]:
# Per-epoch curves recorded by Keras during training.
train_acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
train_loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(1, len(history.history['accuracy']) + 1)

# Two side-by-side panels: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 5))
panels = (
    (train_acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'Accuracy', 'Training & Validation Accuracy'),
    (train_loss, val_loss, 'Training Loss', 'Validation Loss',
     'Loss', 'Training & Validation Loss'),
)
for position, (train_curve, val_curve, train_label, val_label, y_label, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs, train_curve, 'bo-', label=train_label)
    plt.plot(epochs, val_curve, 'r*-', label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Turn the softmax outputs into hard class predictions (train first, then test).
y_train_pred = np.argmax(model_ann.predict(X_train), axis=1)
y_test_pred = np.argmax(model_ann.predict(X_test), axis=1)

# Recover integer labels from the one-hot targets.
y_test_labels = np.argmax(y_test_k, axis=1)
y_train_labels = np.argmax(y_train_k, axis=1)

# Test accuracy, followed by the per-class report for test and then for train.
print(accuracy_score(y_test, y_test_pred))
for true_labels, predicted_labels in ((y_test_labels, y_test_pred), (y_train_labels, y_train_pred)):
    print("\n\n")
    print(classification_report(true_labels, predicted_labels))

## feature creation using Word2Vec model

In [None]:
#### first, let's train a word2vec model using our data.
from gensim.models import Word2Vec

## break sentences into lists of words.
sentances_to_word = [x.split() for x in utube_corpus]

# Train Word2Vec model: 100-dimensional vectors, context window of 5;
# min_count=1 keeps every word, including those seen only once.
word2vec_model = Word2Vec(sentances_to_word, vector_size=100, window=5, min_count=1, workers=4)

# Save model (Optional)
# NOTE(review): no visible cell uses this locally trained model afterwards — the
# sentence features are built from the pretrained Google News vectors. Confirm intent.
word2vec_model.save("word2vec.model")


In [None]:
### feature creation
import gensim.downloader as api
# NOTE(review): KeyedVectors is imported but not referenced in the visible cells.
from gensim.models import KeyedVectors

# Download and Load Pretrained Google News Word2Vec Model (300-Dimensional)
# (api.load fetches the vectors on first use, so this cell can take a while.)
word2vec_model_google = api.load("word2vec-google-news-300")

# Check Word Vector for "learning"
# print(word2vec_model['learning'][:10])  # Print first 10 dimensions


In [None]:
def get_sentence_vector(sentence, model):
    """Return the mean word vector of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to embed; it is tokenized on whitespace.
    model
        Word-vector lookup supporting ``word in model``, ``model[word]`` and a
        ``vector_size`` attribute (e.g. gensim ``KeyedVectors``). An object that
        exposes such a lookup as ``.wv`` (e.g. a trained gensim ``Word2Vec``
        model) is accepted as well.

    Returns
    -------
    numpy.ndarray
        Element-wise average of the vectors of all in-vocabulary words, or a
        zero vector of length ``vector_size`` when no word is in the vocabulary.
    """
    # A full Word2Vec model keeps its vectors under `.wv`; plain keyed vectors
    # are used as they are.
    vectors = getattr(model, "wv", model)
    word_vectors = [vectors[word] for word in sentence.split() if word in vectors]
    if not word_vectors:
        # Empty sentence or only out-of-vocabulary words.
        return np.zeros(vectors.vector_size)
    return np.mean(word_vectors, axis=0)


In [None]:
# Rebuild the feature matrix from averaged pretrained word vectors, one row per comment.
# NOTE: this overwrites the bag-of-words X / Y defined earlier in the notebook.
X = np.array([get_sentence_vector(sent, word2vec_model_google) for sent in utube_corpus])
# Integer-encode the sentiment labels through pandas category codes.
Y = utube_df['Sentiment'].astype('category').cat.codes
X.shape, Y.shape

In [None]:
#### train, test split
# Same 80/20 stratified split and seed as for the bag-of-words features.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=0)
# Sanity-check the resulting shapes.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Apply plain SMOTE (there is no ENN cleaning step here): oversample every class
# except the majority one, using the training split only so the test set stays untouched.
smote = SMOTE(random_state=42, sampling_strategy='not majority')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# NOTE(review): the visible model cells fit on X_train / y_train, not on
# X_resampled / y_resampled — confirm whether the resampled data should be used.
print("Class Distribution After SMOTE:", Counter(y_resampled))


#### 1 - model using logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Multinomial logistic regression fitted on the (non-resampled) training split.
model_logistic = LogisticRegression(max_iter=500, multi_class='multinomial', solver='lbfgs')
model_logistic.fit(X_train, y_train)

# Hard label predictions, then the matching class probabilities.
y_train_pred = model_logistic.predict(X_train)
y_test_pred = model_logistic.predict(X_test)
y_train_pred_prob = model_logistic.predict_proba(X_train)
y_test_pred_prob = model_logistic.predict_proba(X_test)

# Train / test accuracy first, then the per-class report for test followed by train.
print(accuracy_score(y_train, y_train_pred), accuracy_score(y_test, y_test_pred))
for true_labels, predicted_labels in ((y_test, y_test_pred), (y_train, y_train_pred)):
    print('\n')
    print(classification_report(true_labels, predicted_labels))

#### 2 - model using ANN

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop

In [None]:
### update data for keras
# One-hot encode the integer labels (3 classes) for the categorical_crossentropy loss.
y_train_k = keras.utils.to_categorical(y_train, num_classes=3)
# NOTE(review): the resampled targets are prepared here, but the visible fit call
# trains on X_train / y_train_k — confirm whether the SMOTE data should be used.
y_train_k_resampled = keras.utils.to_categorical(y_resampled, num_classes=3)
y_test_k = keras.utils.to_categorical(y_test, num_classes=3)

In [None]:
# Quick check of the one-hot training target shape.
y_train_k.shape

In [None]:
# Feed-forward classifier for the sentence-embedding features: 300 -> 50 hidden
# ReLU units, ending in a 3-way softmax. The input width is read from X_train
# (as the bag-of-words network does) instead of being hard-coded to 300, so the
# cell keeps working if the embedding dimensionality changes.
model_ann = Sequential()
model_ann.add(Dense(300, activation='relu', input_shape=(X_train.shape[1],)))
# model_ann.add(Dropout(0.1))
model_ann.add(Dense(50, activation='relu'))
# model_ann.add(Dropout(0.1))
model_ann.add(Dense(3, activation='softmax'))
model_ann.summary()

# RMSprop with default settings; categorical cross-entropy expects one-hot targets.
model_ann.compile(optimizer=RMSprop(),
            loss='categorical_crossentropy',
            metrics=['accuracy'])



In [None]:
### model train
# 10 epochs, batches of 32; the test split (one-hot targets) serves as validation data.
history = model_ann.fit(X_train, y_train_k, 
                    epochs=10, batch_size=32, 
                    validation_data=(X_test, y_test_k),
                    verbose=1)

In [None]:
# Per-epoch curves recorded by Keras during training.
train_acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
train_loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(1, len(history.history['accuracy']) + 1)

# Two side-by-side panels: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 5))
panels = (
    (train_acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'Accuracy', 'Training & Validation Accuracy'),
    (train_loss, val_loss, 'Training Loss', 'Validation Loss',
     'Loss', 'Training & Validation Loss'),
)
for position, (train_curve, val_curve, train_label, val_label, y_label, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epochs, train_curve, 'bo-', label=train_label)
    plt.plot(epochs, val_curve, 'r*-', label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()

plt.show()

In [None]:
# Turn the softmax outputs into hard class predictions (train first, then test).
y_train_pred = np.argmax(model_ann.predict(X_train), axis=1)
y_test_pred = np.argmax(model_ann.predict(X_test), axis=1)

# Recover integer labels from the one-hot targets.
y_test_labels = np.argmax(y_test_k, axis=1)
y_train_labels = np.argmax(y_train_k, axis=1)

# Test accuracy, followed by the per-class report for test and then for train.
print(accuracy_score(y_test, y_test_pred))
for true_labels, predicted_labels in ((y_test_labels, y_test_pred), (y_train_labels, y_train_pred)):
    print("\n\n")
    print(classification_report(true_labels, predicted_labels))