# Machine Learning

## Bag of Words Extraction

In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# --- Configuration ----------------------------------------------------------
# Everything a reader may want to tune for this cell lives here instead of
# being buried inside the calls below.
# Path of the cleaned Twitter dataset (an absolute Google Drive style path;
# change it here when the file lives somewhere else).
DATA_PATH = '/content/drive/MyDrive/MSC Data science/Thesis/Final data/clean_twitter.csv'
TEST_SIZE = 0.2  # fraction of rows held out for testing (80-20 split)
SEED = 42        # fixed seed so the split is reproducible across runs

# Load the dataset and expose the "joined_text" column under the name "text".
# `rename` returns a new frame, so chaining it onto the read avoids mutating
# a DataFrame in place (`inplace=True`).
twitter_df = pd.read_csv(DATA_PATH).rename(columns={"joined_text": "text"})

# Features: the raw text of every row.
X = twitter_df['text']
# Targets: integer codes for the 'label' column. `class_names` holds the
# original label values, positioned by the code they were assigned.
y, class_names = pd.factorize(twitter_df['label'])

# Hold out TEST_SIZE of the rows for evaluation.
# NOTE(review): the split is not stratified; the reports further down show an
# imbalanced label distribution, so `stratify=y` may be worth considering.
# It is not applied here because it would change every recorded result.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Sanity check: shapes of the four splits.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)



(31234,)
(7809,)
(31234,)
(7809,)


In [2]:
# Bag-of-words features: the vocabulary (capped at 5000 entries) is learned
# from the training texts only and then reused to encode the test texts.
count_vectorizer = CountVectorizer(max_features=5000)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# Oversample the training matrix with SMOTE to handle class imbalance.
# Only the training data is resampled; the test matrix is left as it is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_counts,
    y_train,
)

# Support Vector Machine (SVM) with a linear kernel, trained on the resampled
# data. `fit` returns the estimator itself, so construction and training are
# chained into one expression.
svm_model = SVC(C=1, kernel='linear').fit(X_train_resampled, y_train_resampled)
svm_predictions = svm_model.predict(X_test_counts)

# Heading and per-class metrics, each on its own line.
print(
    "Support Vector Machine (SVM) Classification Report:",
    classification_report(y_test, svm_predictions),
    sep="\n",
)

Support Vector Machine (SVM) Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      6812
           1       0.48      0.51      0.49       997

    accuracy                           0.87      7809
   macro avg       0.70      0.71      0.71      7809
weighted avg       0.87      0.87      0.87      7809



In [None]:
# Logistic Regression trained on the resampled training matrix.
# max_iter is set to 1000 explicitly. `fit` returns the estimator itself, so
# construction and training are chained.
logreg_model = LogisticRegression(C=1, max_iter=1000).fit(
    X_train_resampled,
    y_train_resampled,
)
# Predict on the bag-of-words encoding of the held-out texts.
logreg_predictions = logreg_model.predict(X_test_counts)

# Heading and per-class metrics, each on its own line.
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, logreg_predictions),
    sep="\n",
)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      6812
           1       0.49      0.58      0.53       997

    accuracy                           0.87      7809
   macro avg       0.71      0.75      0.73      7809
weighted avg       0.88      0.87      0.87      7809



In [None]:
# Multinomial Naive Bayes (smoothing alpha=1.0) trained on the resampled
# training matrix. `fit` returns the estimator itself, so construction and
# training are chained.
naive_bayes_model = MultinomialNB(alpha=1.0).fit(
    X_train_resampled,
    y_train_resampled,
)
# Predict on the bag-of-words encoding of the held-out texts.
naive_bayes_predictions = naive_bayes_model.predict(X_test_counts)

# Heading and per-class metrics, each on its own line.
print(
    "Naive Bayes Classification Report:",
    classification_report(y_test, naive_bayes_predictions),
    sep="\n",
)

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      6812
           1       0.43      0.62      0.51       997

    accuracy                           0.85      7809
   macro avg       0.69      0.75      0.71      7809
weighted avg       0.88      0.85      0.86      7809



## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: vocabulary (capped at 5000 entries) and IDF weights are
# learned from the training texts only; the test texts are encoded with the
# already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [None]:
# Oversample the TF-IDF training matrix with SMOTE to handle class imbalance.
# NOTE: this rebinds `smote`, `X_train_resampled` and `y_train_resampled`,
# which were first assigned from the bag-of-words features. Cells that read
# these names train on whichever resampling cell ran last.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X=X_train_tfidf,
    y=y_train,
)

In [None]:
# Support Vector Machine (SVM) with a linear kernel, trained on the resampled
# training matrix. `fit` returns the estimator itself, so construction and
# training are chained.
svm_model = SVC(C=1, kernel='linear').fit(X_train_resampled, y_train_resampled)
# Predict on the TF-IDF encoding of the held-out texts.
svm_predictions = svm_model.predict(X_test_tfidf)

# Heading and per-class metrics, each on its own line.
print(
    "Support Vector Machine (SVM) Classification Report:",
    classification_report(y_test, svm_predictions),
    sep="\n",
)

Support Vector Machine (SVM) Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.85      0.90      6812
           1       0.42      0.77      0.55       997

    accuracy                           0.84      7809
   macro avg       0.69      0.81      0.72      7809
weighted avg       0.89      0.84      0.86      7809



In [None]:
# Logistic Regression trained on the resampled training matrix.
# max_iter is set to 1000 explicitly. `fit` returns the estimator itself, so
# construction and training are chained.
logreg_model = LogisticRegression(C=1, max_iter=1000).fit(
    X_train_resampled,
    y_train_resampled,
)
# Predict on the TF-IDF encoding of the held-out texts.
logreg_predictions = logreg_model.predict(X_test_tfidf)

# Heading and per-class metrics, each on its own line.
print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, logreg_predictions),
    sep="\n",
)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.84      0.90      6812
           1       0.42      0.79      0.55       997

    accuracy                           0.84      7809
   macro avg       0.69      0.82      0.72      7809
weighted avg       0.90      0.84      0.85      7809



In [None]:
# Multinomial Naive Bayes (smoothing alpha=1.0) trained on the resampled
# training matrix. `fit` returns the estimator itself, so construction and
# training are chained.
naive_bayes_model = MultinomialNB(alpha=1.0).fit(
    X_train_resampled,
    y_train_resampled,
)
# Predict on the TF-IDF encoding of the held-out texts.
naive_bayes_predictions = naive_bayes_model.predict(X_test_tfidf)

# Heading and per-class metrics, each on its own line.
print(
    "Naive Bayes Classification Report:",
    classification_report(y_test, naive_bayes_predictions),
    sep="\n",
)

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      6812
           1       0.40      0.64      0.49       997

    accuracy                           0.83      7809
   macro avg       0.67      0.75      0.70      7809
weighted avg       0.87      0.83      0.85      7809

