In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
projectFolderPath = "/content/drive/MyDrive/Data Mining/"

trainDataPath = projectFolderPath + "preprocessed_train.csv"
testDataPath = projectFolderPath + "preprocessed_test.csv"

trainDF = pd.read_csv(trainDataPath, on_bad_lines = 'skip')
trainDF.head()

Unnamed: 0,text,label,preprocessed_text
0,Do you want to hear a story? So I was at work ...,0,want hear stori work caus scientist friend ar...
1,The author does a really good job with explain...,0,author realli good job explain great detail tz...
2,I think that you guys should all become a Seag...,0,think guy becom seago cowboy adventur world ri...
3,Why should the people Of the United States vot...,0,peopl unit state vote elector vote jolt one co...
4,"Senator, I believe that voting for the Preside...",0,senat believ vote presid unit state base popu...


In [5]:
testDF = pd.read_csv(testDataPath, on_bad_lines = 'skip')
testDF.head()

Unnamed: 0,text,label,preprocessed_text
0,The Face on Mars is nothing but a natural occu...,0,face mar noth natur occurr mar thing call ba...
1,Students have a higher chance of catching a vi...,0,student higher chanc catch viru school hour v...
2,Driverless cars have good and bad things that ...,0,driverless car good bad thing would go along ...
3,Some people might think that traveling in a gr...,1,peopl might think travel group led tour guid g...
4,How many of us students want to be forced to d...,0,mani us student want forc someth probabl mani...


In [6]:
def generateCVEmbeddings(corpus, max_features):
  vectorizer = CountVectorizer()
  model = vectorizer.fit_transform(corpus)

  inverse_model = vectorizer.inverse_transform(model)

  inverse_model_count = list(map(lambda doc_vec: len(doc_vec), inverse_model))

  inverse_model_count_tuples = list(zip(range(len(inverse_model_count)),
                                        inverse_model_count))

  top_documents_tuples = sorted(inverse_model_count_tuples,
                                key=lambda item: item[1],
                                reverse=True)[:max_features]

  top_documents, _ = zip(*top_documents_tuples)

  return model[top_documents]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features = 2500, vocabulary = None)

preprocessed_text_list = list(trainDF['preprocessed_text'])
labelList = list(trainDF['label'])
npNanRows = list()
for i in range(len(preprocessed_text_list)):
  if preprocessed_text_list[i] is np.nan:
    npNanRows.append(i)

newPreProcessedText = list()
newLabels = list()

for i in range(len(preprocessed_text_list)):
  if not i in npNanRows:
    newPreProcessedText.append(preprocessed_text_list[i])
    newLabels.append(labelList[i])

trainDF = pd.DataFrame()
trainDF['preprocessed_text'] = newPreProcessedText
trainDF['label'] = newLabels

print(trainDF.head())

X_train_bow = count_vectorizer.fit_transform(trainDF['preprocessed_text'])
X_test_bow = count_vectorizer.fit_transform(testDF['preprocessed_text'])

X_train_bow.shape

                                   preprocessed_text  label
0  want hear stori  work caus scientist friend ar...      0
1  author realli good job explain great detail tz...      0
2  think guy becom seago cowboy adventur world ri...      0
3  peopl unit state vote elector vote jolt one co...      0
4  senat  believ vote presid unit state base popu...      0


(249645, 2500)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_features = 2500, vocabulary = None)

X_train_cv = count_vectorizer.fit_transform(trainDF['preprocessed_text'])
X_test_cv = count_vectorizer.transform(testDF['preprocessed_text'])

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features = 2500, vocabulary = None)

X_train_tfidf = tfidf_vectorizer.fit_transform(trainDF['preprocessed_text'])
X_test_tfidf = tfidf_vectorizer.transform(testDF['preprocessed_text'])

In [10]:
y_train = trainDF['label']
y_test = testDF["label"]

In [11]:
x_cv_train, x_cv_val, y_cv_train, y_cv_val = train_test_split(X_train_cv, y_train, test_size=0.2, random_state=42)

In [12]:
x_tfidf_train, x_tfidf_val, y_tfidf_train, y_tfidf_val = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

In [13]:
# XGBoost
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

xgb_model_cv = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model_tf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [14]:
xgb_model_cv.fit(x_cv_train, y_cv_train)

y_pred_count = xgb_model_cv.predict(X_test_cv)

accuracy_count = accuracy_score(y_test, y_pred_count)
print(f"Accuracy with CountVectorizer: {accuracy_count}")
print(classification_report(y_test, y_pred_count))


Accuracy with CountVectorizer: 0.9895134373520275
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     55845
           1       0.99      0.98      0.99     30742

    accuracy                           0.99     86587
   macro avg       0.99      0.99      0.99     86587
weighted avg       0.99      0.99      0.99     86587



In [15]:
xgb_model_cv.fit(x_tfidf_train, y_tfidf_train)

y_pred_count = xgb_model_cv.predict(X_test_cv)

accuracy_count = accuracy_score(y_test, y_pred_count)
print(f"Accuracy with TF-IDF: {accuracy_count}")
print(classification_report(y_test, y_pred_count))


Accuracy with TF-IDF: 0.8327347061337153
              precision    recall  f1-score   support

           0       0.99      0.75      0.85     55845
           1       0.68      0.99      0.81     30742

    accuracy                           0.83     86587
   macro avg       0.84      0.87      0.83     86587
weighted avg       0.88      0.83      0.84     86587

