In [14]:
# Work from the evaluation folder so the relative paths used below
# (e.g. "dataset.csv") resolve. The explicit %cd magic is used instead of a
# bare `cd`, which only works through automagic in single-line cells.
# NOTE(review): hard-coded Colab Drive path - presumably Drive must be mounted first; confirm.
%cd /content/drive/MyDrive/nlp-project/feb-eval

/content/drive/MyDrive/nlp-project/feb-eval


In [25]:
# List the working directory to confirm the dataset and saved splits are present.
!ls

dataset.csv  redditMH_X_test   redditMH_y_test	 stats.gsheet
eval.ipynb   redditMH_X_train  redditMH_y_train


In [2]:
# Install the preprocessing helper into the kernel's own environment (%pip,
# not !pip) and pin the release this notebook was run with (0.1.1 according
# to the install log) so a fresh runtime resolves the same version.
%pip install text-preprocessing==0.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting text-preprocessing
  Downloading text_preprocessing-0.1.1-py2.py3-none-any.whl (9.6 kB)
Collecting names-dataset==2.1
  Downloading names_dataset-2.1.0-py3-none-any.whl (62.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unittest-xml-reporting
  Downloading unittest_xml_reporting-3.2.0-py2.py3-none-any.whl (20 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[2K     [90m━━━

In [3]:
import argparse
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from tqdm import tqdm
from sklearn import metrics
from nltk.tokenize import word_tokenize

from text_preprocessing import preprocess_text
from text_preprocessing import to_lower, remove_email, remove_url, remove_punctuation, lemmatize_word, remove_stopword, remove_whitespace


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
def preProcess(sent):
  """Normalise one sentence with the text_preprocessing pipeline.

  The steps are applied in the order listed: to_lower, remove_email,
  remove_url, remove_punctuation, lemmatize_word, remove_stopword.

  Args:
    sent: Raw input text.

  Returns:
    Whatever preprocess_text returns for the given text and step list.
  """
  pipeline = [
    to_lower,
    remove_email,
    remove_url,
    remove_punctuation,
    lemmatize_word,
    remove_stopword,
  ]
  return preprocess_text(sent, pipeline)

In [5]:
# Smoke check: the e-mail address and the stop words should be stripped,
# leaving only the content words.
preProcess("google@gmail.com is my website. my name is ash")

'website name ash'

In [6]:
# Global switch read by dataGen / tranformTest: when True every sentence is
# passed through preProcess before vectorisation; when False raw text is used.
preprocessReq = False

In [7]:
# Vectoriser selection passed to dataGen; it recognises 'tfidf', 'count' and 'hash'.
embedType = "hash"

In [8]:
def dataGen(embedding_type, trainData):
  """Fit a text vectoriser on the training sentences and embed them.

  Reads the module-level flag `preprocessReq`; when it is truthy each
  sentence is run through `preProcess` before vectorisation.

  Args:
    embedding_type: One of 'tfidf', 'count' or 'hash'.
    trainData: Iterable of training sentences.

  Returns:
    A tuple (embed, features): the fitted vectoriser and the matrix it
    produces for `trainData`.

  Raises:
    ValueError: If `embedding_type` is not one of the supported names.
  """
  if embedding_type == 'tfidf':
    print("initializing TF-IDF embeddings...")
    embed = TfidfVectorizer(max_features=1000, ngram_range=(1,3))
  elif embedding_type == 'count':
    print("initializing Count embeddings...")
    embed = CountVectorizer(max_features=1000, ngram_range=(1,3))
  elif embedding_type == 'hash':
    print("initializing Hashing embeddings...")
    embed = HashingVectorizer(n_features=1000)
  else:
    # An unknown name used to fall through and fail much later with an
    # UnboundLocalError on `embed`; fail early with a clear message instead.
    raise ValueError(
        f"Unsupported embedding_type {embedding_type!r}; "
        "expected 'tfidf', 'count' or 'hash'"
    )

  corpus = [preProcess(sent) if preprocessReq else sent for sent in trainData]

  # fit_transform yields the same matrix as fit() followed by transform(),
  # but analyses the training corpus only once.
  features = embed.fit_transform(corpus)
  print("embeddings formation completed ...")

  return embed, features

In [9]:
def tranformTest(embed, testData):
  """Embed held-out sentences with an already-fitted vectoriser.

  Honours the module-level `preprocessReq` flag in the same way as the
  training path: when it is truthy each sentence goes through `preProcess`
  first.

  Args:
    embed: Object exposing `transform(list_of_sentences)`.
    testData: Iterable of sentences to embed.

  Returns:
    The result of `embed.transform` on the (optionally preprocessed) sentences.
  """
  prepared = [preProcess(text) if preprocessReq else text for text in testData]
  return embed.transform(prepared)

In [10]:
# Key used to pick a classifier out of modelDict.
# NOTE(review): this value is reassigned to "svc" before the lookup further down.
modelType = "lr"

In [16]:
# Load the labelled posts and shuffle the rows with a fixed seed so the order
# is the same on every run.
data = pd.read_csv("dataset.csv").sample(frac=1, random_state=0)

# Pull the text and the labels out as plain Python lists, in shuffled order.
X, Y = (data[column].to_list() for column in ("post", "subreddit"))

In [17]:
# (rows, columns) of the shuffled dataset.
data.shape

(182575, 2)

In [18]:
# Class balance check: with 0/1 labels, sum(Y)/len(Y) is the share of positives.
print(f"Proportion of the positive class is {100*sum(Y)/len(Y)}")

Proportion of the positive class is 24.829248254142133


In [19]:
# 70/30 split, stratified on the labels so both parts keep the same class
# ratio; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.3, random_state=42)

In [26]:
import pickle

# Persist the four split parts so other notebooks can reuse exactly this
# split. One table-driven loop replaces four copy-pasted `with` blocks.
split_files = {
    "redditMH_X_train.pkl": X_train,
    "redditMH_X_test.pkl": X_test,
    "redditMH_y_train.pkl": y_train,
    "redditMH_y_test.pkl": y_test,
}
for path, payload in split_files.items():
    with open(path, "wb") as handle:
        pickle.dump(payload, handle)

In [20]:
# Sanity check: the training part as a percentage of all samples (expected ~70).
print("TRAIN set size is ", round(100*len(y_train)/(len(y_train) + len(y_test)), 4))

TRAIN set size is  69.9997


In [21]:
# Verify the stratification: both splits should show (almost) the same
# percentage of positive labels.
print(f"Proportion of the positive class in TRAIN set is {100*sum(y_train)/len(y_train)}")
print(f"Proportion of the positive class in TEST set is {100*sum(y_test)/len(y_test)}")

Proportion of the positive class in TRAIN set is 24.829032409508457
Proportion of the positive class in TEST set is 24.829751885052854


In [None]:
# datasetLen = len(X)
# trainLen = int(0.7*datasetLen)
# print("Dataset Loaded")

# X_train, X_test, y_train, y_test = X[:trainLen], X[trainLen:], Y[:trainLen], Y[trainLen:]

In [None]:
# Fit the vectoriser on the training sentences only, then embed both splits.
# NOTE(review): X_train / X_test are rebound here from raw sentences to feature
# matrices, so this cell is not safely re-runnable - re-run the split cell first.
embed, X_train = dataGen(embedType, X_train)

# Reuse the fitted vectoriser for the test split; it is never fitted on test data.
X_test = tranformTest(embed, X_test)

initializing Hashing embeddings...
embeddings formation completed ...


In [None]:
# Short key -> scikit-learn classifier with default hyper-parameters.
# Estimators whose fit involves randomness get a fixed random_state so that
# the reported metrics can be reproduced on a fresh kernel.
# NOTE(review): MultinomialNB needs non-negative features, while
# HashingVectorizer's default signed hashing can emit negative values -
# confirm before using "nb" together with embedType == "hash".
modelDict = {
    "lr" : LogisticRegression(),
    "svc" : SVC(),
    "dt" : DecisionTreeClassifier(random_state=42),
    "rf" : RandomForestClassifier(random_state=42),
    "ada" : AdaBoostClassifier(random_state=42),
    "grad" : GradientBoostingClassifier(random_state=42),
    "xtra" : ExtraTreesClassifier(random_state=42),
    "nb" : MultinomialNB(),
    "sgd" : SGDClassifier(random_state=42),
    "mlp" : MLPClassifier(random_state=42),
}

In [None]:
# Select the support-vector classifier for the single-model run below
# (replaces the "lr" value assigned earlier in the notebook).
modelType = "svc"

In [None]:
# Look up the estimator instance for the chosen key. This is the same object
# that is stored in modelDict, not a copy.
model = modelDict[modelType]

In [None]:
# Train on the vectorised training split, then predict labels for the test split.
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [None]:
# (samples, features) of the vectorised train and test matrices.
X_train.shape, X_test.shape

((127802, 1000), (54773, 1000))

In [None]:
# With 0/1 labels this is the number of test samples predicted as positive.
sum(preds)

12882

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 plus the averaged rows for the test split.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96     41173
           1       0.89      0.90      0.89     13600

    accuracy                           0.95     54773
   macro avg       0.93      0.93      0.93     54773
weighted avg       0.95      0.95      0.95     54773



In [None]:
# Same report as a nested dict; per-class entries are keyed by the label as a string.
oup = classification_report(y_test, preds, output_dict = True)

# Positive-class metrics in percent with three decimals. Scale first and round
# last: rounding before the multiplication can reintroduce binary
# floating-point noise (e.g. 100*0.57 == 56.99999999999999).
precision = round(100*oup['1']['precision'], 3)
recall = round(100*oup['1']['recall'], 3)
f1_score = round(100*oup['1']['f1-score'], 3)

print(precision, recall, f1_score)

88.557 89.912 89.229


In [None]:
from sklearn.metrics import classification_report
for k, m in modelDict.items():
  # Only the keys listed here are evaluated in this run; extend the list to
  # sweep more of the models in modelDict.
  if k not in ["mlp"]:
    continue

  print(k)
  model = m
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  oup = classification_report(y_test, preds, output_dict = True)

  # Positive-class metrics in percent with three decimals. Scale first and
  # round last so the multiplication cannot reintroduce floating-point noise.
  precision = round(100*oup['1']['precision'], 3)
  recall = round(100*oup['1']['recall'], 3)
  f1_score = round(100*oup['1']['f1-score'], 3)

  print(k, precision, recall, f1_score)

mlp
mlp 86.28 87.346 86.809
