In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re
import demoji
from sklearn.model_selection import train_test_split, GridSearchCV
pd.set_option('display.max_colwidth', 50)

%matplotlib inline

In [2]:
# Load the tagged comments, derive the binary target and expose the old index as a column.
df = (
    pd.read_csv('./comments_and_tags.csv', sep=',', index_col='Unnamed: 0')
    # make the 3 class problem into a binary problem: label 1 stays 1, anything else becomes 0
    .assign(sentiment_binary=lambda frame: frame['sentiment'].apply(lambda label: 1 if label == 1 else 0))
    .reset_index()
)
df.head()

Unnamed: 0,index,id,sentiment,search_term,tagger,author,body,created_utc,link_id,permalink,score,subreddit,post_name,filename,sentiment_binary
0,0,glngetr,0,ladvc3,SUPRATIK,nevabraun,Thanks but you’ve lost me at \n\n„If look at A...,1612214426,t3_ladvc3,/r/wallstreetbets/comments/ladvc3/just_bought_...,21,wallstreetbets,"Just bought 860 of AMC, greetings from Germany...",comments_batch_6.csv,0
1,1,glndurd,1,ladvc3,SUPRATIK,Awake_4E,Awesome 😎! Why the moon 🤔 Let’s shoot AMC out ...,1612213448,t3_ladvc3,/r/wallstreetbets/comments/ladvc3/just_bought_...,25,wallstreetbets,"Just bought 860 of AMC, greetings from Germany...",comments_batch_6.csv,1
2,2,glndfos,1,ladvc3,SUPRATIK,Cloud9forreal,If you look at AMCs business page you’ll find ...,1612213289,t3_ladvc3,/r/wallstreetbets/comments/ladvc3/just_bought_...,58,wallstreetbets,"Just bought 860 of AMC, greetings from Germany...",comments_batch_6.csv,1
3,3,glnd58d,1,ladvc3,SUPRATIK,MacCoy69,"I bought 20 today, i am also a retarded dumb m...",1612213186,t3_ladvc3,/r/wallstreetbets/comments/ladvc3/just_bought_...,13,wallstreetbets,"Just bought 860 of AMC, greetings from Germany...",comments_batch_6.csv,1
4,4,glnd2xt,0,ladvc3,SUPRATIK,Menuler,"Ahh, my fellow Retard. I see the more and more...",1612213163,t3_ladvc3,/r/wallstreetbets/comments/ladvc3/just_bought_...,11,wallstreetbets,"Just bought 860 of AMC, greetings from Germany...",comments_batch_6.csv,0


In [3]:
def find_pos(word):
    """Return the WordNet part-of-speech code for a single word.

    The word is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tag of the first token is used. Because the
    word is tagged on its own, the tagger sees no sentence context.

    Parameters
    ----------
    word : str
        The word to classify.

    Returns
    -------
    str
        ``'a'`` for adjective tags (JJ, JJR, JJS), ``'r'`` for tags starting
        with R (RB, RBR, RBS, ...), ``'v'`` for verb tags (VB, VBD, VBG, VBN,
        VBP, VBZ) and ``'n'`` for everything else, including input that
        yields no tokens at all.
    """
    # WordNet part-of-speech constants:
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    tokens = nltk.word_tokenize(word)
    if not tokens:
        # Empty / whitespace-only input used to raise IndexError on [0][1];
        # fall back to the same default as any unrecognized tag.
        return 'n'

    tag = nltk.pos_tag(tokens)[0][1]
    # Only the first letter of the Penn Treebank tag decides the WordNet code.
    first_letter_to_wordnet = {'j': 'a', 'r': 'r', 'v': 'v'}
    return first_letter_to_wordnet.get(tag[:1].lower(), 'n')

# Ask demoji to download its emoji codes before remove_emoji() is first used.
# NOTE(review): the recorded cell output echoes this call, which looks like the
# tail of a deprecation warning - presumably newer demoji releases ship the codes
# with the package; confirm whether this call is still needed.
demoji.download_codes()

def remove_emoji(text):
    """Return ``text`` with every emoji recognized by demoji removed.

    Parameters
    ----------
    text : str
        Input text, possibly containing emojis.

    Returns
    -------
    str
        The text with each emoji replaced by an empty string; all other
        characters are left untouched.
    """
    # demoji.replace substitutes every match in one regex pass. The earlier
    # implementation looped over the keys of demoji.findall() and called
    # str.replace for each one, so when one found emoji was a prefix of another
    # (e.g. a base emoji and its skin-tone variant) the outcome depended on the
    # order in which the keys were visited and could leave an orphaned modifier.
    return demoji.replace(text, '')

# Function to tokenize a text and lemmatize each of its words
def words_lemmatizer(text, encoding="utf8"):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    The text is split with ``nltk.word_tokenize``; each token is handed to
    ``find_pos`` to choose the part of speech passed to
    ``WordNetLemmatizer.lemmatize``. ``encoding`` is accepted but not used.
    """
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token, find_pos(token))
        for token in tokens
    )

def remove_stopwords(text, lang='english'):
    """Drop stopwords from ``text`` and return the remaining tokens space-joined.

    A token is removed when its lowercase form is either in NLTK's stopword
    list for ``lang`` or in the project-specific list below. Matching is
    case-insensitive for both lists.

    Parameters
    ----------
    text : str
        Text to filter; it is split with ``nltk.word_tokenize``.
    lang : str, optional
        Name of the NLTK stopword list to use (default ``'english'``).

    Returns
    -------
    str
        The surviving tokens, in their original order and casing, joined by
        single spaces.
    """
    # Project-specific filler words (first 200 in dictionary_all).
    # Insert all additional stopwords you want to remove here.
    extra_stopwords = (
        'already', 'also', 'comment', 'delete', 'even', 'literally', 'lolol',
        'lololol', 'lolz', 'lols', 'lot', 'loll', 'lolololol', 'lolll', 'mean',
        'na', 'point', 'post', 'probably', 'put', 'reddit', 'remove', 'see',
        'something', 'want', 'well',
    )

    # One set, loaded once per call: membership tests are O(1) instead of a
    # list scan per token, and the corpus file is no longer read twice. The
    # base list follows ``lang`` (it used to be hard-coded to 'english' for the
    # custom list regardless of the argument).
    blocked = set(stopwords.words(lang))
    blocked.update(extra_stopwords)

    kept = [token for token in nltk.word_tokenize(text) if token.lower() not in blocked]
    return " ".join(kept)

def do_prepocessing(one_row):
    """Clean one raw comment and return its stopword-free, lemmatized text.

    Steps, in order:

    1. lowercase the text;
    2. drop URLs (anything starting with ``http`` up to the next whitespace);
    3. drop emojis via ``remove_emoji``;
    4. replace every character that is not an ASCII letter, digit,
       underscore, ``#``, ``@``, ``&`` or whitespace with a space;
    5. drop ``&word`` remnants (e.g. what is left of ``&amp;``);
    6. drop ``#hashtags`` and ``@mentions``;
    7. drop any lone ``#``, ``@`` or ``&`` that steps 5-6 did not consume;
    8. collapse runs of whitespace;
    9. remove stopwords, then lemmatize.

    Parameters
    ----------
    one_row : str
        The raw comment text.

    Returns
    -------
    str
        The cleaned text, tokens separated by single spaces.
    """
    text = one_row.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URL
    text = remove_emoji(text)  # Remove emojis

    # '#', '@' and '&' survive this filter on purpose so the tag / mention /
    # entity patterns below can still recognize what they belong to.
    text = re.sub(r'[^a-zA-Z0-9_#@&\s]', ' ', text)  # Remove punctuation and non-ASCII characters
    text = re.sub(r'&[\w]+', ' ', text)  # Remove &amp, &words etc.
    text = re.sub(r'#[\w]+', '', text)  # Remove #hashtags
    text = re.sub(r'@[\w]+', '', text)  # Remove @users

    # Symbols not followed by a word character were never matched above and
    # used to leak into the output as stand-alone tokens.
    text = re.sub(r'[#@&]', ' ', text)

    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace

    return words_lemmatizer(remove_stopwords(text))

  demoji.download_codes()


In [4]:
# Preprocessing

# A missing body (NaN) would be turned into the literal text "nan" by astype(str)
# and then be treated as a real word downstream, so blank it out first.
# Note: this overwrites df['body'] with the cleaned text; reload df to get the raw text back.
df['body'] = df['body'].fillna('').astype(str).apply(do_prepocessing)

In [5]:
from nltk.corpus import stopwords

# Number of TF-IDF columns kept as model features.
top_n = 7000

# create new df with only body and sentiment
dfNew = df[['body', 'sentiment_binary']]

# create sparse matrix with 1-gram and 2-gram
stop = set(stopwords.words('english'))
corpus = dfNew.loc[:, 'body']
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words=list(stop))
vectorized = tfidf.fit_transform(corpus)

# Vocabulary in column order. get_feature_names() was removed in scikit-learn 1.2;
# prefer its replacement when available and fall back for older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = list(tfidf.get_feature_names_out())
else:
    vocab = tfidf.get_feature_names()

# convert to dense matrix
# NOTE(review): this materializes every 1-/2-gram column before top_n of them are
# selected; slicing `vectorized` by column first would need far less memory.
df_vectorized = pd.DataFrame(vectorized.toarray(), columns=vocab)

# top k features
# NOTE(review): sorting idf_ in descending order keeps the terms with the HIGHEST
# inverse document frequency, i.e. the rarest ones. Terms that occur in a single
# comment carry no signal across a train/test split - confirm this is the intended
# ranking (e.g. versus TfidfVectorizer(max_features=top_n)).
indices = np.argsort(tfidf.idf_)[::-1]
features = vocab
top_features = [features[i] for i in indices[:top_n]]

# df with only top features
df_top_vectorized = df_vectorized[top_features]

# combine with dfNew (rows are aligned on the index)
df_combined = pd.concat([dfNew, df_top_vectorized], axis=1)
df_combined

Unnamed: 0,body,sentiment_binary,zoppity,fomo day,fomo bought,fomo begin,fomo bag,fomo across,fomo absolute,follower subreddit,...,lie go,lie floor,lie fake,lie call,lick paint,lick butthole,lfg obligatory,liability legal,li nio,li
0,thanks lose look amcs business page,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,awesome moon let shoot amc solar system,1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,look amcs business page find actually business...,1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bought 20 today retard dumb money ape germany ...,1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ahh fellow retard people get pessimistic gme a...,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5484,hold 200 bb share hit moon,1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5485,b ippity b oppity give zoppity,0,0.396874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5486,lol really force bb thing eh,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5487,,1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Separate the target column from the feature columns of the combined frame.
y = df_combined.loc[:, 'sentiment_binary']
X = df_combined.drop(columns=['body', 'sentiment_binary'])

In [7]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [8]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

# Fit LazyClassifier on the train/test split and print the table it returns.
# NOTE(review): the recorded output lists the visible models at 0.50 balanced
# accuracy / ROC AUC, i.e. no better than DummyClassifier - the features deserve a second look.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
# this is the score for top 7,000 terms

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [12:27<00:00, 25.77s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
DummyClassifier                    0.50               0.50     0.50      0.50   
AdaBoostClassifier                 0.53               0.50     0.50      0.37   
LogisticRegression                 0.53               0.50     0.50      0.37   
XGBClassifier                      0.53               0.50     0.50      0.37   
SVC                                0.53               0.50     0.50      0.37   
SGDClassifier                      0.53               0.50     0.50      0.37   
RidgeClassifierCV                  0.53               0.50     0.50      0.37   
RidgeClassifier                    0.53               0.50     0.50      0.37   
RandomForestClassifier             0.53               0.50     0.50      0.37   
QuadraticDiscriminantAnalysis      0.47               0.50     0.50      0.30   
Perceptron                  


